#### Classify nationality from names

In [1]:
# Standard library, then the numeric stack used throughout the notebook.
import os, re, time
import pandas as pd
import numpy as np
# Seed NumPy's global RNG when this cell runs (data_prep seeds it again with the same value).
np.random.seed(99)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import KFold
# NOTE(review): kf is not referenced by any other cell visible in this notebook — confirm before removing.
kf = KFold(n_splits=5)

from keras.models import Model
from keras.layers import Dense, Input, Activation, Dropout, LSTM, BatchNormalization, GRU, Bidirectional
from keras import regularizers
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.models import model_from_json

Using TensorFlow backend.


In [2]:
def data_prep(data, outcomes, validation_split=0.2, seed=99):
    """Shuffle samples and labels together and split them into train/validation.

    Parameters
    ----------
    data : np.ndarray
        2-D array with one sample per row (it is indexed as ``data[ix, :]``).
    outcomes : np.ndarray
        1-D array of labels aligned with the rows of ``data``.
    validation_split : float, optional
        Fraction of the samples (rounded down) placed in the validation set.
        Must lie in ``[0, 1]``. Defaults to 0.2.
    seed : int, optional
        Value passed to ``np.random.seed`` before shuffling. Defaults to 99.

    Returns
    -------
    x_train, x_val, y_train, y_val : np.ndarray
        The shuffled data split so that the validation set is the tail of the
        shuffled order.

    Raises
    ------
    ValueError
        If ``validation_split`` is outside ``[0, 1]``.

    Notes
    -----
    This re-seeds NumPy's *global* RNG, so the shuffle is reproducible but any
    random state the caller had set up beforehand is replaced.
    """
    if not 0.0 <= validation_split <= 1.0:
        raise ValueError('validation_split must be between 0 and 1, got {}'.format(validation_split))

    np.random.seed(seed)
    shuffled_ix = np.random.permutation(len(data))
    data = data[shuffled_ix,:]
    outcomes = outcomes[shuffled_ix]

    ## split data to train/validation:
    nb_validation_samples = int(validation_split * len(data))
    # Use an explicit split index instead of negative slicing: with zero
    # validation samples, data[:-0] would be empty and data[-0:] would be the
    # whole array, i.e. the two sets would be swapped.
    split_at = len(data) - nb_validation_samples

    x_train = data[:split_at,:]
    y_train = outcomes[:split_at]
    x_val = data[split_at:,:]
    y_val = outcomes[split_at:]

    return x_train, x_val, y_train, y_val

def sentence_to_ix(names, seq_len=30):
    """Encode names as fixed-length rows of character indices.

    Relies on the module-level ``char_to_ix`` mapping and ``vocab_size``
    defined in later cells, so those cells must have been run first.

    Parameters
    ----------
    names : iterable
        Names to encode; each entry is passed through ``str()`` first.
    seq_len : int, optional
        Length every encoded row is padded or truncated to. Defaults to 30.

    Returns
    -------
    np.ndarray
        Integer array with one row per name and ``seq_len`` columns.
        Positions without a character hold the index ``vocab_size``.
    """
    # One past the last real character index; used both as the padding value
    # and as the fallback for characters missing from char_to_ix.
    unknown_ix = vocab_size

    #split up names into single characters
    split_names = [list(str(name)) for name in list(names)]

    #convert characters to indices; characters that were not in the training
    #vocabulary map to unknown_ix instead of raising KeyError:
    char_ixs = [[char_to_ix.get(character, unknown_ix) for character in name] for name in split_names]

    #shorten or pad sequences at seq_len (Keras defaults pad and truncate at
    #the start of the sequence):
    trms_ixs = sequence.pad_sequences(char_ixs, maxlen=seq_len, value=unknown_ix)

    return np.array(trms_ixs)

In [3]:
# Load the names table; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index — index_col=0 would drop it; confirm before changing.
nams = pd.read_csv('./create_data/names_dataset.csv')
# Cell output: (number of rows, number of columns).
nams.shape

(321385, 4)

In [4]:
# Preview the first rows: family name, given name, the binary `hun` label and the
# combined 'family,given' string in `full_name`.
nams.head()

Unnamed: 0,family,given,hun,full_name
0,petrovics,bernadett,1,"petrovics,bernadett"
1,balatoni,irma,1,"balatoni,irma"
2,rusznyak,valeria,1,"rusznyak,valeria"
3,paternina,tyanna,0,"paternina,tyanna"
4,stommes,evett,0,"stommes,evett"


In [5]:
#obtain character set:
# Collect every character of every full name, then derive the vocabulary from it.
allnames = []
for name in nams.full_name:
    allnames.extend(name)

unique_characters = set(allnames)
chars = list(unique_characters)
vocab_size = len(chars)

print('There are %d total characters and %d unique characters in your data.' % (len(allnames), len(chars)))

There are 4340681 total characters and 28 unique characters in your data.


In [6]:
# The longest full name fixes the sequence length fed to the model.
lens = np.array(list(map(len, nams.full_name)))
seq_len = lens.max()
print('max character length {} is set as sequence length'.format(seq_len))

max character length 27 is set as sequence length


In [7]:
# Number the characters in sorted order, then invert that mapping for lookups by character.
ix_to_char = dict(enumerate(sorted(chars)))
char_to_ix = {ch: i for i, ch in ix_to_char.items()}
print(ix_to_char)

{0: ' ', 1: ',', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z'}


In [16]:
# Model input: one row of seq_len integer character indices per name.
main_input = Input(shape=(seq_len,), dtype='int32', name='main_input')
# Embedding layer to encode the input sequence into 50-dimensional embedding vectors.
# input_dim is vocab_size+1 because sentence_to_ix pads with the extra index `vocab_size`.
x = Embedding(output_dim=50, input_dim=vocab_size+1, input_length=seq_len)(main_input)
# Single LSTM layer with 32 units; the summary below shows its output as (None, 32).
x = LSTM(32)(x)
# Dropout with rate 0.5 ahead of the output layer.
x = Dropout(0.5)(x)
# One sigmoid unit: a score in (0, 1), later trained against the binary `hun` column.
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=main_input, outputs=x)

In [17]:
# Print the layer-by-layer output shapes and parameter counts of the model built above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 27)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 27, 50)            1450      
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 12,107
Trainable params: 12,107
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Adam optimizer with a non-default learning rate and first-moment decay;
# binary cross-entropy matches the single sigmoid output.
adam_optim = Adam(beta_1=0.85, lr=0.002)
model.compile(
    loss='binary_crossentropy',
    optimizer=adam_optim,
    metrics=['accuracy'],
)

In [13]:
# Labels come from the `hun` column; features are the index-encoded full names.
outcomes = nams['hun'].values
data = sentence_to_ix(nams['full_name'], seq_len=seq_len)

# Shuffle and split into training and validation sets.
x_train, x_val, y_train, y_val = data_prep(data, outcomes)

In [36]:
# Wall-clock timing wraps the fit() call only.
tstart = time.time()
# 3 epochs with mini-batches of 512; x_val/y_val are passed as validation data for fit().
model.fit(x_train, y_train, epochs = 3, batch_size = 512, shuffle=True, validation_data=(x_val, y_val), verbose=1)
tend = time.time()
print('elapsed training time: {} sec'.format(tend-tstart))

# ROC AUC on the training split.
y_pred = model.predict(x_train)
print('train roc: {}'.format(roc_auc_score(y_train, y_pred)))

# y_pred is rebound to the validation-split predictions here; the evaluation cell
# further down thresholds this value, so it must run after this cell.
# NOTE(review): the printed label says "test", but the score is computed on (x_val, y_val).
y_pred = model.predict(x_val)
print('test roc: {}'.format(roc_auc_score(y_val, y_pred)))

Train on 257108 samples, validate on 64277 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
elapsed training time: 48.16281175613403 sec
train roc: 0.9977544221428531
test roc: 0.9976069324322784


#### Evaluate performance

In [45]:
# Class 0 is reported as 'American', class 1 as 'Hungarian'.
target_names = ['American', 'Hungarian']

# A name is labelled 1 only when its predicted score is above 0.75.
above_threshold = y_pred > 0.75
yhat = np.array(above_threshold).astype('int')

print(classification_report(y_val, yhat, target_names=target_names))

             precision    recall  f1-score   support

   American       0.99      0.98      0.98     32338
  Hungarian       0.98      0.99      0.98     31939

avg / total       0.98      0.98      0.98     64277



In [58]:
# Confusion matrix for the thresholded validation predictions; each row sums to the
# per-class support shown in the classification report, so rows are the true classes.
confusion_matrix(y_val, yhat)

array([[31699,   639],
       [  375, 31564]])

#### Try some examples

In [50]:
# Encode a single 'family,given' string and score it; the output is the model's
# sigmoid score for the `hun` label.
aa = sentence_to_ix(['perge,janos'], seq_len=seq_len)
model.predict(aa)

array([[ 0.93117267]], dtype=float32)

In [52]:
# Score a second single name with a different family and given name.
aa = sentence_to_ix(['robertson,kelsea'], seq_len=seq_len)
model.predict(aa)

array([[  3.16900390e-08]], dtype=float32)

In [53]:
# Keeps the given name 'kelsea' from the previous example but swaps the family name.
aa = sentence_to_ix(['kovacs,kelsea'], seq_len=seq_len)
model.predict(aa)

array([[ 0.83647567]], dtype=float32)

In [54]:
# Combines the family name 'robertson' and the given name 'janos' used in earlier examples.
aa = sentence_to_ix(['robertson,janos'], seq_len=seq_len)
model.predict(aa)

array([[  4.67025494e-08]], dtype=float32)

In [55]:
# One more single-name check, scored the same way as the examples above.
aa = sentence_to_ix(['tseng,leo'], seq_len=seq_len)
model.predict(aa)

array([[  6.45191976e-05]], dtype=float32)