In [1]:
import pandas as pd
import numpy as np

# Load male data
df_male = pd.read_csv('data/male.txt', sep="\n", header=4)
df_male.columns = ['name']
df_male['y'] = pd.Series('M', index=df_male.index)

# Load female data
df_female = pd.read_csv('data/female.txt', sep="\n", header=4)
df_female.columns = ['name']
df_female['y'] = pd.Series('F', index=df_female.index)

# Join in same DataFrame
df = pd.concat([df_male, df_female])


In [2]:
df['name_length'] = pd.Series([len(el) for el in df['name']])

In [3]:
# Wordify names
df['wordified_names'] = [list(name.lower()) for name in df['name']]

In [4]:
df = df[['name', 'wordified_names', 'name_length', 'y']]
df.head(5)

Unnamed: 0,name,wordified_names,name_length,y
0,Aamir,"[a, a, m, i, r]",5,M
1,Aaron,"[a, a, r, o, n]",5,M
2,Abbey,"[a, b, b, e, y]",5,M
3,Abbie,"[a, b, b, i, e]",5,M
4,Abbot,"[a, b, b, o, t]",5,M


In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(random_state=42)

train_index, test_index = next(sss.split(df['name'], df['y']))
df_train = df.iloc[train_index]
df_test = df.iloc[test_index]

print("The average person's name is {} characters long, "
  "99% of people's names are {} characters long or less, "
  "while the longest name is {} characters long.".format(df_train['name_length'].mean(),
                                                         df_train['name_length'].quantile(0.99),
                                                         df_train['name_length'].max()))

max_seq_len = min(df_train['name_length'].max(), 20)
print('Max seq len will be {}'.format(max_seq_len))

The average person's name is 5.97552105189537 characters long, 99% of people's names are 10.0 characters long or less, while the longest name is 15 characters long.
Max seq len will be 15


In [119]:
from keras.preprocessing.text import Tokenizer

X_tr = list(df_train['name'])
X_te = list(df_test['name'])

y_tr = list(df_train['y'])
y_te = list(df_test['y'])

from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_tr_bin = lb.fit_transform(y_tr)
y_te_bin = lb.transform(y_te)

keras_tokenizer = Tokenizer(lower=True, char_level=True, oov_token='<OOV>')
keras_tokenizer.fit_on_texts(X)


[[1]
 [0]
 [0]
 ...
 [0]
 [1]
 [1]]


In [130]:
from keras.preprocessing.sequence import pad_sequences
X_tr_seq = keras_tokenizer.texts_to_sequences(X_tr) # paddings are 0s, OOVs are 1s, and rest are the one hot representations
X_tr_pad = pad_sequences(X_tr_seq, maxlen=15, dtype='int32', padding='post', truncating='post', value=0.0)
X_tr_pad_oh = [keras_tokenizer.sequences_to_matrix([[i] for i in X_tr_pad_i]) for X_tr_pad_i in X_tr_pad]
X_tr_pad_oh = np.array(X_tr_pad_oh)
num_oh_vecs = len(X_tr_pad_oh[0])
len_oh_vecs = len(X_tr_pad_oh[0][0])
print(num_oh_vecs, len_oh_vecs)

X_te_seq = keras_tokenizer.texts_to_sequences(X_te) # paddings are 0s, OOVs are 1s, and rest are the one hot representations
X_te_pad = pad_sequences(X_te_seq, maxlen=15, dtype='int32', padding='post', truncating='post', value=0.0)
X_te_pad_oh = [keras_tokenizer.sequences_to_matrix([[i] for i in X_te_pad_i]) for X_te_pad_i in X_te_pad]
X_te_pad_oh = np.array(X_te_pad_oh)

15 31


In [144]:
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

BATCH_SIZE = 16

model = Sequential()
model.add(LSTM(256, return_sequences=False, input_shape=(num_oh_vecs, len_oh_vecs)))
model.add(Dropout(0.2))
model.add(Dense(units=128))
model.add(Dropout(0.2))
model.add(Dense(units=1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_16 (LSTM)               (None, 256)               294912    
_________________________________________________________________
dropout_15 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 129       
_________________________________________________________________
activation_14 (Activation)   (None, 1)                 0         
Total params: 327,937
Trainable params: 327,937
Non-trainable params: 0
_________________________________________________________________


In [146]:
model.fit(X_tr_pad_oh, y_tr_bin, batch_size=BATCH_SIZE, epochs=200, validation_data=(X_te_pad_oh, y_te_bin))

Train on 7149 samples, validate on 795 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200

KeyboardInterrupt: 

In [151]:
y_hat = np.round(model.predict(X_te_pad_oh))
y_hat


# from keras.preprocessing.sequence import pad_sequences
# # keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
# X_pad = pad_sequences(df_train['wordified_names'], maxlen=max_seq_len, dtype='object', padding='post', truncating='post', value=0)
# X_pad = [c for c in [x for x in X_pad]]
# X_pad 

array([[0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],

In [152]:
from sklearn.metrics import classification_report

print(classification_report(y_te_bin, y_hat))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       500
           1       0.75      0.77      0.76       295

   micro avg       0.82      0.82      0.82       795
   macro avg       0.81      0.81      0.81       795
weighted avg       0.82      0.82      0.82       795



In [187]:
np.set_printoptions(suppress=True)
test_sample = 'fernandinha'
sample_seq = keras_tokenizer.texts_to_sequences([test_sample])
sample_pad = pad_sequences(sample_seq, maxlen=15, dtype='int32', padding='post', truncating='post', value=0.0)
sample_pad_oh = [keras_tokenizer.sequences_to_matrix([[i] for i in sample_pad_i]) for sample_pad_i in sample_pad]
sample_pad_oh = np.array(sample_pad_oh)

model.predict(sample_pad_oh)
# print(test_sample_preprocessed)

array([[0.26187357]], dtype=float32)

In [None]:
def int_to_onehot(n, n_classes):
    v = [0] * n_classes
    v[n] = 1
    return v

def onehot_to_int(v):
    return v.index(1)

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
data = [['c', 'o', 'l', 'd'], ['w', 'a', 'r', 'm'], ['h', 'o', 't']]
# Get dictionary

values = array(data)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
# invert first example
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

In [None]:
from keras.preprocessing.text import one_hot

chartypes = set.union(*[set(n) for n in df_train['wordified_names']])
num_chartypes = len(chartypes)
def name_to_bow(name, encoding_size):
    return [int_to_onehot(one_hot(c, encoding_size)[0], encoding_size) for c in name]

sample = df_train['wordified_names'].iloc[1]
# print(sample)
sample2 = [' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
print(sample2)
# print(sample)
# [one_hot(c, num_chartypes)[0] for c in sample]
[one_hot(c, num_chartypes+20) for c in sample2]
# oh = [one_hot(c, num_chartypes)[0] for c in [name for name in df_train['wordified_names']]]

# X = [name_to_bow(name, num_chartypes) for name in df_train['wordified_names']]

# le = LabelEncoder()
# X = le.fit_transform(df_train['wordified_names'])
# len(X)

In [None]:
sample = df_train['wordified_names'].iloc[1]
sample

In [None]:
names = ['hello', 'world']
