In [None]:
from xpinyin import Pinyin
import pandas as pd
from keras.layers import LSTM, Dense
from keras.models import Sequential
import numpy as np

In [2]:
p = Pinyin()
df = pd.read_csv('ChineseNames.csv')

In [3]:
#change headers to english
df.columns = ['Name', 'Gender']

In [4]:
#strip last name
def strip_last_name(name):
    return name[1:]

def convert_to_pinyin(name):
    return p.get_pinyin(name)

def process_name(name):
    name = strip_last_name(name)
    name = convert_to_pinyin(name)
    return name

In [5]:
#visualize before after processing
print(df.tail(5))
df['Name'] = df['Name'].apply(process_name)
print(df.tail(5))

     Name Gender
9792  左婉怡      F
9793  左烜晅      F
9794  左雨晴      F
9795   左越      F
9796  左子烨      F
           Name Gender
9792     wan-yi      F
9793  xuan-xuan      F
9794    yu-qing      F
9795        yue      F
9796      zi-ye      F


In [6]:
#get all unique chars
characters = {}
for i, name in enumerate(df['Name']):
    for char in name:
        if char in characters:
            characters[char] += 1
        else:
            characters[char] = 1

In [7]:
#find the longest name
maxlen = df['Name'].str.len().max()

In [8]:
#create mapping.  though it would be nice if they mapping is alphabetically ordered, but it doesn't matter mathematically.
idx = 0
for c in characters.keys():
    characters[c] = idx
    idx += 1

In [9]:
#create training example of dimension (example #, timestep, # of features (length of 1-hot vector))
num_examples = df.shape[0]
time_steps = maxlen
num_features = len(characters)
def char_to_vec(char):
    vector = np.zeros((1, num_features), dtype=int)
    idx = characters[char]
    vector[0,idx] = 1
    return vector

def name_to_vec(name):
    example = np.zeros((maxlen, num_features), dtype = int)
    for i in range(maxlen):
        if i < len(name):
            char = name[i]
            vector = char_to_vec(char)
            example[i,:] = vector[0,:]
        else:
            example[i,:] = np.zeros((num_features),dtype=int)
    return example

In [10]:
#convert all examples to vector format
X = np.empty((df.shape[0], maxlen, num_features))
for i, name in enumerate(df['Name']):
    example = name_to_vec(name)
    X[i,:,:] = example

In [11]:
Y = df['Gender'].values
Y[Y == 'M'] = 1
Y[Y == 'F'] = 0

In [109]:
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, num_features), return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
# model.add(Dense(64, activation='tanh'))
model.add(Dense(1, activation='tanh'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [110]:
model.fit(X,Y, batch_size=32, epochs=10, verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1752cb575c0>

In [111]:
#let's test on some real life examples:
X_test = np.empty((0, maxlen, num_features), dtype=int)
Y_test = np.empty((0), dtype=int)
people = ['long', #jackie chan
          'xiao-long', #bruce lee
          'bing-bing', #fan bingbing
          'zi-yi', #zhang ziyi
          'zi-dan', #donnie yen
          'lian-jie', #jet li
          'yu-ling', #lucy liu
          'en-mei', #amy tan
          'ming-na', #ming-na wen
          'ze-dong', #chairman mao
         'jin-ping'] #chairman xi
Y_test = np.array([1,1,0,0,1,1,0,0,0,1,1])
for name in people:
    array = name_to_vec(name)
    X_test = np.concatenate((X_test, np.expand_dims(array, 0)), 0)

In [112]:
#predict
predictions = model.predict_classes(X_test)
for i, prediction in enumerate(predictions):
    if prediction != Y_test[i]:
        print('mis-classified: ', people[i])
print()
print('accuracy: ', sum(predictions[:,0] == Y_test)/Y_test.shape[0])

mis-classified:  bing-bing
mis-classified:  lian-jie
mis-classified:  jin-ping

accuracy:  0.7272727272727273


### Result Analysis Part 1 - Famous People [72% Accuracy]
The result is more impressive than I expected even after just 10 epochs of training. 72% accuracy is VERY HIGH for classifying chinese names.  I would have expected Bing Bing to be classified as female since any name ending in -ing is more commonly female than male in my experience.  I am however, not at all surprised that Jet Li and Chairman Xi were mis-classified since both Lian and Ping are both very feminine names in my experience. 

In [113]:
#some of the names that comes to mind from grade school
X_test = np.empty((0, maxlen, num_features), dtype=int)
Y_test = np.empty((0), dtype=int)
people = ['ting-pei',
          'xin',
          'jin-hao',
          'zhe-an',
          'yi-cheng',
          'zi-jun',
          'zhi-hao',
          'wei-han',
          'guan-yu',
          'xiu-qi',
          'jun-de']
Y_test = np.array([0,0,1,1,1,1,1,1,0,1,1])
for name in people:
    array = name_to_vec(name)
    X_test = np.concatenate((X_test, np.expand_dims(array, 0)), 0)

In [114]:
#predict
predictions = model.predict_classes(X_test)
for i, prediction in enumerate(predictions):
    if prediction != Y_test[i]:
        print('mis-classified: ', people[i])
print()
print('accuracy: ', sum(predictions[:,0] == Y_test)/Y_test.shape[0])

mis-classified:  ting-pei
mis-classified:  xiu-qi

accuracy:  0.8181818181818182


### Result Analysis Part 2 - Names from Grade School [72% Accuracy]
Again we have 72% accuracy, which is awesome!  I was surprised that ting-pei was classified as male, because ting by itself will classify as female and pei is a more commonly a feminine name base on my experience. I knew both zi-jun and xiu-qi were going to be mis-classified right off the bat and they were. This shows the model's predictive power is similiar to that of people.

### Result Analysis Part 3 - Names of Immediate Family Members [80% Accuracy]
The model achieved a whopping 80% accuracy for 15 of my close family members. The names are withheld for privacy purposes. 3 of whom were mis-classified, and out of the 3, 2 of them were known to have names that leans towards their opposite gender.

In [115]:
#some of the names that comes to mind from grade school
X_test = np.empty((0, maxlen, num_features), dtype=int)
Y_test = np.empty((0), dtype=int)
people = ['xiao-qi', 
          'fu-qi', 
          'ping', 
          'hua', 
          'yong-gang', 
          'jing-zhong', 
          'zhong-yu', 
          'jia-yu', 
          'yu-ling', 
          'rei-wen']
Y_test = np.array([0,0,0,0,1,1,1,1,0,1])
for name in people:
    array = name_to_vec(name)
    X_test = np.concatenate((X_test, np.expand_dims(array, 0)), 0)
    
#predict
predictions = model.predict_classes(X_test)
for i, prediction in enumerate(predictions):
    if prediction != Y_test[i]:
        print('mis-classified: ', people[i])
print()
print('accuracy: ', sum(predictions[:,0] == Y_test)/Y_test.shape[0])

mis-classified:  ping
mis-classified:  hua
mis-classified:  jia-yu

accuracy:  0.7


### Reference:
Dataset: https://www.researchgate.net/publication/269630594_9800_Chinese_Names_with_Gender
