## Mandarin name gender predictor

### Overview

Binary classification of Mandarin given names by gender, comparing three input representations:

- Using Bag of words
- Using Pinyin (tone on/off is a parameter)
- Using Chinese word embeddings (expect a much higher accuracy)

In [127]:
import pandas as pd
import numpy as np

def load_data(path="./Data/9800ChineseNamesnamegender_ORIGINAL.xlsx"):
    """Load the name/gender spreadsheet and return given names with labels.

    Args:
        path: Location of the Excel workbook. It must provide the columns
            "姓名" (full name) and "性别" (gender, "男" or "女").

    Returns:
        A DataFrame with the columns "Name" and "Gender". "Name" is the full
        name with its first character removed, kept only when 1 or 2
        characters remain. "Gender" is 0 for "男" and 1 for "女".
    """
    raw = pd.read_excel(path)

    # Work on an explicit copy so the column assignments below never write
    # into a view of the freshly read frame.
    df = raw[["姓名", "性别"]].copy()
    df.columns = ["Name", "Gender"]

    # Remove surname (the first character is treated as the surname)
    df["Name"] = df["Name"].map(lambda x: x[1:])

    # Keep only given names that are 1 or 2 characters long
    nameLength = df["Name"].map(len)
    df = df[nameLength.between(1, 2)].copy()

    # Encode the labels => M:0, F:1
    df["Gender"] = df["Gender"].map({"男": 0, "女": 1})

    return df

### 1. Inputs containing only phonetic information (pinyin only)

This is applicable to the original question at hand. However, since we are completely discarding the semantic information of the characters, it is not expected to perform as well as using or combining character embeddings.

In [128]:
import pinyin

def getPinyinDF(tone=False):
    """Return the dataset with each given name converted to pinyin.

    Args:
        tone: When False (default) tones are dropped, e.g. "fang zhou".
            When True the `pinyin` package's "numerical" format is used,
            which appends the tone digit to every syllable. Downstream code
            that only knows the letters a-z will presumably treat those
            digits as unknown characters.

    Returns:
        The DataFrame produced by `load_data()` with "Name" replaced by the
        space-delimited pinyin of each character.
    """
    # "strip" removes tones; "numerical" keeps them as trailing digits
    pinyinFormat = "numerical" if tone else "strip"

    def processToPinyin(x):
        return pinyin.get(x, format=pinyinFormat, delimiter=" ")

    df = load_data()
    df["Name"] = df["Name"].map(processToPinyin)
    return df

# Build the pinyin version of the dataset and preview its first three rows
df = getPinyinDF()
df.head(3)

Unnamed: 0,Name,Gender
0,chao,0
1,fang zhou,0
2,lin feng,0


#### 1.1. Bag-of-words representation

Convert each pinyin syllable to the BoW representation. This approach works surprisingly well because there are only around 400 valid syllables in Mandarin (1600 if we also include the tone combinations).

<img src="img/BoW.png" width=450 height=250/>

In [129]:
def getBagOfWords(names):
    """Build a bag-of-words count matrix over whitespace-separated syllables.

    Args:
        names: Iterable of strings (for example a pandas Series) such as
            'chao' or 'fang zhou'.

    Returns:
        A float ndarray of shape (number of names, number of distinct
        syllables). Entry [r, c] counts how often syllable c occurs in name
        r. Columns follow the sorted order of the syllables, so the layout
        is identical from run to run.
    """
    # 'chao' -> ['chao'], 'fang zhou' -> ['fang', 'zhou']
    tokenised = [name.split() for name in names]

    # Sort the vocabulary: plain set iteration order for strings can change
    # between interpreter runs, which would shuffle the feature columns.
    vocabulary = sorted({syllable for tokens in tokenised for syllable in tokens})
    columnOf = {syllable: col for col, syllable in enumerate(vocabulary)}

    # One row per input name (taken from the argument, not from a global)
    bagOfWords = np.zeros((len(tokenised), len(columnOf)))

    for row, tokens in enumerate(tokenised):
        for syllable in tokens:
            bagOfWords[row, columnOf[syllable]] += 1

    return bagOfWords

The performance of baseline models on BoW without any hyperparameter tuning is reasonable.

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def evalBaseModels(X, y):
    """Fit a set of untuned baseline classifiers and print their test accuracy.

    The data is split once with `train_test_split(X, y, random_state=0)`.
    Every model is fitted on the training part and scored on the test part.
    Results are printed in ascending order of accuracy.

    A model that raises while fitting or scoring is skipped, and a warning
    naming the model and the error is emitted instead of dropping it
    silently.

    Args:
        X: Feature matrix accepted by the scikit-learn estimators below.
        y: Target labels aligned with the rows of X.
    """
    import warnings

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Initialise all models (no optimization performed)
    models = [KNeighborsClassifier(),
              BernoulliNB(),
              MultinomialNB(),
              LogisticRegression(random_state=0),
              DecisionTreeClassifier(random_state=0),
              SVC(random_state=0),
              RandomForestClassifier(random_state=0)]

    modelAcc = {}
    for clf in models:
        modelName = clf.__class__.__name__
        try:
            clf.fit(X_train, y_train)
            modelAcc[modelName] = clf.score(X_test, y_test)
        except Exception as err:
            # Best effort: keep evaluating the remaining models, but say
            # which one was left out and why.
            warnings.warn(f"Skipping {modelName}: {err}")

    # Print according to ascending accuracy
    accs = sorted(modelAcc.items(), key=lambda x: x[1])
    for model, acc in accs:
        print(f"Accuracy of {model}: {acc:.4f}")

# Baseline scores using bag-of-words features built from the pinyin names
evalBaseModels(getBagOfWords(df["Name"]), df["Gender"])

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy of KNeighborsClassifier: 0.6553
Accuracy of DecisionTreeClassifier: 0.6865
Accuracy of RandomForestClassifier: 0.6923
Accuracy of BernoulliNB: 0.7005
Accuracy of MultinomialNB: 0.7009
Accuracy of LogisticRegression: 0.7046
Accuracy of SVC: 0.7120


#### 1.2. Bi-LSTM model


In [131]:
import string

# Rebuild the pinyin dataset for the character-level model below
df = getPinyinDF()

# 1. Get the (unique) character list and mapping dictionary
def getCList(names):
    """Return the character vocabulary and its one-hot lookup table.

    The vocabulary is the 26 lowercase ASCII letters plus the two special
    tokens '[PAD]' and '[UNKNOWN]', in sorted order. `names` is accepted but
    not consulted: the vocabulary is fixed.

    Returns:
        A tuple (charList, oneHotMapper) where oneHotMapper maps every
        vocabulary entry to a float vector with a single 1 at the entry's
        position in charList.
    """
    vocab = sorted(['[PAD]', '[UNKNOWN]', *string.ascii_lowercase])

    # Row i of the identity matrix is the one-hot vector for position i;
    # copy each row so the vectors do not share memory.
    identity = np.eye(len(vocab))
    oneHotMapper = {symbol: identity[pos].copy() for pos, symbol in enumerate(vocab)}

    return (vocab, oneHotMapper)

def getEncodedNames(names, mappingDict):
    """Encode every name as a fixed-length sequence of character vectors.

    All names are padded to the length of the longest one. Positions past
    the end of a name, and spaces inside it, become mappingDict['[PAD]'].
    Characters missing from mappingDict become mappingDict['[UNKNOWN]'].

    Returns:
        An ndarray stacking one encoded sequence per name.
    """
    width = max(map(len, names))

    def vectorAt(text, pos):
        # Past the end of the text, or on the space between two syllables
        if pos >= len(text) or text[pos] == ' ':
            return mappingDict['[PAD]']
        symbol = text[pos]
        if symbol in mappingDict:
            return mappingDict[symbol]
        return mappingDict['[UNKNOWN]']

    rows = [np.array([vectorAt(text, pos) for pos in range(width)])
            for text in names]
    return np.array(rows)

# One-hot encode every pinyin name, character by character
names = df['Name']
charList, mapper = getCList(names)
encodedNames = getEncodedNames(names, mapper)

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix

class Bi_LSTM1(nn.Module):
    """Bidirectional LSTM classifier over sequences of feature vectors.

    The final hidden states of the two directions are concatenated and
    passed through one linear layer that produces `num_classes` scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions are concatenated, hence 2 * hidden_size inputs
        self.linear = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Return the class scores for a batch-first input `x`."""
        _, (final_hidden, _) = self.lstm(x)
        first_direction, second_direction = final_hidden[0], final_hidden[1]
        joined = torch.cat([first_direction, second_direction], dim=1)
        return self.linear(joined)

def trainNetwork(X, y, model, criterion, optimiser, nEpochs=1000, printInterval=200):
    """Train `model` with full-batch updates and report train/test accuracy.

    The data is split once with `train_test_split(X, y, random_state=0)`.
    Every epoch is a single optimiser step on the whole training split.
    After the last epoch the model is switched to evaluation mode and scored
    on the held-out split with gradient tracking disabled.

    Args:
        X: Model inputs, indexable by `train_test_split`.
        y: Target class indices aligned with X.
        model: Module called as `model(X_train)`; it must return one score
            per class for every row.
        criterion: Loss called as `criterion(scores, targets)`.
        optimiser: Optimiser already bound to the model's parameters.
        nEpochs: Number of full-batch updates.
        printInterval: Loss and train accuracy are printed every this many
            epochs.

    Note:
        The printed train accuracy comes from the forward pass made *before*
        that epoch's parameter update. The model is left in evaluation mode
        when the function returns.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Train mode
    model.train()

    for epoch in range(nEpochs):

        # Zero the parameter gradients
        optimiser.zero_grad()

        # Forward-propagation
        y_pred = model(X_train)

        # Calculate error
        loss = criterion(y_pred, y_train)

        # Optimise
        loss.backward()
        optimiser.step()

        # Print statistics (every `printInterval` epochs)
        if (epoch + 1) % printInterval == 0:
            predicted = torch.argmax(y_pred.detach(), 1)
            train_acc = accuracy_score(y_train, predicted)
            print('Epoch %d, loss: %.4f, train accuracy: %.4f' %(epoch + 1, loss.item(), train_acc))

    # Result: evaluate in eval mode and without building an autograd graph
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
    predicted = torch.argmax(y_pred, 1)
    testAcc = accuracy_score(y_test, predicted)
    print(f'\nTest accuracy : {testAcc:.4f}')

# Character-level Bi-LSTM: one input feature per vocabulary entry, 2 classes
hidden_size = 10
model = Bi_LSTM1(len(charList), hidden_size, 2)
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.1)

# One-hot encoded names as float tensors; labels as a tensor built from the Gender column
X = torch.from_numpy(encodedNames).float()
y = torch.from_numpy(np.array(df['Gender']))

# 500 epochs, printing progress every 50 epochs
trainNetwork(X, y, model, criterion, optimizer, 500, 50)

Epoch 50, loss: 0.5921, train accuracy: 0.6853
Epoch 100, loss: 0.5373, train accuracy: 0.7206
Epoch 150, loss: 0.4816, train accuracy: 0.7460
Epoch 200, loss: 0.4478, train accuracy: 0.7699
Epoch 250, loss: 0.4203, train accuracy: 0.7876
Epoch 300, loss: 0.4171, train accuracy: 0.7912
Epoch 350, loss: 0.3806, train accuracy: 0.8122
Epoch 400, loss: 0.3587, train accuracy: 0.8270
Epoch 450, loss: 0.3590, train accuracy: 0.8231
Epoch 500, loss: 0.3571, train accuracy: 0.8251

Test accuracy : 0.6689


In [133]:
def getPrediction(model, name, mappingDict, seq_length=None):
    """Print the predicted gender for one pinyin name.

    Args:
        model: Callable taking a float tensor of shape
            (1, seq_length, vector size) and returning two class scores.
        name: Pinyin string such as 'xiao long'.
        mappingDict: Character -> vector lookup containing the '[PAD]' and
            '[UNKNOWN]' entries.
        seq_length: Number of positions to encode. When omitted, the padded
            length of the module-level `encodedNames` is used so the input
            matches what the network saw during training. The vocabulary
            size is unrelated to sequence length and is no longer used here.
            A name longer than that length is never truncated.
    """
    if seq_length is None:
        seq_length = max(encodedNames.shape[1], len(name))

    nameEncoding = []
    for i in range(seq_length):
        # Pad past the end of the name and on the space between syllables
        if i >= len(name) or name[i] == ' ':
            nameEncoding.append(mappingDict['[PAD]'])
        # Unknown character
        elif name[i] not in mappingDict:
            nameEncoding.append(mappingDict['[UNKNOWN]'])
        else:
            nameEncoding.append(mappingDict[name[i]])

    # Batch of one
    batch = torch.as_tensor(np.array([nameEncoding]), dtype=torch.float32)

    # Inference only: no autograd graph needed
    with torch.no_grad():
        output = model(batch)

    y_pred = int(torch.argmax(output))
    D = {0: "Male", 1: "Female"}
    print(f"Prediction for {name}: {D[y_pred]}")

# A few hand-picked pinyin given names to sanity-check the trained model.
# Note: this rebinds `names`, which previously held the dataset's Name column.
names = ['xiao long',
         'xiang dong',
         'yi nuo',
         'xiao ming',
         'xin yi',
         'zi han']

# Print one prediction per sample name
for name in names: getPrediction(model, name, mapper)

Prediction for xiao long: Male
Prediction for xiang dong: Male
Prediction for yi nuo: Female
Prediction for xiao ming: Female
Prediction for xin yi: Female
Prediction for zi han: Male


### 2. Inputs containing all character information (pre-trained embeddings)

Directly use Chinese characters via embeddings. The embeddings can be trained separately but we use pre-trained embeddings derived here : https://www.kaggle.com/datasets/guiyihan/chinesewordvectors.
The embedding dimension is 300 (assumed to be Word2Vec embeddings).



In [134]:
# May take a few minutes
# Maps the first token of every line to the vector parsed from the rest of it.
embeddings = {}
# Read explicitly as UTF-8 so Chinese tokens decode the same way on every
# platform instead of depending on the locale's default encoding.
with open('./Data/sgns.merge.word.txt', 'r', encoding='utf-8') as f:
    # Skip first line (presumably a header; an empty file is tolerated)
    next(f, None)
    for line in f:
        # Split once: the token, then the space-separated numbers
        char, sep, vec = line.partition(' ')
        if not sep:
            # Blank or malformed line without any vector part
            continue
        embeddings[char] = np.fromstring(vec, sep=' ')

In [135]:
# Get embedding of each name
def getNameEmbeddings(nameList, embDict):
    """Return one averaged character embedding per name.

    Args:
        nameList: Iterable of strings; every character is looked up
            individually.
        embDict: Character -> 1-D vector lookup. All vectors are expected to
            share one length.

    Returns:
        An ndarray with one row per name: the mean of that name's character
        vectors. Characters missing from embDict contribute a zero vector,
        and an empty name yields an all-zero row.
    """
    # Take the dimensionality from the table itself; 300 is only the
    # fallback for an empty table.
    dim = len(next(iter(embDict.values()))) if embDict else 300
    missing = np.zeros(dim)

    emb = []
    for name in nameList:

        # 1. Get embedding of each character
        vectors = [embDict.get(char, missing) for char in name]

        # 2. Get the average of the embeddings
        # Averaging is the most common way but array concatenation could
        # also be considered since names have at most 2 characters here
        averaged = np.mean(vectors, axis=0) if vectors else missing.copy()

        # 3. Add to list
        emb.append(averaged)

    return np.array(emb)

The performance of baseline models on character embeddings is noticeably better than the BoW representation.

In [136]:
# Reload the names as Chinese characters (not pinyin) and score the
# baseline models on their averaged character embeddings
df = load_data()
X, y = getNameEmbeddings(df['Name'], embeddings), np.array(df['Gender'])
evalBaseModels(X, y)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy of DecisionTreeClassifier: 0.7071
Accuracy of BernoulliNB: 0.7301
Accuracy of KNeighborsClassifier: 0.7486
Accuracy of RandomForestClassifier: 0.7675
Accuracy of LogisticRegression: 0.7818
Accuracy of SVC: 0.7925


In [137]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score
import torch.nn.functional as F

class ANN1(nn.Module):
    """Feed-forward classifier with a single ReLU hidden layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the output-layer scores for input `x`."""
        hidden = F.relu(self.linear1(x))
        return self.linear2(hidden)

# Feed-forward network: 300 inputs, 10 hidden units, 2 classes
model = ANN1(300, 10, 2)
criterion = nn.CrossEntropyLoss() 
optimiser = optim.SGD(model.parameters(), lr=0.01)

# Averaged character embeddings as float tensors; labels as a tensor built from the Gender column
X = torch.from_numpy(getNameEmbeddings(df['Name'], embeddings)).float()
y = torch.from_numpy(np.array(df['Gender']))

# Uses trainNetwork's defaults: 1000 epochs, progress printed every 200
trainNetwork(X, y, model, criterion, optimiser)

Epoch 200, loss: 0.6725, train accuracy: 0.5144
Epoch 400, loss: 0.6288, train accuracy: 0.7488
Epoch 600, loss: 0.5800, train accuracy: 0.7703
Epoch 800, loss: 0.5354, train accuracy: 0.7796
Epoch 1000, loss: 0.5023, train accuracy: 0.7876

Test accuracy : 0.7576


In [138]:
def getPrediction(model, name, embDict):
    """Print the predicted gender for a full Chinese name.

    The first character of `name` is dropped as the surname. The remaining
    characters are embedded and averaged, and the result is passed to
    `model`.

    Args:
        model: Callable taking a 300-element float tensor and returning two
            class scores.
        name: Full name, surname first, e.g. '赵丽颖'.
        embDict: Character -> 300-dimensional vector lookup. Characters it
            does not contain contribute a zero vector.

    Raises:
        ValueError: If nothing is left after removing the first character.
            Averaging an empty list would otherwise produce NaN.
    """
    givenName = name[1:]
    if not givenName:
        raise ValueError(
            f"Expected a surname followed by a given name, got {name!r}")

    missing = np.zeros(300)
    nameEmb = np.mean([embDict.get(char, missing) for char in givenName], axis=0)

    # Inference only: no autograd graph needed
    with torch.no_grad():
        output = model(torch.as_tensor(nameEmb, dtype=torch.float32))

    y_pred = int(torch.argmax(output))
    D = {0:"Male", 1:"Female"}
    print(f"Prediction for {name}: {D[y_pred]}")

# Full names (surname first) of well-known people, used as a sanity check
names = ['马云',    # Jack Ma
         '赵丽颖',  # Zanilia Zhao
         '周杰伦',   # Jay Chou
         '邓小平']   # Deng Xiaoping

# Print one prediction per sample name
for name in names: getPrediction(model, name, embeddings)

Prediction for 马云: Female
Prediction for 赵丽颖: Female
Prediction for 周杰伦: Male
Prediction for 邓小平: Male


In [143]:
# 1. Collect every distinct character used in the names, plus the two
#    special tokens, and put them in sorted order
char_set = set("".join(df['Name'])) | {'[PAD]', '[UNKNOWN]'}
char_list = sorted(char_set)

# 2. Character -> position in the sorted list
char_index = dict(zip(char_list, range(len(char_list))))

In [140]:
def getEmbeddingTable(embDict, char_list):
    """Stack the embeddings of `char_list` into one tensor, row by row.

    Args:
        embDict: Character -> 1-D vector lookup. All vectors are expected to
            share one length.
        char_list: Characters in the order their rows should appear.

    Returns:
        A tensor with one row per entry of char_list. Entries missing from
        embDict (such as special tokens) get an all-zero row.
    """
    # Take the dimensionality from the supplied table; 300 is only the
    # fallback for an empty table.
    dim = len(next(iter(embDict.values()))) if embDict else 300
    missing = np.zeros(dim)

    # Look up in the dictionary that was passed in, not in a module global
    emb_table = [embDict.get(c, missing) for c in char_list]
    return torch.from_numpy(np.array(emb_table))

def getEncodedNames(char_index, names=None):
    """Encode names as rows of character indices, padded to a common width.

    Args:
        char_index: Character -> integer index lookup containing the
            '[PAD]' and '[UNKNOWN]' entries.
        names: Iterable of name strings. When omitted, the module-level
            `df['Name']` column is encoded (the original behaviour).

    Returns:
        An int64 tensor with one row per name. Rows are padded with the
        '[PAD]' index to a width of at least 2, or to the longest name if
        that is longer. Characters missing from char_index map to the
        '[UNKNOWN]' index.
    """
    if names is None:
        names = df['Name']
    names = list(names)

    # At least two columns, matching the 1-2 character names of the dataset
    width = max([2] + [len(name) for name in names])

    names_encoded = []
    for name in names:
        ids = [char_index[char] if char in char_index else char_index['[UNKNOWN]']
               for char in name]
        shortfall = width - len(ids)
        if shortfall:
            ids += [char_index['[PAD]']] * shortfall
        names_encoded.append(ids)

    return torch.from_numpy(np.array(names_encoded, dtype=np.int64))

In [141]:
# Pre-trained vectors for every entry of char_list, plus the vocabulary size
emb_table = getEmbeddingTable(embeddings, char_list)
char_count = len(char_list)

In [142]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define the model
class Bi_LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pre-trained embeddings.

    Args:
        input_size: Embedding dimension, which is also the LSTM input size.
        hidden_size: Hidden units per LSTM direction.
        num_classes: Number of output scores.
        pretrained: Optional (vocabulary size, input_size) array or tensor
            of embedding weights. When omitted, the module-level `emb_table`
            and `char_count` are used (the original behaviour).

    Raises:
        ValueError: If the embedding table's shape does not match
            (vocabulary size, input_size).
    """

    def __init__(self, input_size, hidden_size, num_classes, pretrained=None):
        super(Bi_LSTM, self).__init__()

        if pretrained is None:
            weights, vocab_size = emb_table, char_count
        else:
            weights = torch.as_tensor(pretrained)
            vocab_size = weights.shape[0]

        # Fail early: copy_ would otherwise raise a less specific error, or
        # silently broadcast a table of the wrong shape.
        if tuple(weights.shape) != (vocab_size, input_size):
            raise ValueError(
                f"Embedding table has shape {tuple(weights.shape)}, "
                f"expected {(vocab_size, input_size)}")

        # Load the pre-trained vectors and keep them fixed during training
        self.emb = nn.Embedding(vocab_size, input_size)
        self.emb.weight.data.copy_(weights)
        self.emb.weight.requires_grad = False

        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        # Two directions are concatenated, hence hidden_size * 2 inputs
        self.linear = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return the class scores for a batch of character-index rows."""
        # Get the embedded tensor
        x = self.emb(x)
        lstm_out, (h_n, c_n) = self.lstm(x)
        # Concatenate the final hidden state of both directions
        hidden_out = torch.cat((h_n[0,:,:],h_n[1,:,:]),1)
        z = self.linear(hidden_out)
        return z

# Bi-LSTM over frozen 300-dimensional character embeddings, 2 classes
hidden_size = 10
model = Bi_LSTM(300, hidden_size, 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# NOTE(review): X_train is not referenced again in the visible code, and
# trainNetwork makes its own train/test split. This looks like a redundant
# second encoding of the same data; confirm before removing.
X_train = getEncodedNames(char_index)

# Names as rows of character indices; labels as a tensor built from the Gender column
X = getEncodedNames(char_index)
y = torch.from_numpy(np.array(df['Gender']))

# 200 epochs, printing progress every 20 epochs
trainNetwork(X, y, model, criterion, optimizer, nEpochs=200, printInterval=20)

Epoch 20, loss: 0.4162, train accuracy: 0.8062
Epoch 40, loss: 0.3395, train accuracy: 0.8489
Epoch 60, loss: 0.2613, train accuracy: 0.8903
Epoch 80, loss: 0.1915, train accuracy: 0.9234
Epoch 100, loss: 0.1446, train accuracy: 0.9408
Epoch 120, loss: 0.1212, train accuracy: 0.9462
Epoch 140, loss: 0.1110, train accuracy: 0.9467
Epoch 160, loss: 0.1064, train accuracy: 0.9467
Epoch 180, loss: 0.1046, train accuracy: 0.9466
Epoch 200, loss: 0.1028, train accuracy: 0.9467

Test accuracy : 0.7728
