In [1]:
from gender import Gender

In [2]:
#loading test file

def test_set(file='data/train_ds.txt', full_name=True):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    for line in f:
        line = line.split('\t')
        try:
            if full_name:
            # line[2] = full name; line[1] = first name
                names.append(line[2])
            else:
                names.append(line[1])
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Load the data with the default arguments (full names) and show the length
# of both lists, which should match.
names, genders = test_set()
len(names), len(genders)

(2926486, 2926486)

In [3]:
# calculating the prediction accuracy on the currently loaded `names` (full names at this point)

def accuracy(method='lap', include_U=True):
    """Score ``Gender().predict`` against the module-level ``names``/``genders``.

    Each name is passed to ``predict(name, method, False)`` and element 1 of
    the returned value is compared with the observed gender at the same index.

    Args:
        method: Passed through to ``predict`` unchanged (the notebook uses
            ``'lap'`` and ``'gt'``).
        include_U: If False, entries whose observed gender is ``'U'`` are
            neither predicted nor counted in the denominator.

    Returns:
        ``(score, mismatch)`` where ``score`` is correct predictions divided
        by the number of evaluated entries, and ``mismatch`` is a list whose
        first item is a header tuple followed by one
        ``(prediction, '<------>', observed)`` tuple per wrong prediction.
    """
    hits = 0
    mismatch = [('prediction', '<------>', 'observation')]
    predict = Gender().predict
    evaluated = 0
    for idx, name in enumerate(names):
        # Undefined-gender records are dropped entirely when excluded.
        if not include_U and genders[idx] == 'U':
            continue
        evaluated += 1
        pred = predict(name, method, False)
        observed = genders[idx]
        if pred[1] == observed:
            hits += 1
        else:
            mismatch.append((pred, '<------>', observed))
    return hits / evaluated, mismatch

## Test against full names

In [4]:
# calculating accuracy of the Laplace method, including all genders ('U' records are kept)
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9376819161273965,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('武美啸', 'M', 0.5175399048014521), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('吴乐懿', 'M', 0.6334930288052831), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('褚飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('沈云华', 'M', 0.3768863196945002), '<------>', 'U'),
  (('施正漪', 'F', 0.7944377272158326), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('柯逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('吴宝懿', 'M', 0.684011403531615), '<------>', 'U'),
  (('赵珣', 'M', 0.4626865671641791), '<------>', 'U'),
  (('魏俊华', 'M', 0.812477928343533), '<------>', 'U'),
  (('卞郡', 'F', 0.5093457943925234), '<------>', 'M'),
  (('

In [5]:
# calculating accuracy of the Laplace method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9624090238263583,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('武美啸', 'M', 0.5175399048014521), '<------>', 'F'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('褚飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('施正漪', 'F', 0.7944377272158326), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('卞郡', 'F', 0.5093457943925234), '<------>', 'M'),
  (('段常桃', 'F', 0.6254946956457782), '<------>', 'M'),
  (('栾德慧', 'F', 0.5536782694266356), '<------>', 'M'),
  (('李旭彤', 'M', 0.5467137154925025), '<------>', 'F'),
  (('乔海云', 'M', 0.5714315674455618), '<------>', 'F'),
  (('王越M', 'Undefined', 0.7078066158956854), '<------>', 'F'),
  (('游松菱', 'F', 0.7711401793521472), '<------>', 'M'),
  (('饶璃', 'F', 0.5862068965517242), '<------>', 'M'),
  (('闻思嘉', 'F', 0.5991541905808809), '<------>', 'M'),
  (('杨坷潼', 'F', 0.6194318626785917), '<------>', 'M'),
  (('蒲厘杉', 'M', 0.6062027508955053), '<------>', 

In [6]:
# calculating accuracy of the Good-Turing method, including all genders ('U' records are kept)
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9307989171996722,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('武美啸', 'M', 0.7945392799222982), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('吴乐懿', 'M', 0.6174312283858132), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('沈云华', 'M', 0.3768863196945002), '<------>', 'U'),
  (('施正漪', 'F', 0.7621473175264708), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('柯逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('吴宝懿', 'M', 0.662937360224399), '<------>', 'U'),
  (('赵珣', 'M', 0.8688826744629464), '<------>', 'U'),
  (('魏俊华', 'M', 0.812477928343533), '<------>', 'U'),
  (('贾渝', 'F', 0.5544634224943603), '<------>', 'M'),
  (('卞郡', 'F', 0.5190476190476191), '<------>', 'M'),
  (('陈游

In [7]:
# calculating accuracy of the Good-Turing method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.955344517019629,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('武美啸', 'M', 0.7945392799222982), '<------>', 'F'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('施正漪', 'F', 0.7621473175264708), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('贾渝', 'F', 0.5544634224943603), '<------>', 'M'),
  (('卞郡', 'F', 0.5190476190476191), '<------>', 'M'),
  (('段常桃', 'F', 0.6131184187129176), '<------>', 'M'),
  (('余舁', 'F', 0.5190522056268192), '<------>', 'M'),
  (('姜明瑾', 'M', 0.5441911101311979), '<------>', 'F'),
  (('栾德慧', 'F', 0.5536782694266356), '<------>', 'M'),
  (('李旭彤', 'M', 0.5475283303662931), '<------>', 'F'),
  (('乔海云', 'M', 0.5714315674455618), '<------>', 'F'),
  (('王越M', 'Undefined', 0.7078066158956854), '<------>', 'F'),
  (('宋悠也', 'M', 0.6709454839161347), '<------>', 'F'),
  (('潘珑瑚', 'Undefined', 0.5830852073228867), '<------>', 'F'),
  (('游松菱', 'F', 0.754092362381194), '<------

## Test against first names

In [8]:
# load names as first names (column 1 of the file) instead of full names;
# this rebinds the module-level `names`/`genders` that `accuracy` reads

names, genders = test_set(full_name=False)
len(names), len(genders)

(2926486, 2926486)

**Repeat the above procedure**

In [9]:
# calculating accuracy of the Laplace method, including all genders ('U' records are kept)
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.8816987335664684,
 [('prediction', '<------>', 'observation'),
  (('晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('洪君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('佳故', 'M', 0.6799999999999999), '<------>', 'F'),
  (('苗利', 'M', 0.6120908683305932), '<------>', 'F'),
  (('美啸', 'M', 0.9722753346080306), '<------>', 'F'),
  (('文冰', 'F', 0.7104938856763675), '<------>', 'U'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('文会', 'M', 0.48157606712878515), '<------>', 'U'),
  (('乐懿', 'F', 0.5717213114754098), '<------>', 'U'),
  (('水荣', 'M', 0.7442228401572538), '<------>', 'U'),
  (('飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('云华', 'M', 0.6056708097239992), '<------>', 'U'),
  (('正漪', 'F', 0.9671931956257594), '<------>', 'M'),
  (('晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('君烨', 'M', 0.5495184444848265), '<------>', 'F'),
  (('丹彬', 'M', 0.8731220657276995), '<------>', 'F'),
  (('晨云', 'F', 0.

In [10]:
# calculating accuracy of the Laplace method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9049495387360695,
 [('prediction', '<------>', 'observation'),
  (('洪君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('佳故', 'M', 0.6799999999999999), '<------>', 'F'),
  (('苗利', 'M', 0.6120908683305932), '<------>', 'F'),
  (('美啸', 'M', 0.9722753346080306), '<------>', 'F'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('正漪', 'F', 0.9671931956257594), '<------>', 'M'),
  (('晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('君烨', 'M', 0.5495184444848265), '<------>', 'F'),
  (('丹彬', 'M', 0.8731220657276995), '<------>', 'F'),
  (('晨云', 'F', 0.47780957019359555), '<------>', 'M'),
  (('雪逸', 'M', 0.6708860759493671), '<------>', 'F'),
  (('兴敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('素卿', 'M', 0.5344497607655502), '<------>', 'F'),
  (('张昕', 'F', 0.5725490196078431), '<------>', 'M'),
  (('凯瑜', 'F', 0

In [11]:
# calculating accuracy of the Good-Turing method, including all genders ('U' records are kept)
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.8776351569766607,
 [('prediction', '<------>', 'observation'),
  (('晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('洪君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('佳故', 'M', 0.6887939560667892), '<------>', 'F'),
  (('德幸', 'F', 0.552486187845304), '<------>', 'M'),
  (('苗利', 'M', 0.6120908683305932), '<------>', 'F'),
  (('美啸', 'M', 0.9821908438599142), '<------>', 'F'),
  (('文冰', 'F', 0.7104938856763675), '<------>', 'U'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('文会', 'M', 0.48157606712878515), '<------>', 'U'),
  (('乐懿', 'F', 0.5571745562130178), '<------>', 'U'),
  (('水荣', 'M', 0.7442228401572538), '<------>', 'U'),
  (('云华', 'M', 0.6056708097239992), '<------>', 'U'),
  (('正漪', 'F', 0.9611189107631795), '<------>', 'M'),
  (('晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('君烨', 'M', 0.5495184444848265), '<------>', 'F'),
  (('丹彬', 'M', 0.9245587869748942), '<------>', 'F'),
  (('晨云', 'F', 0.47

In [12]:
# calculating accuracy of the Good-Turing method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9007788037439817,
 [('prediction', '<------>', 'observation'),
  (('洪君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('佳故', 'M', 0.6887939560667892), '<------>', 'F'),
  (('德幸', 'F', 0.552486187845304), '<------>', 'M'),
  (('苗利', 'M', 0.6120908683305932), '<------>', 'F'),
  (('美啸', 'M', 0.9821908438599142), '<------>', 'F'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('正漪', 'F', 0.9611189107631795), '<------>', 'M'),
  (('晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('君烨', 'M', 0.5495184444848265), '<------>', 'F'),
  (('丹彬', 'M', 0.9245587869748942), '<------>', 'F'),
  (('晨云', 'F', 0.47780957019359555), '<------>', 'M'),
  (('雪逸', 'M', 0.6708860759493671), '<------>', 'F'),
  (('兴敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('崇杜', 'F', 0.6783042394014962), '<------>', 'M'),
  (('素卿', 'M', 0.6513119533527697), '<------>', 'F'),
  (('张昕', 'F', 0.5

## Test against a random mix of first and full names

In [13]:
#loading test file
from random import random, seed


def test_set(file='data/train_ds.txt', seed_=0):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    seed(seed_)
    for line in f:
        line = line.split('\t')
        try:
            # line[2] = full name; line[1] = first name
            name = line[2] if random() >= 0.5 else line[1]
            names.append(name)
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Reload with the default seed and show the length of both lists; this
# rebinds the module-level `names`/`genders` that `accuracy` reads.
names, genders = test_set()
len(names), len(genders)

(2926486, 2926486)

In [14]:
# check the names: show the first 20 entries to confirm that first names
# and full names are mixed
names[:20]

['阎莹暂',
 '吕荣辉',
 '泽彬',
 '二庄',
 '华治权',
 '哧天',
 '荣彦成',
 '其荷',
 '志峰',
 '王治诚',
 '贾晔',
 '秋紫俊',
 '絮雨',
 '黄洪君',
 '戴世荣',
 '佳讯',
 '翟谨德',
 '蒋会川',
 '阮彦嘉',
 '耿艺贤']

**Repeat the above procedure**

In [15]:
# calculating accuracy of the Laplace method, including all genders ('U' records are kept)
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9097282542954246,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('美啸', 'M', 0.9722753346080306), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('乐懿', 'F', 0.5717213114754098), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('云华', 'M', 0.6056708097239992), '<------>', 'U'),
  (('正漪', 'F', 0.9671931956257594), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('丹彬', 'M', 0.8731220657276995), '<------>', 'F'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('吴宝懿', 

In [16]:
# calculating accuracy of the Laplace method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9337182109468817,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('美啸', 'M', 0.9722753346080306), '<------>', 'F'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('正漪', 'F', 0.9671931956257594), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('丹彬', 'M', 0.8731220657276995), '<------>', 'F'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('凯瑜', 'F', 0.7490205116386265), '<------>', 'M'),
  (('一诺', 'F', 0.6630982367758185), '<------>', 'M'),
  (('卞郡', 'F', 0.5093457943925234), '<------>', 'M'),
  (('美银', 'M', 0.5146726862302483), '<------>', 'F'),
  (('俊君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('段常桃', 'F', 0.6254946956457782), '<------>', 'M'),
  (('孝涵', 'F', 0.6990136671237119), '<------>', 'M'),
  (('德慧', 'F'

In [17]:
# calculating accuracy of the Good-Turing method, including all genders ('U' records are kept)
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9042267757303469,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('德幸', 'F', 0.552486187845304), '<------>', 'M'),
  (('美啸', 'M', 0.9821908438599142), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('乐懿', 'F', 0.5571745562130178), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('云华', 'M', 0.6056708097239992), '<------>', 'U'),
  (('正漪', 'F', 0.9611189107631795), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('丹彬', 'M', 0.9245587869748942), '<------>', 'F'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('吴宝懿', 'M

In [18]:
# calculating accuracy of the Good-Turing method, excluding records whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row, so at most 19 real mismatches are shown)
accu, mismatch[:20]

(0.9280716558365039,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('德幸', 'F', 0.552486187845304), '<------>', 'M'),
  (('美啸', 'M', 0.9821908438599142), '<------>', 'F'),
  (('斐然', 'M', 0.5373563218390804), '<------>', 'F'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('正漪', 'F', 0.9611189107631795), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('丹彬', 'M', 0.9245587869748942), '<------>', 'F'),
  (('树君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('雪嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('崇杜', 'F', 0.6783042394014962), '<------>', 'M'),
  (('凯瑜', 'F', 0.7490205116386265), '<------>', 'M'),
  (('一诺', 'F', 0.6673003802281369), '<------>', 'M'),
  (('贾渝', 'F', 0.5544634224943603), '<------>', 'M'),
  (('卞郡', 'F', 0.5190476190476191), '<------>', 'M'),
  (('美银', 'M', 0.5146726862302483), '<------>', 'F'),
  (('俊君', 'F', 0.5167110483125917), '<------>', 'M'),
  (('家荥', 'F', 0