In [1]:
from gender import Gender

In [2]:
#loading test file

def test_set(file='data/test_ds.txt', full_name=True):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    for line in f:
        line = line.split('\t')
        try:
            if full_name:
            # line[2] = full name; line[1] = first name
                names.append(line[2])
            else:
                names.append(line[1])
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Load the test set using full names (the default) and show both list lengths.
names, genders = test_set()
len(names), len(genders)

(731622, 731622)

In [3]:
# calculating the prediction accuracy based on full names

def accuracy(method='lap', include_U=True):
    """Score the gender predictor against the module-level test set.

    Reads the global ``names`` and ``genders`` lists and predicts every name
    with ``Gender().predict(name, method, False)``. A prediction counts as
    correct when its element at index 1 equals the observed label.

    Args:
        method: method identifier forwarded unchanged to the predictor.
        include_U: when False, entries whose observed label is 'U' are left
            out of both the comparison and the sample size.

    Returns:
        ``(score, mismatch)``: the share of correct predictions among the
        evaluated entries, and a list that starts with a header tuple
        followed by one ``(prediction, '<------>', observation)`` tuple per
        wrong prediction.
    """
    global names, genders
    misses = [('prediction', '<------>', 'observation')]
    hits = 0
    predictor = Gender().predict
    evaluated = len(names)
    for position, name in enumerate(names):
        # Entries with an unknown observed gender are dropped on request.
        if not include_U and genders[position] == 'U':
            evaluated -= 1
            continue
        # NOTE(review): meaning of the third positional argument (False) is
        # not visible here -- confirm against Gender.predict.
        guess = predictor(name, method, False)
        observed = genders[position]
        # NOTE(review): a predicted 'Undefined' never equals an observed 'U',
        # so such pairs are counted as mismatches -- confirm this is intended.
        if guess[1] == observed:
            hits += 1
        else:
            misses.append((guess, '<------>', observed))
    return hits / evaluated, misses

## Test against full names

In [4]:
# used to be: 0.9371465046157715 (trained on first names)

# calculating accuracy of the Laplace method, including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9207117883278524,
 [('prediction', '<------>', 'observation'),
  (('张继燕', 'M', 0.5078611344330047), '<------>', 'F'),
  (('万宇烟', 'F', 0.5654871460207109), '<------>', 'M'),
  (('阮涵之', 'F', 0.5438453662196723), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5893905285861586), '<------>', 'F'),
  (('付睿', 'M', 0.712606419452691), '<------>', 'U'),
  (('班柳淳', 'M', 0.5678185646975151), '<------>', 'F'),
  (('郭连', 'M', 0.5720968653852523), '<------>', 'U'),
  (('游丁', 'M', 0.5942377411017877), '<------>', 'U'),
  (('贝学敏', 'F', 0.65171017063093), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('梅必霏', 'F', 0.8899741213824529), '<------>', 'M'),
  (('林乔', 'M', 0.6775205089690965), '<------>', 'U'),
  (('闫韶华', 'Undefined', 0.5292034726456268), '<------>', 'U'),
  (('季纯融', 'F', 0.5706811701507302), '<------>', 'M'),
  (('盛建梅', 'M', 0.597223876622354), '<------>', 'F'),
  (('奚雷筠', 'F', 0.8758288485000173), '<------>', 'M'),
  (('连国萌', 'F', 0.6067968034998574), '<------>', 'M'),

In [5]:
# used to be: 0.9623151018895784 (trained on first names)


# calculating accuracy of the Laplace method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9454390045011347,
 [('prediction', '<------>', 'observation'),
  (('张继燕', 'M', 0.5078611344330047), '<------>', 'F'),
  (('万宇烟', 'F', 0.5654871460207109), '<------>', 'M'),
  (('阮涵之', 'F', 0.5438453662196723), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5893905285861586), '<------>', 'F'),
  (('班柳淳', 'M', 0.5678185646975151), '<------>', 'F'),
  (('贝学敏', 'F', 0.65171017063093), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('梅必霏', 'F', 0.8899741213824529), '<------>', 'M'),
  (('季纯融', 'F', 0.5706811701507302), '<------>', 'M'),
  (('盛建梅', 'M', 0.597223876622354), '<------>', 'F'),
  (('奚雷筠', 'F', 0.8758288485000173), '<------>', 'M'),
  (('连国萌', 'F', 0.6067968034998574), '<------>', 'M'),
  (('武亭', 'M', 0.5524695604437723), '<------>', 'F'),
  (('汤云瑞', 'F', 0.45723945764495905), '<------>', 'M'),
  (('王敏利', 'Undefined', 0.5732817087884837), '<------>', 'F'),
  (('梁颜', 'M', 0.5044711197801947), '<------>', 'F'),
  (('司空嘉桐', 'F', 0.5071564461314124), '<------>', 

In [6]:
# used to be: 0.9300936822566844 (trained on first names)

# calculating accuracy of the Good-Turing method, including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.91423303290497,
 [('prediction', '<------>', 'observation'),
  (('钱穗君', 'M', 0.423255170201506), '<------>', 'F'),
  (('张继燕', 'M', 0.5147383902924803), '<------>', 'F'),
  (('万宇烟', 'F', 0.7834142743741946), '<------>', 'M'),
  (('阮涵之', 'F', 0.5459413805222921), '<------>', 'M'),
  (('卞佳臻', 'M', 0.7434085763069487), '<------>', 'F'),
  (('付睿', 'M', 0.712606419452691), '<------>', 'U'),
  (('班柳淳', 'M', 0.5680285349769285), '<------>', 'F'),
  (('纪嵫', 'F', 0.5140797010713717), '<------>', 'M'),
  (('郭连', 'M', 0.5720968653852523), '<------>', 'U'),
  (('游丁', 'M', 0.605408830931151), '<------>', 'U'),
  (('贝学敏', 'F', 0.6493241711705547), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('梅必霏', 'F', 0.9151787042813125), '<------>', 'M'),
  (('林乔', 'M', 0.6775205089690965), '<------>', 'U'),
  (('邵子奕', 'F', 0.4869604261972831), '<------>', 'M'),
  (('闫韶华', 'Undefined', 0.4836621778927178), '<------>', 'U'),
  (('许刀鸾', 'M', 0.6512152768217652), '<------>', 'F'),
 

In [7]:
# used to be: 0.9550728644873521 (trained on first names)


# calculating accuracy of the Good-Turing method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9387862515386246,
 [('prediction', '<------>', 'observation'),
  (('钱穗君', 'M', 0.423255170201506), '<------>', 'F'),
  (('张继燕', 'M', 0.5147383902924803), '<------>', 'F'),
  (('万宇烟', 'F', 0.7834142743741946), '<------>', 'M'),
  (('阮涵之', 'F', 0.5459413805222921), '<------>', 'M'),
  (('卞佳臻', 'M', 0.7434085763069487), '<------>', 'F'),
  (('班柳淳', 'M', 0.5680285349769285), '<------>', 'F'),
  (('纪嵫', 'F', 0.5140797010713717), '<------>', 'M'),
  (('贝学敏', 'F', 0.6493241711705547), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('梅必霏', 'F', 0.9151787042813125), '<------>', 'M'),
  (('邵子奕', 'F', 0.4869604261972831), '<------>', 'M'),
  (('许刀鸾', 'M', 0.6512152768217652), '<------>', 'F'),
  (('季纯融', 'F', 0.5715775824437949), '<------>', 'M'),
  (('白榇译', 'M', 0.5138879955890765), '<------>', 'F'),
  (('盛建梅', 'M', 0.5999932821925059), '<------>', 'F'),
  (('钭奕渝', 'F', 0.9298705240593117), '<------>', 'M'),
  (('奚雷筠', 'F', 0.702070718579464), '<------>', 'M'),
  

## Test against first names

In [8]:
# reload the test set using first names only (full_name=False) and show both list lengths

names, genders = test_set(full_name=False)
len(names), len(genders)

(731622, 731622)

**Repeat the above procedure**

In [9]:
# corresponding full name model = 0.9207117883278524

# calculating accuracy of the Laplace method, including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9268215007203173,
 [('prediction', '<------>', 'observation'),
  (('宇烟', 'F', 0.613008130470299), '<------>', 'M'),
  (('佳臻', 'M', 0.5110809349392665), '<------>', 'F'),
  (('睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('雁林', 'F', 0.4971546052901239), '<------>', 'M'),
  (('连', 'M', 0.5639068463275956), '<------>', 'U'),
  (('丁', 'M', 0.5716382443388314), '<------>', 'U'),
  (('雨杨', 'F', 0.5171689917383303), '<------>', 'M'),
  (('必霏', 'F', 0.5628201585717663), '<------>', 'M'),
  (('乔', 'M', 0.5002984056611817), '<------>', 'U'),
  (('韶华', 'M', 0.6117234847115117), '<------>', 'U'),
  (('纯融', 'F', 0.5276217216421958), '<------>', 'M'),
  (('建梅', 'M', 0.5072885264113434), '<------>', 'F'),
  (('思宁', 'F', 0.5542480753756921), '<------>', 'M'),
  (('奕渝', 'F', 0.486317054198091), '<------>', 'M'),
  (('雷筠', 'F', 0.8898538716579548), '<------>', 'M'),
  (('国萌', 'F', 0.6109439118463438), '<------>', 'M'),
  (('畏', 'M', 0.78125), '<------>', 'U'),
  (('云夕', 'F', 0.47663201243602304

In [10]:
# corresponding full name model = 0.9454390045011347

# calculating accuracy of the Laplace method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9517128031809703,
 [('prediction', '<------>', 'observation'),
  (('宇烟', 'F', 0.613008130470299), '<------>', 'M'),
  (('佳臻', 'M', 0.5110809349392665), '<------>', 'F'),
  (('雁林', 'F', 0.4971546052901239), '<------>', 'M'),
  (('雨杨', 'F', 0.5171689917383303), '<------>', 'M'),
  (('必霏', 'F', 0.5628201585717663), '<------>', 'M'),
  (('纯融', 'F', 0.5276217216421958), '<------>', 'M'),
  (('建梅', 'M', 0.5072885264113434), '<------>', 'F'),
  (('思宁', 'F', 0.5542480753756921), '<------>', 'M'),
  (('奕渝', 'F', 0.486317054198091), '<------>', 'M'),
  (('雷筠', 'F', 0.8898538716579548), '<------>', 'M'),
  (('国萌', 'F', 0.6109439118463438), '<------>', 'M'),
  (('云夕', 'F', 0.47663201243602304), '<------>', 'M'),
  (('嘉桐', 'F', 0.49647584866570316), '<------>', 'M'),
  (('乔楚', 'F', 0.6271522217742468), '<------>', 'M'),
  (('寒宫', 'F', 0.5741479704400875), '<------>', 'M'),
  (('蕙全', 'F', 0.9188883716698855), '<------>', 'M'),
  (('琳伟', 'F', 0.7119658359552217), '<------>', 'M'),
  (('若熙', 'F', 0

In [11]:
# corresponding full name model = 0.91423303290497


# calculating accuracy of the Good-Turing method, including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.920558703811531,
 [('prediction', '<------>', 'observation'),
  (('宇烟', 'F', 0.7961985689456496), '<------>', 'M'),
  (('佳臻', 'M', 0.6743927726548445), '<------>', 'F'),
  (('睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('雁林', 'F', 0.4971546052901239), '<------>', 'M'),
  (('连', 'M', 0.5639068463275956), '<------>', 'U'),
  (('丁', 'M', 0.5716382443388314), '<------>', 'U'),
  (('雨杨', 'F', 0.5171689917383303), '<------>', 'M'),
  (('必霏', 'F', 0.6319644944483971), '<------>', 'M'),
  (('乔', 'M', 0.5002984056611817), '<------>', 'U'),
  (('韶华', 'M', 0.6365399569496712), '<------>', 'U'),
  (('刀鸾', 'M', 0.6437430130782102), '<------>', 'F'),
  (('纯融', 'F', 0.5285534138911746), '<------>', 'M'),
  (('榇译', 'M', 0.5846302505197468), '<------>', 'F'),
  (('建梅', 'M', 0.5096760450675578), '<------>', 'F'),
  (('思宁', 'F', 0.5542480753756921), '<------>', 'M'),
  (('奕渝', 'F', 0.7007850699524022), '<------>', 'M'),
  (('雷筠', 'F', 0.7311327420073551), '<------>', 'M'),
  (('国萌', 'F', 0.7568

In [12]:
# corresponding full name model = 0.9387862515386246

# calculating accuracy of the Good-Turing method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9452818086505438,
 [('prediction', '<------>', 'observation'),
  (('宇烟', 'F', 0.7961985689456496), '<------>', 'M'),
  (('佳臻', 'M', 0.6743927726548445), '<------>', 'F'),
  (('雁林', 'F', 0.4971546052901239), '<------>', 'M'),
  (('雨杨', 'F', 0.5171689917383303), '<------>', 'M'),
  (('必霏', 'F', 0.6319644944483971), '<------>', 'M'),
  (('刀鸾', 'M', 0.6437430130782102), '<------>', 'F'),
  (('纯融', 'F', 0.5285534138911746), '<------>', 'M'),
  (('榇译', 'M', 0.5846302505197468), '<------>', 'F'),
  (('建梅', 'M', 0.5096760450675578), '<------>', 'F'),
  (('思宁', 'F', 0.5542480753756921), '<------>', 'M'),
  (('奕渝', 'F', 0.7007850699524022), '<------>', 'M'),
  (('雷筠', 'F', 0.7311327420073551), '<------>', 'M'),
  (('国萌', 'F', 0.7568867258219952), '<------>', 'M'),
  (('云夕', 'F', 0.4833743005472063), '<------>', 'M'),
  (('嘉桐', 'F', 0.49647584866570316), '<------>', 'M'),
  (('乔楚', 'F', 0.6271522217742468), '<------>', 'M'),
  (('寒宫', 'F', 0.5755866213099856), '<------>', 'M'),
  (('蕙全', 'F', 

## Test against a random mix of first and full names

In [13]:
#loading test file
from random import random, seed


def test_set(file='data/test_ds.txt', seed_=0):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    seed(seed_)
    for line in f:
        line = line.split('\t')
        try:
            # line[2] = full name; line[1] = first name
            name = line[2] if random() >= 0.5 else line[1]
            names.append(name)
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Load the test set with a random mix of first and full names (default seed_=0) and show both list lengths.
names, genders = test_set()
len(names), len(genders)

(731622, 731622)

In [14]:
# check the first 20 names: a mix of first names and full names is expected
names[:20]

['冯瑞琳',
 '曹凯棋',
 '义祥',
 '识闻',
 '钮缤鲃',
 '颖函',
 '尹唱又',
 '竺歌',
 '博辰',
 '舒梅云',
 '钱穗君',
 '张继燕',
 '维娜',
 '冯梦饶',
 '曹瑞鹏',
 '倩璐',
 '姚蕴珈',
 '薛俊英',
 '李倩雅',
 '肖井刚']

**Repeat the above procedure**

In [15]:
# corresponding full name model = 0.9207117883278524
# corresponding first name model = 0.9268215007203173

# calculating accuracy of the Laplace method, including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9238910256935958,
 [('prediction', '<------>', 'observation'),
  (('张继燕', 'M', 0.5078611344330047), '<------>', 'F'),
  (('万宇烟', 'F', 0.5654871460207109), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5893905285861586), '<------>', 'F'),
  (('付睿', 'M', 0.712606419452691), '<------>', 'U'),
  (('班柳淳', 'M', 0.5678185646975151), '<------>', 'F'),
  (('郭连', 'M', 0.5720968653852523), '<------>', 'U'),
  (('丁', 'M', 0.5716382443388314), '<------>', 'U'),
  (('贝学敏', 'F', 0.65171017063093), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('必霏', 'F', 0.5628201585717663), '<------>', 'M'),
  (('乔', 'M', 0.5002984056611817), '<------>', 'U'),
  (('闫韶华', 'Undefined', 0.5292034726456268), '<------>', 'U'),
  (('纯融', 'F', 0.5276217216421958), '<------>', 'M'),
  (('建梅', 'M', 0.5072885264113434), '<------>', 'F'),
  (('奚雷筠', 'F', 0.8758288485000173), '<------>', 'M'),
  (('连国萌', 'F', 0.6067968034998574), '<------>', 'M'),
  (('刘畏', 'M', 0.6206374916510611), '<------>', 'U'),
  ((

In [16]:
# corresponding full name model = 0.9454390045011347
# corresponding first name model = 0.9517128031809703

# calculating accuracy of the Laplace method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9487036254696577,
 [('prediction', '<------>', 'observation'),
  (('张继燕', 'M', 0.5078611344330047), '<------>', 'F'),
  (('万宇烟', 'F', 0.5654871460207109), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5893905285861586), '<------>', 'F'),
  (('班柳淳', 'M', 0.5678185646975151), '<------>', 'F'),
  (('贝学敏', 'F', 0.65171017063093), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('必霏', 'F', 0.5628201585717663), '<------>', 'M'),
  (('纯融', 'F', 0.5276217216421958), '<------>', 'M'),
  (('建梅', 'M', 0.5072885264113434), '<------>', 'F'),
  (('奚雷筠', 'F', 0.8758288485000173), '<------>', 'M'),
  (('连国萌', 'F', 0.6067968034998574), '<------>', 'M'),
  (('云夕', 'F', 0.47663201243602304), '<------>', 'M'),
  (('武亭', 'M', 0.5524695604437723), '<------>', 'F'),
  (('王敏利', 'Undefined', 0.5732817087884837), '<------>', 'F'),
  (('司空嘉桐', 'F', 0.5071564461314124), '<------>', 'M'),
  (('诸乔楚', 'F', 0.5988092872838239), '<------>', 'M'),
  (('寒宫', 'F', 0.5741479704400875), '<------>', 'M'

In [17]:
# corresponding full name model = 0.91423303290497
# corresponding first name model = 0.920558703811531

# calculating accuracy of the Good-Turing method, including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9174628428341411,
 [('prediction', '<------>', 'observation'),
  (('钱穗君', 'M', 0.423255170201506), '<------>', 'F'),
  (('张继燕', 'M', 0.5147383902924803), '<------>', 'F'),
  (('万宇烟', 'F', 0.7834142743741946), '<------>', 'M'),
  (('卞佳臻', 'M', 0.7434085763069487), '<------>', 'F'),
  (('付睿', 'M', 0.712606419452691), '<------>', 'U'),
  (('班柳淳', 'M', 0.5680285349769285), '<------>', 'F'),
  (('郭连', 'M', 0.5720968653852523), '<------>', 'U'),
  (('丁', 'M', 0.5716382443388314), '<------>', 'U'),
  (('贝学敏', 'F', 0.6493241711705547), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('必霏', 'F', 0.6319644944483971), '<------>', 'M'),
  (('乔', 'M', 0.5002984056611817), '<------>', 'U'),
  (('邵子奕', 'F', 0.4869604261972831), '<------>', 'M'),
  (('闫韶华', 'Undefined', 0.4836621778927178), '<------>', 'U'),
  (('刀鸾', 'M', 0.6437430130782102), '<------>', 'F'),
  (('纯融', 'F', 0.5285534138911746), '<------>', 'M'),
  (('榇译', 'M', 0.5846302505197468), '<------>', 'F'),
  (

In [18]:
# corresponding full name model = 0.9387862515386246
# corresponding first name model = 0.9452818086505438

# calculating accuracy of the Good-Turing method, excluding entries whose observed gender is 'U'
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first entries of the mismatch list (header row included)
accu, mismatch[:20]

(0.9421028032792177,
 [('prediction', '<------>', 'observation'),
  (('钱穗君', 'M', 0.423255170201506), '<------>', 'F'),
  (('张继燕', 'M', 0.5147383902924803), '<------>', 'F'),
  (('万宇烟', 'F', 0.7834142743741946), '<------>', 'M'),
  (('卞佳臻', 'M', 0.7434085763069487), '<------>', 'F'),
  (('班柳淳', 'M', 0.5680285349769285), '<------>', 'F'),
  (('贝学敏', 'F', 0.6493241711705547), '<------>', 'M'),
  (('童雨杨', 'F', 0.5681692397642859), '<------>', 'M'),
  (('必霏', 'F', 0.6319644944483971), '<------>', 'M'),
  (('邵子奕', 'F', 0.4869604261972831), '<------>', 'M'),
  (('刀鸾', 'M', 0.6437430130782102), '<------>', 'F'),
  (('纯融', 'F', 0.5285534138911746), '<------>', 'M'),
  (('榇译', 'M', 0.5846302505197468), '<------>', 'F'),
  (('建梅', 'M', 0.5096760450675578), '<------>', 'F'),
  (('钭奕渝', 'F', 0.9298705240593117), '<------>', 'M'),
  (('奚雷筠', 'F', 0.702070718579464), '<------>', 'M'),
  (('连国萌', 'F', 0.7536643578050461), '<------>', 'M'),
  (('云夕', 'F', 0.4833743005472063), '<------>', 'M'),
  (('武亭