In [1]:
from gender import Gender

In [2]:
#loading test file

def test_set(file='data/test_ds.txt', full_name=True):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    for line in f:
        line = line.split('\t')
        try:
            if full_name:
            # line[2] = full name; line[1] = first name
                names.append(line[2])
            else:
                names.append(line[1])
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Load the test set with the default arguments (full names) and show the
# length of both lists; accuracy() reads these two globals.
names, genders = test_set()
len(names), len(genders)

(731622, 731622)

In [3]:
# calculating the prediction accuracy based on full names

def accuracy(method='lap', include_U=True):
    """Score ``Gender().predict`` against the global ``names``/``genders`` lists.

    For every evaluated index the prediction is obtained with
    ``predict(name, method, False)`` and its element ``[1]`` is compared
    with the observed label.

    Parameters
    ----------
    method : str
        Passed through unchanged as the second argument of ``predict``.
        # NOTE(review): the notebook uses 'lap' and 'gt' — confirm the
        # accepted values against the Gender class.
    include_U : bool
        If true every sample is evaluated. Otherwise samples whose observed
        label equals 'U' are skipped and removed from the denominator.

    Returns
    -------
    tuple
        ``(hits / evaluated_samples, mismatch)`` where ``mismatch`` is a list
        starting with a header row followed by one
        ``(prediction, '<------>', observation)`` tuple per wrong prediction.
    """
    global names, genders

    hits = 0
    mismatch = [('prediction', '<------>', 'observation')]
    predict = Gender().predict
    evaluated = len(names)

    for idx in range(len(names)):
        # Unlabelled samples are dropped from both the loop and the denominator.
        if not include_U and genders[idx] == 'U':
            evaluated -= 1
            continue

        guess = predict(names[idx], method, False)
        observed = genders[idx]
        if guess[1] == observed:
            hits += 1
        else:
            mismatch.append((guess, '<------>', genders[idx]))

    return hits / evaluated, mismatch

## Test against full names

In [4]:
# calculating accuracy of the Laplace method (default method='lap'), including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9371465046157715,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('游丁', 'M', 0.7563559322033898), '<------>', 'U'),
  (('梅必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('林乔', 'M', 0.49097162510748066), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.4845344880455854), '<------>', 'M'),
  (('连国萌', 'F', 0.5410949215661441), '<------>', 'M'),
  (('刘畏', 'M', 0.7812499999999999), '<------>', 'U'),
  (('巢云夕', 'F', 0.48294370286325383), '<------>', 'M'),
  (('南英', 'F', 0.8497095739063325), '<------>', 'U'),
  (('任善竺', 'F', 0.515157089128229), '<------>', 'M'),
  (('容庚', 'M', 0.8711217183770882), '<------>', 'U'),
  (('

In [5]:
# calculating accuracy of the Laplace method (default method='lap'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9623151018895784,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('梅必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.4845344880455854), '<------>', 'M'),
  (('连国萌', 'F', 0.5410949215661441), '<------>', 'M'),
  (('巢云夕', 'F', 0.48294370286325383), '<------>', 'M'),
  (('任善竺', 'F', 0.515157089128229), '<------>', 'M'),
  (('沙洪柳', 'F', 0.573392514338241), '<------>', 'M'),
  (('上官李那', 'F', 0.5988431816367817), '<------>', 'M'),
  (('任彦米', 'F', 0.6001639533890436), '<------>', 'M'),
  (('司空嘉桐', 'F', 0.4947270082700448), '<------>', 'M'),
  (('尤星冉', 'F', 0.5107737217938496), '<------>', 'M'),
  (('诸乔楚', 'F', 0.6450424097367288), '<------>', 'M'),
  (('苗蕙全', 'F', 0.6018003286852724), '<------>', 'M'),
  (('纪会会', 'Undefined', 0.3400954333668674), '<------

In [6]:
# calculating accuracy of the Good-Turing method (method='gt'), including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9300936822566844,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('游丁', 'M', 0.8612786489746682), '<------>', 'U'),
  (('林乔', 'M', 0.49097162510748066), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('白榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.5743242655385526), '<------>', 'M'),
  (('司敬园', 'M', 0.5393666618020849), '<------>', 'F'),
  (('连国萌', 'F', 0.7766702031184257), '<------>', 'M'),
  (('刘畏', 'M', 0.8627366736693435), '<------>', 'U'),
  (('巢云夕', 'F', 0.5254527428157781), '<------>', 'M'),
  (('南英', 'F', 0.8497095739063325), '<------>', 'U'),
  (('任善竺', 'F', 0.6508855569711229), '<------>', 'M'),
  (('

In [7]:
# calculating accuracy of the Good-Turing method (method='gt'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9550728644873521,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('白榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.5743242655385526), '<------>', 'M'),
  (('司敬园', 'M', 0.5393666618020849), '<------>', 'F'),
  (('连国萌', 'F', 0.7766702031184257), '<------>', 'M'),
  (('巢云夕', 'F', 0.5254527428157781), '<------>', 'M'),
  (('任善竺', 'F', 0.6508855569711229), '<------>', 'M'),
  (('任彦米', 'F', 0.810353129785596), '<------>', 'M'),
  (('司空嘉桐', 'F', 0.5042679439195341), '<------>', 'M'),
  (('尤星冉', 'F', 0.5107737217938496), '<------>', 'M'),
  (('诸乔楚', 'F', 0.63959665777796), '<------>', 'M'),
  (('谷寒宫', 'F', 0.6038967915822041), '<------>', 'M'),
  (('纪会会', 'Undefined', 0.3400954333668674), '<------>', 'F'),
  (('毕琳伟', 'F', 0.7103688853484734), '<------>', 

## Test against first names

In [8]:
# Reload the test set using first names only (full_name=False). This rebinds
# the global names/genders that accuracy() reads, so the cells below score
# first names.

names, genders = test_set(full_name=False)
len(names), len(genders)

(731622, 731622)

**Repeat the above procedure**

In [9]:
# calculating accuracy of the Laplace method (default method='lap'), including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.8806200469641426,
 [('prediction', '<------>', 'observation'),
  (('唱又', 'M', 0.5145985401459855), '<------>', 'F'),
  (('梦饶', 'M', 0.7037037037037037), '<------>', 'F'),
  (('佳嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('艳砚', 'M', 0.543956043956044), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5717213114754098), '<------>', 'M'),
  (('宇烟', 'F', 0.8636363636363636), '<------>', 'M'),
  (('佳臻', 'M', 0.7533197139938713), '<------>', 'F'),
  (('睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('柳淳', 'M', 0.7773780975219824), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5118628141883956), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('连', 'M', 0.5593406362595233), '<------>', 'U'),
  (('丁', 'M', 0.5593406362595233), '<------>', 'U'),
  (('学敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('凤荣', 'M', 0.7442

In [10]:
# calculating accuracy of the Laplace method (default method='lap'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.904270534058867,
 [('prediction', '<------>', 'observation'),
  (('唱又', 'M', 0.5145985401459855), '<------>', 'F'),
  (('梦饶', 'M', 0.7037037037037037), '<------>', 'F'),
  (('佳嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('艳砚', 'M', 0.543956043956044), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5717213114754098), '<------>', 'M'),
  (('宇烟', 'F', 0.8636363636363636), '<------>', 'M'),
  (('佳臻', 'M', 0.7533197139938713), '<------>', 'F'),
  (('柳淳', 'M', 0.7773780975219824), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5118628141883956), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('学敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('邦皖', 'F', 0.480

In [11]:
# calculating accuracy of the Good-Turing method (method='gt'), including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.8764402929381566,
 [('prediction', '<------>', 'observation'),
  (('梦饶', 'M', 0.6841197868278862), '<------>', 'F'),
  (('佳嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('艳砚', 'M', 0.5663615560640732), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5571745562130178), '<------>', 'M'),
  (('宇烟', 'F', 0.8825944809870393), '<------>', 'M'),
  (('佳臻', 'M', 0.7543310106757015), '<------>', 'F'),
  (('睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('柳淳', 'M', 0.7780327592740747), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5255034366333052), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('连', 'M', 0.5593406362595233), '<------>', 'U'),
  (('丁', 'M', 0.5593406362595233), '<------>', 'U'),
  (('学敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593

In [12]:
# calculating accuracy of the Good-Turing method (method='gt'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.8999785259239818,
 [('prediction', '<------>', 'observation'),
  (('梦饶', 'M', 0.6841197868278862), '<------>', 'F'),
  (('佳嘉', 'M', 0.5583882187523126), '<------>', 'F'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('艳砚', 'M', 0.5663615560640732), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5571745562130178), '<------>', 'M'),
  (('宇烟', 'F', 0.8825944809870393), '<------>', 'M'),
  (('佳臻', 'M', 0.7543310106757015), '<------>', 'F'),
  (('柳淳', 'M', 0.7780327592740747), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5255034366333052), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('学敏', 'F', 0.6382878398910253), '<------>', 'M'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('邦皖', 'F', 0.6098446348192246), '<------>', 'M'),
  (('博函', 'F', 0.7854653989519744), '<------>', 'M'),
  (('榇译', 'M', 0.5

## Test against a random mix of first and full names

In [13]:
#loading test file
from random import random, seed


def test_set(file='data/test_ds.txt', seed_=0):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    seed(seed_)
    for line in f:
        line = line.split('\t')
        try:
            # line[2] = full name; line[1] = first name
            name = line[2] if random() >= 0.5 else line[1]
            names.append(name)
            genders.append(line[3].strip())
        except:
            pass
    return names, genders


# Load the randomly mixed first/full-name test set (default seed_=0) and show
# the length of both lists; accuracy() reads these two globals.
names, genders = test_set()
len(names), len(genders)

(731622, 731622)

In [14]:
# Preview the first 20 loaded names to confirm that first names and full names are mixed
names[:20]

['冯瑞琳',
 '曹凯棋',
 '义祥',
 '识闻',
 '钮缤鲃',
 '颖函',
 '尹唱又',
 '竺歌',
 '博辰',
 '舒梅云',
 '钱穗君',
 '张继燕',
 '维娜',
 '冯梦饶',
 '曹瑞鹏',
 '倩璐',
 '姚蕴珈',
 '薛俊英',
 '李倩雅',
 '肖井刚']

**Repeat the above procedure**

In [15]:
# calculating accuracy of the Laplace method (default method='lap'), including all genders
accu, mismatch = accuracy()
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9090226920458926,
 [('prediction', '<------>', 'observation'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5717213114754098), '<------>', 'M'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5118628141883956), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('丁', 'M', 0.5593406362595233), '<------>', 'U'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('乔', 'M', 0.5593406362595233), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('李思宁', 'F', 

In [16]:
# calculating accuracy of the Laplace method (default method='lap'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9334359784810109,
 [('prediction', '<------>', 'observation'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5717213114754098), '<------>', 'M'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5118628141883956), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('丽东', 'M', 0.9267227938883693), '<------>', 'F'),
  (('钭奕渝', 'F', 0.4845344880455854), '<------>', 'M'),
  (('烟', 'M', 0.5593406362595233), '<------>', 'F'),
  (('舜玉', 'F', 0.5987675363838993), '<------>', 'M'),
  (('连国萌', 'F'

In [17]:
# calculating accuracy of the Good-Turing method (method='gt'), including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9034337403741276,
 [('prediction', '<------>', 'observation'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5571745562130178), '<------>', 'M'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5255034366333052), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('丁', 'M', 0.5593406362595233), '<------>', 'U'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('乔', 'M', 0.5593406362595233), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0

In [18]:
# calculating accuracy of the Good-Turing method (method='gt'),
# skipping samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and the first 20 entries of the mismatch list (entry 0 is the header row)
accu, mismatch[:20]

(0.9276969264000606,
 [('prediction', '<------>', 'observation'),
  (('玉墨', 'M', 0.5746924428822495), '<------>', 'F'),
  (('凤辰', 'M', 0.7418631949631843), '<------>', 'F'),
  (('子懿', 'F', 0.5571745562130178), '<------>', 'M'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('丽清', 'M', 0.601304347826087), '<------>', 'F'),
  (('舒童', 'M', 0.5255034366333052), '<------>', 'F'),
  (('秋华', 'M', 0.6056708097239992), '<------>', 'F'),
  (('凤荣', 'M', 0.7442228401572538), '<------>', 'F'),
  (('苏', 'M', 0.5593406362595233), '<------>', 'F'),
  (('榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('丽东', 'M', 0.9267227938883693), '<------>', 'F'),
  (('钭奕渝', 'F', 0.5743242655385526), '<------>', 'M'),
  (('司敬园', 'M', 0.5393666618020849), '<------>', 'F'),
  (('烟', 'M', 0.5593406362595233), '<------>', 'F'),
  (('舜玉', 'F',