In [1]:
from gender import Gender

In [2]:
#loading test file

def test_set(file='data/test_ds.txt'):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    for line in f:
        line = line.split('\t')
        # line[2] = full name; line[1] = first name
        names.append(line[2])
        genders.append(line[3].strip())
    
    return names, genders


# Load the test data once at module level; `accuracy` below reads these
# two parallel lists directly.
names, genders = test_set()
# Show both lengths as a quick check on the loaded data.
len(names), len(genders)

(731622, 731622)

In [3]:
# calculating the prediction accuracy 

def accuracy(method='lap', include_U=True):
    """Compute the prediction accuracy over the module-level test data.

    Each entry of the module-level ``names`` list is passed to
    ``Gender().predict`` and the second element of the returned prediction is
    compared against the matching entry of ``genders``.

    Args:
        method: Method identifier forwarded unchanged to ``predict``
            (the notebook uses 'lap' and 'gt').
        include_U: When True every sample is scored. When False, samples whose
            observed gender is 'U' are skipped and do not count towards the
            sample size.

    Returns:
        A ``(score, mismatch)`` tuple. ``score`` is the fraction of scored
        samples whose prediction matched the observation, or NaN when no
        sample was scored. ``mismatch`` is a list whose first element is a
        header row, followed by one ``(prediction, '<------>', observation)``
        tuple per wrong prediction.
    """
    # `names` and `genders` are only read here, so no `global` statement
    # is required.
    predict = Gender().predict
    right = 0
    mismatch = [('prediction', '<------>', 'observation')]
    sample_size = 0
    for name, real in zip(names, genders):
        # Optionally leave out samples with an undefined observed gender.
        if not include_U and real == 'U':
            continue
        sample_size += 1
        # NOTE(review): the meaning of the third positional argument (False)
        # is defined by Gender.predict and is not visible here.
        pred = predict(name, method, False)
        if pred[1] == real:
            right += 1
        else:
            mismatch.append((pred, '<------>', real))

    # Avoid ZeroDivisionError when the data is empty or fully filtered out.
    if sample_size == 0:
        return float('nan'), mismatch
    return right / sample_size, mismatch

In [4]:
# calculating accuracy of laplace method (the default 'lap'), including all genders
accu, mismatch = accuracy()
# display the accuracy score and some of the mismatched cases
# (the first element of `mismatch` is a header row)
accu, mismatch[:20]

(0.9371465046157715,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('游丁', 'M', 0.7563559322033898), '<------>', 'U'),
  (('梅必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('林乔', 'M', 0.49097162510748066), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.4845344880455854), '<------>', 'M'),
  (('连国萌', 'F', 0.5410949215661441), '<------>', 'M'),
  (('刘畏', 'M', 0.7812499999999999), '<------>', 'U'),
  (('巢云夕', 'F', 0.48294370286325383), '<------>', 'M'),
  (('南英', 'F', 0.8497095739063325), '<------>', 'U'),
  (('任善竺', 'F', 0.515157089128229), '<------>', 'M'),
  (('容庚', 'M', 0.8711217183770882), '<------>', 'U'),
  (('

In [5]:
# calculating accuracy of laplace method, excluding samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(include_U=False)
# display the accuracy score and some of the mismatched cases
# (the first element of `mismatch` is a header row)
accu, mismatch[:20]

(0.9623151018895784,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6055499573274709), '<------>', 'M'),
  (('卞佳臻', 'M', 0.5117781473971769), '<------>', 'F'),
  (('索刘敏', 'M', 0.5086917470381878), '<------>', 'F'),
  (('梅必霏', 'F', 0.5617018756556652), '<------>', 'M'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.4845344880455854), '<------>', 'M'),
  (('连国萌', 'F', 0.5410949215661441), '<------>', 'M'),
  (('巢云夕', 'F', 0.48294370286325383), '<------>', 'M'),
  (('任善竺', 'F', 0.515157089128229), '<------>', 'M'),
  (('沙洪柳', 'F', 0.573392514338241), '<------>', 'M'),
  (('上官李那', 'F', 0.5988431816367817), '<------>', 'M'),
  (('任彦米', 'F', 0.6001639533890436), '<------>', 'M'),
  (('司空嘉桐', 'F', 0.4947270082700448), '<------>', 'M'),
  (('尤星冉', 'F', 0.5107737217938496), '<------>', 'M'),
  (('诸乔楚', 'F', 0.6450424097367288), '<------>', 'M'),
  (('苗蕙全', 'F', 0.6018003286852724), '<------>', 'M'),
  (('纪会会', 'Undefined', 0.3400954333668674), '<------

In [6]:
# calculating accuracy of good turing method ('gt'), including all genders
accu, mismatch = accuracy(method='gt')
# display the accuracy score and some of the mismatched cases
# (the first element of `mismatch` is a header row)
accu, mismatch[:20]

(0.9300936822566844,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('付睿', 'M', 0.6372919818456882), '<------>', 'U'),
  (('郭连', 'M', 0.5347978585933173), '<------>', 'U'),
  (('游丁', 'M', 0.8612786489746682), '<------>', 'U'),
  (('林乔', 'M', 0.49097162510748066), '<------>', 'U'),
  (('闫韶华', 'M', 0.5934781105033407), '<------>', 'U'),
  (('白榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.5743242655385526), '<------>', 'M'),
  (('司敬园', 'M', 0.5393666618020849), '<------>', 'F'),
  (('连国萌', 'F', 0.7766702031184257), '<------>', 'M'),
  (('刘畏', 'M', 0.8627366736693435), '<------>', 'U'),
  (('巢云夕', 'F', 0.5254527428157781), '<------>', 'M'),
  (('南英', 'F', 0.8497095739063325), '<------>', 'U'),
  (('任善竺', 'F', 0.6508855569711229), '<------>', 'M'),
  (('

In [7]:
# calculating accuracy of good turing method ('gt'), excluding samples whose observed gender is 'U' (undefined)
accu, mismatch = accuracy(method='gt', include_U=False)
# display the accuracy score and some of the mismatched cases
# (the first element of `mismatch` is a header row)
accu, mismatch[:20]

(0.9550728644873521,
 [('prediction', '<------>', 'observation'),
  (('万宇烟', 'F', 0.6571441544919221), '<------>', 'M'),
  (('卞佳臻', 'M', 0.512897141401947), '<------>', 'F'),
  (('索刘敏', 'M', 0.5094639697360233), '<------>', 'F'),
  (('白榇译', 'M', 0.5307568158065461), '<------>', 'F'),
  (('李思宁', 'F', 0.5074970136241429), '<------>', 'M'),
  (('钭奕渝', 'F', 0.5743242655385526), '<------>', 'M'),
  (('司敬园', 'M', 0.5393666618020849), '<------>', 'F'),
  (('连国萌', 'F', 0.7766702031184257), '<------>', 'M'),
  (('巢云夕', 'F', 0.5254527428157781), '<------>', 'M'),
  (('任善竺', 'F', 0.6508855569711229), '<------>', 'M'),
  (('任彦米', 'F', 0.810353129785596), '<------>', 'M'),
  (('司空嘉桐', 'F', 0.5042679439195341), '<------>', 'M'),
  (('尤星冉', 'F', 0.5107737217938496), '<------>', 'M'),
  (('诸乔楚', 'F', 0.63959665777796), '<------>', 'M'),
  (('谷寒宫', 'F', 0.6038967915822041), '<------>', 'M'),
  (('纪会会', 'Undefined', 0.3400954333668674), '<------>', 'F'),
  (('毕琳伟', 'F', 0.7103688853484734), '<------>', 