In [1]:
from gender import Gender

In [2]:
#loading test file

def test_set(file='data/t.txt'):
    names, genders = [], []
    f = open(file, 'r')
    next(f)
    for line in f:
        line = line.split('\t')
        # line[2] = full name; line[1] = first name
        names.append(line[2])
        genders.append(line[3].strip())
    
    return names, genders


# Load the default test file into module-level lists; accuracy() below reads
# `names` and `genders` from module scope, so this cell must run first.
names, genders = test_set()
# Sanity check: both lists should have the same length.
len(names), len(genders)

(2926486, 2926486)

In [3]:
# calculating the prediction accuracy 

def accuracy(method='lap', include_U=True):
    """Score ``Gender().predict`` against the module-level test data.

    Reads the module-level ``names`` and ``genders`` lists, predicts a
    gender for each name, and compares element 1 of each prediction with the
    observed label.

    Args:
        method: Passed through unchanged as the second argument of
            ``predict`` (the notebook uses 'lap' and 'gt').
        include_U: When True every row is scored. When False, rows whose
            observed label is 'U' are skipped and do not count towards the
            sample size.

    Returns:
        A ``(score, mismatch)`` tuple. ``score`` is the fraction of scored
        rows whose predicted label equals the observed label, or 0.0 when no
        rows were scored. ``mismatch`` is a list whose first entry is a
        header tuple, followed by one ``(prediction, '<------>', observed)``
        tuple per incorrectly predicted row.
    """
    # NOTE(review): predict() appears to return (name, label, score), judging
    # from the notebook output; only index 1 is relied on here. The meaning
    # of the third positional argument (False) is defined in the gender
    # module -- confirm there.
    predict = Gender().predict
    right = 0
    evaluated = 0
    mismatch = [('prediction', '<------>', 'observation')]

    for name, real in zip(names, genders):
        if not include_U and real == 'U':
            # Unlabelled rows are excluded from both numerator and denominator.
            continue
        evaluated += 1
        pred = predict(name, method, False)
        if pred[1] == real:
            right += 1
        else:
            mismatch.append((pred, '<------>', real))

    # Guard against division by zero when there is nothing to score
    # (empty data, or every row is 'U' with include_U=False).
    if evaluated == 0:
        return 0.0, mismatch
    return right / evaluated, mismatch

In [4]:
# Accuracy of the Laplace method (method='lap', the default), including all
# genders: rows labelled 'U' are scored as well.
accu, mismatch = accuracy()
# Display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row).
accu, mismatch[:20]

(0.9376819161273965,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('武美啸', 'M', 0.5175399048014521), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('吴乐懿', 'M', 0.6334930288052831), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('褚飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('沈云华', 'M', 0.3768863196945002), '<------>', 'U'),
  (('施正漪', 'F', 0.7944377272158326), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('柯逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('吴宝懿', 'M', 0.684011403531615), '<------>', 'U'),
  (('赵珣', 'M', 0.4626865671641791), '<------>', 'U'),
  (('魏俊华', 'M', 0.812477928343533), '<------>', 'U'),
  (('卞郡', 'F', 0.5093457943925234), '<------>', 'M'),
  (('

In [5]:
# Accuracy of the Laplace method (method='lap', the default), excluding
# undefined genders: rows labelled 'U' are skipped and not counted.
accu, mismatch = accuracy(include_U=False)
# Display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row).
accu, mismatch[:20]

(0.9624090238263583,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6188568425156774), '<------>', 'M'),
  (('武美啸', 'M', 0.5175399048014521), '<------>', 'F'),
  (('张彩龙', 'F', 0.5669448429175324), '<------>', 'M'),
  (('褚飘庆', 'F', 0.6278352387730713), '<------>', 'M'),
  (('施正漪', 'F', 0.7944377272158326), '<------>', 'M'),
  (('吉晗', 'F', 0.6603485838779957), '<------>', 'M'),
  (('卞郡', 'F', 0.5093457943925234), '<------>', 'M'),
  (('段常桃', 'F', 0.6254946956457782), '<------>', 'M'),
  (('栾德慧', 'F', 0.5536782694266356), '<------>', 'M'),
  (('李旭彤', 'M', 0.5467137154925025), '<------>', 'F'),
  (('乔海云', 'M', 0.5714315674455618), '<------>', 'F'),
  (('王越M', 'Undefined', 0.7078066158956854), '<------>', 'F'),
  (('游松菱', 'F', 0.7711401793521472), '<------>', 'M'),
  (('饶璃', 'F', 0.5862068965517242), '<------>', 'M'),
  (('闻思嘉', 'F', 0.5991541905808809), '<------>', 'M'),
  (('杨坷潼', 'F', 0.6194318626785917), '<------>', 'M'),
  (('蒲厘杉', 'M', 0.6062027508955053), '<------>', 

In [6]:
# Accuracy of the Good-Turing method (method='gt'), including all genders:
# rows labelled 'U' are scored as well.
accu, mismatch = accuracy(method='gt')
# Display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row).
accu, mismatch[:20]

(0.9307989171996722,
 [('prediction', '<------>', 'observation'),
  (('贾晔', 'M', 0.49505494505494513), '<------>', 'U'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('武美啸', 'M', 0.7945392799222982), '<------>', 'F'),
  (('林文冰', 'F', 0.5265047592527463), '<------>', 'U'),
  (('宋文会', 'M', 0.5966601669895142), '<------>', 'U'),
  (('吴乐懿', 'M', 0.6174312283858132), '<------>', 'U'),
  (('林水荣', 'M', 0.7984595416770441), '<------>', 'U'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('沈云华', 'M', 0.3768863196945002), '<------>', 'U'),
  (('施正漪', 'F', 0.7621473175264708), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('柯逸', 'M', 0.6708860759493671), '<------>', 'U'),
  (('吴宝懿', 'M', 0.662937360224399), '<------>', 'U'),
  (('赵珣', 'M', 0.8688826744629464), '<------>', 'U'),
  (('魏俊华', 'M', 0.812477928343533), '<------>', 'U'),
  (('贾渝', 'F', 0.5544634224943603), '<------>', 'M'),
  (('卞郡', 'F', 0.5190476190476191), '<------>', 'M'),
  (('陈游

In [7]:
# Accuracy of the Good-Turing method (method='gt'), excluding undefined
# genders: rows labelled 'U' are skipped and not counted.
accu, mismatch = accuracy(method='gt', include_U=False)
# Display the accuracy score and the first 20 entries of the mismatch list
# (entry 0 is the header row).
accu, mismatch[:20]

(0.955344517019629,
 [('prediction', '<------>', 'observation'),
  (('秋紫俊', 'F', 0.6227565027133336), '<------>', 'M'),
  (('武美啸', 'M', 0.7945392799222982), '<------>', 'F'),
  (('张彩龙', 'F', 0.7571218017776794), '<------>', 'M'),
  (('施正漪', 'F', 0.7621473175264708), '<------>', 'M'),
  (('吉晗', 'F', 0.662272396212673), '<------>', 'M'),
  (('贾渝', 'F', 0.5544634224943603), '<------>', 'M'),
  (('卞郡', 'F', 0.5190476190476191), '<------>', 'M'),
  (('段常桃', 'F', 0.6131184187129176), '<------>', 'M'),
  (('余舁', 'F', 0.5190522056268192), '<------>', 'M'),
  (('姜明瑾', 'M', 0.5441911101311979), '<------>', 'F'),
  (('栾德慧', 'F', 0.5536782694266356), '<------>', 'M'),
  (('李旭彤', 'M', 0.5475283303662931), '<------>', 'F'),
  (('乔海云', 'M', 0.5714315674455618), '<------>', 'F'),
  (('王越M', 'Undefined', 0.7078066158956854), '<------>', 'F'),
  (('宋悠也', 'M', 0.6709454839161347), '<------>', 'F'),
  (('潘珑瑚', 'Undefined', 0.5830852073228867), '<------>', 'F'),
  (('游松菱', 'F', 0.754092362381194), '<------