In [1]:
import pandas as pd
import nltk
from nltk.corpus import names
from nltk.metrics.scores import (precision, recall)
import random

In [2]:
# Build one list of (lowercased name, gender label) pairs from the NLTK names
# corpus: every entry of male.txt first, then every entry of female.txt.
# Heuristic motivating the features used later: female names usually end in
# a, i, e and male names in k, o, r, s, t.
# The corpus is reached through nltk.corpus.names rather than the bare name
# `names`, because this cell rebinds `names` to the labeled list; going
# through the module keeps the cell safe to run more than once.
names = ([(name.lower(), 'male') for name in nltk.corpus.names.words('male.txt')] +
         [(name.lower(), 'female') for name in nltk.corpus.names.words('female.txt')])

In [3]:
# Seed the global random generator right before shuffling so the order of
# `names` -- and therefore the test/dev-test/train slices taken from it -- is
# reproducible from a fresh kernel.
# NOTE(review): random.shuffle works in place, so running this cell a second
# time reshuffles the already-shuffled list and produces a different order.
random.seed(4)
random.shuffle(names)

In [4]:
# use suffixes more than prefixes, add a length feature if anything
def gender_features_custom(name):
    """Extract the feature dictionary used to classify a first name by gender.

    Parameters
    ----------
    name : str
        The name to featurize. The notebook always passes lowercased names.

    Returns
    -------
    dict
        'suffix1', 'suffix2', 'suffix3': the last 1, 2 and 3 characters;
        'prefix2', 'prefix3': the first 2 and 3 characters;
        'length': number of characters in the name;
        'n_vowels': how many characters are one of a, e, i, o, u
        (counted case-insensitively).

    An empty string yields empty-string affixes and zero counts instead of
    raising IndexError.
    """
    # Slicing (name[-1:]) rather than indexing (name[-1]) tolerates "".
    # Lowercasing only affects the vowel count; the affix features keep the
    # caller's casing unchanged.
    vowel_count = sum(ch in "aeiou" for ch in name.lower())
    return {'suffix1': name[-1:],
            'suffix2': name[-2:],
            'suffix3': name[-3:],
            'prefix2': name[0:2],
            'prefix3': name[0:3],
            'length': len(name),
            'n_vowels': vowel_count}

In [5]:
# Partition the shuffled names into three disjoint slices:
# the first 500 for the final test, the next 500 for dev-test error analysis,
# and everything after that for training.
test_names, dev_test_names, train_names = names[:500], names[500:1000], names[1000:]

# Convert every (name, gender) pair into a (feature dict, gender) pair.
test_set = [(gender_features_custom(person), label) for (person, label) in test_names]
dev_test_set = [(gender_features_custom(person), label) for (person, label) in dev_test_names]
train_set = [(gender_features_custom(person), label) for (person, label) in train_names]

# Fit the Naive Bayes model on the training slice only.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [6]:
# Show the class labels the trained classifier can emit.
# A single pre-formatted string prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("Labels:  %s" % (classifier.labels(),))

Labels:  ['male', 'female']


In [7]:
# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in `print` would append a stray "None".
classifier.show_most_informative_features(1000)

Most Informative Features
                 suffix2 = u'na'          female : male   =    156.7 : 1.0
                 suffix2 = u'la'          female : male   =     72.6 : 1.0
                 suffix2 = u'ia'          female : male   =     38.0 : 1.0
                 suffix2 = u'ld'            male : female =     37.0 : 1.0
                 suffix1 = u'a'           female : male   =     35.1 : 1.0
                 suffix2 = u'sa'          female : male   =     34.4 : 1.0
                 suffix3 = u'nne'         female : male   =     30.4 : 1.0
                 suffix2 = u'rd'            male : female =     29.1 : 1.0
                 suffix1 = u'k'             male : female =     28.7 : 1.0
                 suffix2 = u'us'            male : female =     26.3 : 1.0
                 suffix2 = u'ch'            male : female =     26.0 : 1.0
                 suffix2 = u'io'            male : female =     26.0 : 1.0
                 suffix1 = u'f'             male : female =     25.3 : 1.0

In [8]:
# Accuracy of the trained classifier on the held-out dev-test slice.
# A single pre-formatted string prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("Accuracy dev-test data:  %s" % nltk.classify.accuracy(classifier, dev_test_set))

Accuracy dev-test data:  0.828


In [9]:
# See what was misclassified: collect (true label, predicted label, name)
# for every dev-test name whose prediction disagrees with its label.
errors = []
for (name, tag) in dev_test_names:
    guess = classifier.classify(gender_features_custom(name))
    if guess != tag:
        errors.append((tag, guess, name))

# Sorting the triples groups the errors by true label, then by predicted
# label, then alphabetically by name. One pre-formatted string per row prints
# the same text under the Python 2 print statement and the Python 3 print function.
for (tag, guess, name) in sorted(errors):
    print('correct= %s guess= %s name= %s' % (tag, guess, name))

correct= female guess= male name= agnes
correct= female guess= male name= alis
correct= female guess= male name= allyson
correct= female guess= male name= avivah
correct= female guess= male name= bab
correct= female guess= male name= barry
correct= female guess= male name= biddy
correct= female guess= male name= binny
correct= female guess= male name= bird
correct= female guess= male name= blanch
correct= female guess= male name= bo
correct= female guess= male name= brigid
correct= female guess= male name= carmon
correct= female guess= male name= charis
correct= female guess= male name= charmian
correct= female guess= male name= chloris
correct= female guess= male name= cloris
correct= female guess= male name= constancy
correct= female guess= male name= cybill
correct= female guess= male name= daffy
correct= female guess= male name= devin
correct= female guess= male name= dusty
correct= female guess= male name= eran
correct= female guess= male name= garland
correct= female guess= male 

In [10]:
# Show the number of mislabeled dev-test names.
# A single pre-formatted string prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("Mislabeled names:  %d" % len(errors))

Mislabeled names:  86
