In [200]:
import pandas as pd
import nltk
from nltk.corpus import names
from nltk.metrics.scores import (precision, recall)
import random
import re

In [201]:
# Heuristic motivating the suffix features used later: female names often end
# in a, e, i, while male names often end in k, o, r, s, t.
#
# Build one list of (lower-cased name, gender label) pairs from the NLTK names
# corpus. The corpus is accessed as `nltk.corpus.names` rather than through the
# bare imported `names`, because this cell rebinds `names` to the resulting
# list; using the bare name would make the cell raise AttributeError
# ('list' object has no attribute 'words') when it is run a second time.
names = ([(name.lower(), 'male') for name in nltk.corpus.names.words('male.txt')] +
         [(name.lower(), 'female') for name in nltk.corpus.names.words('female.txt')])

In [202]:
# Seed the global RNG immediately before shuffling so the resulting order
# comes from a fixed seed.
random.seed(4)
# Shuffles `names` in place. The test / dev-test / train slices taken from
# `names` further down depend on this order, so re-running this cell on an
# already-shuffled list produces a different split.
random.shuffle(names)

In [203]:
# use suffixes more than prefixes, add a length feature if anything
def gender_features_custom(name):
    """Build the feature dict used to classify a first name by gender.

    Args:
        name: The name as a string. The regex rules below only match
            lower-case letters, so callers are expected to lower-case first.

    Returns:
        A dict mapping feature name to value: suffixes and prefixes of
        length 1-4, the name length, a vowel count, and several 'y'/'n'
        flags from hand-written rules. An empty ``name`` yields empty
        suffix/prefix strings instead of raising IndexError.
    """
    # Counts only a/e/i/o/u; 'y' is never treated as a vowel here.
    vowel_count = sum(name.count(vowel) for vowel in "aeiou")

    def flag(pattern):
        # Encode a regex hit as the 'y'/'n' strings the feature set uses.
        return 'y' if re.search(pattern, name) else 'n'

    return {
        # Slicing (not indexing) so an empty name gives "" rather than IndexError.
        'suffix1': name[-1:],
        'suffix2': name[-2:],
        # For names shorter than 3 characters this is simply the whole name.
        'suffix3': name[-3:],
        # Deliberately empty unless the name has at least 4 characters.
        'suffix4': name[-4:] if len(name) > 3 else "",
        'prefix2': name[0:2],
        'prefix3': name[0:3],
        'prefix4': name[0:4],
        'length': len(name),
        'n_vowels': vowel_count,
        # A doubled consonant followed by a final 'y' (e.g. barry, daffy).
        'consonant_y': flag(r"([b-df-hj-np-tv-z])\1{1,}y$"),
        # Two-letter names such as jo, bo.
        'two_letters': 'y' if len(name) == 2 else 'n',
        # NOTE: despite the key name, this fires for names starting with
        # either "ro" or "ma". The key is kept unchanged because it is a
        # feature name of the trained model.
        'starts_ro': flag(r"^(ro|ma)"),
        # NOTE: despite the key name, this tests for the suffix "ie".
        'ends_with_E': flag(r"ie$"),
    }

In [204]:
# Sizes of the two held-out partitions; everything after them is training data.
N_TEST = 500
N_DEV_TEST = 500

# Partition the (already shuffled) name list into three disjoint slices:
#   [0, N_TEST)                   -> final test set
#   [N_TEST, N_TEST + N_DEV_TEST) -> dev-test set used for error analysis
#   [N_TEST + N_DEV_TEST, end)    -> training set
test_names = names[:N_TEST]
dev_test_names = names[N_TEST:N_TEST + N_DEV_TEST]
train_names = names[N_TEST + N_DEV_TEST:]

# Guard the split: the slices must cover every name exactly once, and there
# must be something left to train on.
assert len(test_names) + len(dev_test_names) + len(train_names) == len(names)
assert train_names, "no names left for training after removing the held-out sets"

# Turn each (name, gender) pair into a (feature dict, gender) pair.
train_set = [(gender_features_custom(n), g) for (n, g) in train_names]
dev_test_set = [(gender_features_custom(n), g) for (n, g) in dev_test_names]
test_set = [(gender_features_custom(n), g) for (n, g) in test_names]

# Fit the Naive Bayes classifier on the training partition only.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [205]:
# Show the class labels the trained classifier knows about.
# Formatted into a single string so the line reads the same on Python 2 and 3.
print("Labels:  %s" % (classifier.labels(),))

Labels:  ['male', 'female']


In [206]:
# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in `print` would append a stray "None".
classifier.show_most_informative_features(1000)

Most Informative Features
                 suffix2 = u'na'          female : male   =    156.7 : 1.0
                 suffix2 = u'la'          female : male   =     72.6 : 1.0
                 suffix2 = u'ia'          female : male   =     38.0 : 1.0
                 suffix2 = u'ld'            male : female =     37.0 : 1.0
                 suffix1 = u'a'           female : male   =     35.1 : 1.0
                 suffix2 = u'sa'          female : male   =     34.4 : 1.0
                 suffix3 = u'nne'         female : male   =     30.4 : 1.0
                 suffix2 = u'rd'            male : female =     29.1 : 1.0
                 suffix1 = u'k'             male : female =     28.7 : 1.0
                 suffix2 = u'us'            male : female =     26.3 : 1.0
                 suffix2 = u'ch'            male : female =     26.0 : 1.0
                 suffix2 = u'io'            male : female =     26.0 : 1.0
                 suffix1 = u'f'             male : female =     25.3 : 1.0

                 prefix4 = u'carl'        female : male   =      2.3 : 1.0
                 prefix3 = u'tam'         female : male   =      2.3 : 1.0
                 prefix3 = u'mor'           male : female =      2.3 : 1.0
                 prefix2 = u'et'          female : male   =      2.3 : 1.0
                 prefix2 = u'zo'          female : male   =      2.3 : 1.0
                 suffix3 = u'tin'           male : female =      2.3 : 1.0
                 prefix3 = u'han'           male : female =      2.3 : 1.0
                 prefix4 = u'augu'          male : female =      2.3 : 1.0
                 prefix4 = u'wall'          male : female =      2.3 : 1.0
                 prefix4 = u'theo'          male : female =      2.3 : 1.0
                 prefix4 = u'dari'          male : female =      2.3 : 1.0
                 prefix4 = u'gust'          male : female =      2.3 : 1.0
                 suffix4 = u'rian'          male : female =      2.3 : 1.0
                 prefix3 

In [207]:
# Fraction of dev-test names whose predicted label matches the true label.
dev_test_accuracy = nltk.classify.accuracy(classifier, dev_test_set)
print("Accuracy dev-test data:  %s" % (dev_test_accuracy,))

Accuracy dev-test data:  0.85


In [208]:
# Collect every dev-test name the classifier labels incorrectly, stored as
# (true label, predicted label, name) so that sorting groups by true label.
errors = []
for name, tag in dev_test_names:
    guess = classifier.classify(gender_features_custom(name))
    if guess == tag:
        continue  # correctly classified; nothing to record
    errors.append((tag, guess, name))

# List the mistakes ordered by true label, then predicted label, then name.
for tag, guess, name in sorted(errors):
    print('correct= %s guess= %s name= %s' % (tag, guess, name))

correct= female guess= male name= alis
correct= female guess= male name= allyson
correct= female guess= male name= avivah
correct= female guess= male name= bab
correct= female guess= male name= barry
correct= female guess= male name= biddy
correct= female guess= male name= bird
correct= female guess= male name= blanch
correct= female guess= male name= bo
correct= female guess= male name= brigid
correct= female guess= male name= carmon
correct= female guess= male name= charmian
correct= female guess= male name= cloris
correct= female guess= male name= constancy
correct= female guess= male name= cybill
correct= female guess= male name= daffy
correct= female guess= male name= devin
correct= female guess= male name= dusty
correct= female guess= male name= eran
correct= female guess= male name= ethyl
correct= female guess= male name= garland
correct= female guess= male name= grace
correct= female guess= male name= grayce
correct= female guess= male name= havivah
correct= female guess= male 

In [209]:
# Total number of dev-test names the classifier got wrong.
print("Mislabeled names:  %d" % len(errors))

Mislabeled names:  75
