In [130]:
import nltk
import random
import numpy as np
from itertools import repeat
from sklearn.model_selection import train_test_split
from nltk.corpus import names

In [187]:
# confirm male and female txt files exist
names.fileids()

['female.txt', 'male.txt']

In [188]:
# load male and female  name files from nltk.names; store in people list
males = [n for n in names.words('male.txt')] 
females = [n for n in names.words('female.txt')] 
people = males + females

# make gender list
gender = list(repeat('male',len(males))) + \
list(repeat('female',len(females)))


In [217]:
# extract features
def gender_features(word):
    features = {}
    #features['last'] = word[-1].lower()
    features['last_two'] = word[-2:].lower()
    features['first_two'] = word[0:3].lower()
    
    return(features)


gender_features('Shrek')

{'last_two': 'ek', 'first_two': 'shr'}

In [218]:
# split into test and train, with test file containing 1000 samples
people_train, people_test, gender_train, gender_test =  \
train_test_split(people, gender, test_size=1000, random_state=4)

# split test into two separate components of 500 each: test and devtest
people_test, people_devtest, gender_test, gender_devtest = \
train_test_split(people_test, gender_test, test_size=500, random_state=4)

# list of tuples, gender features, gender
train_set = list(zip(map(gender_features, people_train),gender_train))
devtest_set = list(zip(map(gender_features, people_devtest),gender_devtest))
test_set = list(zip(map(gender_features, people_test),gender_test))


# list of tuples, names, gender
train_names = list(zip(people_train,gender_train))
devtest_names = list(zip(people_devtest,gender_devtest))
test_names = list(zip(people_test, gender_test))

# train naive bayes classifier 
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [222]:
# look at most informative features
classifier.show_most_informative_features(20)

Most Informative Features
                last_two = 'na'           female : male   =     94.0 : 1.0
                last_two = 'la'           female : male   =     68.3 : 1.0
                last_two = 'ia'           female : male   =     36.2 : 1.0
                last_two = 'ra'           female : male   =     33.9 : 1.0
                last_two = 'us'             male : female =     29.1 : 1.0
                last_two = 'ta'           female : male   =     28.9 : 1.0
                last_two = 'rd'             male : female =     27.2 : 1.0
                last_two = 'ld'             male : female =     23.2 : 1.0
                last_two = 'rt'             male : female =     21.4 : 1.0
                last_two = 'do'             male : female =     20.8 : 1.0
                last_two = 'im'             male : female =     18.5 : 1.0
                last_two = 'os'             male : female =     17.4 : 1.0
               first_two = 'ros'          female : male   =     16.3 : 1.0

In [220]:
# classifer accuracy on validation set
print(nltk.classify.accuracy(classifier, devtest_set))

0.828


In [192]:
# look at names that were mis-classified
errors = []
for (name, tag) in devtest_names:
    #print(name)
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

print('actual, guess, name: \n')
for x in errors:
    print(x)
        
        

actual, guess, name: 

('female', 'male', 'Winonah')
('male', 'female', 'Abbie')
('male', 'female', 'Clayborne')
('male', 'female', 'Reese')
('male', 'female', 'Randi')
('male', 'female', 'Aguste')
('male', 'female', 'Lucien')
('male', 'female', 'Jeffry')
('male', 'female', 'Davide')
('male', 'female', 'Damien')
('female', 'male', 'Marillin')
('female', 'male', 'Sibby')
('female', 'male', 'Haley')
('male', 'female', 'Clarance')
('male', 'female', 'Kennedy')
('female', 'male', 'Kim')
('male', 'female', 'Arel')
('male', 'female', 'Morly')
('male', 'female', 'Lane')
('male', 'female', 'Adams')
('male', 'female', 'Casey')
('male', 'female', 'Tremayne')
('female', 'male', 'Ruthy')
('male', 'female', 'Michele')
('female', 'male', 'Marian')
('male', 'female', 'Kit')
('male', 'female', 'Darien')
('female', 'male', 'Joycelin')
('female', 'male', 'Chrysler')
('female', 'male', 'Marin')
('female', 'male', 'Franky')
('female', 'male', 'Honey')
('female', 'male', 'Quentin')
('male', 'female', 'Neal

### References
http://www.nltk.org/howto/corpus.html