In [5]:
import nltk
import random
import numpy as np
from itertools import repeat
from sklearn.model_selection import train_test_split
from nltk.corpus import names

In [6]:
# confirm male and female txt files exist
names.fileids()

['female.txt', 'male.txt']

In [7]:
# load male and female  name files from nltk.names; store in people list
males = [n for n in names.words('male.txt')] 
females = [n for n in names.words('female.txt')] 
people = males + females

# make gender list
gender = list(repeat('male',len(males))) + \
list(repeat('female',len(females)))


In [8]:
# produce features
def gender_features(word, *args):
    """
    function returns dictionary of features
        word: name to extract features from
        args:  one or more strings to specify desired features, including:
                'length','first','first2','first3', 'last', 'last2', 'last3',
                'every_other2_beg','every_other3_beg', 'every_other2_end', 'every_other3_end',
                'vowel_ct', 'round_cons_ct', 'sharp_cons_ct','round_vowel_ct',
                trad_female_end'
               
    """
    
    gf = {}
    
    # word length
    gf['length'] = len(word)
   
    # first letters
    gf['first'] = word[0].lower()
    gf['first2'] = word[0:2].lower()
    gf['first3'] = word[0:3].lower() if gf['length'] >2  else word[0:2].lower()
    
    # last letters
    gf['last'] = word[-1].lower()
    gf['last2'] = word[-2:].lower()
    gf['last3'] = word[-3:].lower() if gf['length'] >2  else word[-2:].lower()
    
    # every other beg
    gf['every_other2_beg'] = word[0]+word[2] if gf['length'] > 2 else word[0]
    gf['every_other3_beg'] = gf['every_other2_beg']+word[4]  if gf['length'] > 4 else \
    gf['every_other2_beg']
    
    # every other end
    gf['every_other2_end'] = word[-3]+word[-1] if gf['length'] > 2 else word[-1]
    gf['every_other3_end'] = word[-5]+gf['every_other2_end']  if gf['length'] > 4 else \
    gf['every_other2_end']
    
    # count: vowels, rounded consonants, sharp consonants
    for letter in word:
        # count vowels
        if letter in 'aeiou':
            gf['vowel_ct'] = gf.get('vowel_ct',0) + 1
        # count rounded consonants
        if letter in 'bmln':
            gf['round_cons_ct'] = gf.get('round_cons_ct',0) + 1
        # count sharp consonants
        if letter in 'k,p,t':
            gf['sharp_cons_ct'] = gf.get('sharp_cons_ct',0) + 1
        # count rounded vowels
        if letter in 'uo':
            gf['round_vowel_ct'] = gf.get('round_vowel_ct',0) + 1
            
    # traditional feminine ending, 'y' or 'n'
    gf['trad_female_end'] = 'y' if gf['last2'] in ['ie','ah'] or \
    gf['last'] in ['a','y'] else 'n'
    
    # generate dictionary subset
    return(dict((k, gf[k]) for k in args if k in gf))
    
       

In [9]:
# specify which features to use
myargs = ['length','first','first2','first3', 'last', 'last2', 'last3', \
          'every_other2_beg','every_other3_beg', 'every_other2_end', 'every_other3_end', \
          'vowel_ct', 'round_cons_ct', 'sharp_cons_ct','round_vowel_ct', \
          'trad_female_end']

# specify name, and argument list 
gender_features('Sandy', *myargs)

{'length': 5,
 'first': 's',
 'first2': 'sa',
 'first3': 'san',
 'last': 'y',
 'last2': 'dy',
 'last3': 'ndy',
 'every_other2_beg': 'Sn',
 'every_other3_beg': 'Sny',
 'every_other2_end': 'ny',
 'every_other3_end': 'Sny',
 'vowel_ct': 1,
 'round_cons_ct': 1,
 'trad_female_end': 'y'}

In [11]:
# split into test and train, with test file containing 1000 samples
people_train, people_test, gender_train, gender_test =  \
train_test_split(people, gender, test_size=1000, random_state=4)

# split test into two separate components of 500 each: test and devtest
people_test, people_devtest, gender_test, gender_devtest = \
train_test_split(people_test, gender_test, test_size=500, random_state=4)

# list of tuples, gender features, gender
train_set = list(zip(map(lambda d: gender_features(d, *myargs), people_train),gender_train))
devtest_set = list(zip(map(lambda d: gender_features(d, *myargs), people_devtest),gender_devtest))
test_set = list(zip(map(lambda d: gender_features(d, *myargs), people_test),gender_test))


# list of tuples, names, gender
train_names = list(zip(people_train,gender_train))
devtest_names = list(zip(people_devtest,gender_devtest))
test_names = list(zip(people_test, gender_test))

# train naive bayes classifier 
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [15]:
# look at most informative features
classifier.show_most_informative_features(50)

Most Informative Features
                   last2 = 'na'           female : male   =     94.0 : 1.0
        every_other2_end = 'la'           female : male   =     77.7 : 1.0
                   last2 = 'la'           female : male   =     68.3 : 1.0
        every_other2_end = 'ea'           female : male   =     62.7 : 1.0
        every_other2_end = 'ia'           female : male   =     54.2 : 1.0
                    last = 'a'            female : male   =     36.9 : 1.0
                   last2 = 'ia'           female : male   =     36.2 : 1.0
                   last2 = 'ra'           female : male   =     33.9 : 1.0
                    last = 'k'              male : female =     30.6 : 1.0
                   last2 = 'us'             male : female =     29.1 : 1.0
                   last2 = 'ta'           female : male   =     28.9 : 1.0
                   last2 = 'rd'             male : female =     27.2 : 1.0
        every_other3_end = 'aia'          female : male   =     27.0 : 1.0

In [13]:
# classifer accuracy on validation set
print(nltk.classify.accuracy(classifier, devtest_set))

0.84


In [9]:
# look at names that were mis-classified
errors = []
for (name, tag) in devtest_names:
    #print(name)
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

print('actual, guess, name: \n')
for x in errors:
    print(x)
        
        

actual, guess, name: 

('female', 'male', 'Winonah')
('male', 'female', 'Abbie')
('male', 'female', 'Clayborne')
('female', 'male', 'Abbey')
('male', 'female', 'Ivor')
('male', 'female', 'Reese')
('male', 'female', 'Randi')
('male', 'female', 'Aguste')
('male', 'female', 'Lucien')
('male', 'female', 'Davide')
('male', 'female', 'Damien')
('female', 'male', 'Benny')
('male', 'female', 'Clarance')
('male', 'female', 'Percival')
('male', 'female', 'Malcolm')
('male', 'female', 'Gabriell')
('female', 'male', 'Kim')
('male', 'female', 'Arel')
('female', 'male', 'Lamb')
('male', 'female', 'Morly')
('male', 'female', 'Lane')
('male', 'female', 'Adams')
('male', 'female', 'Casey')
('male', 'female', 'Tremayne')
('female', 'male', 'Ruthy')
('male', 'female', 'Michele')
('male', 'female', 'Kit')
('male', 'female', 'Heath')
('male', 'female', 'Darien')
('female', 'male', 'Chrysler')
('male', 'female', 'Terrence')
('female', 'male', 'Franky')
('female', 'male', 'Quentin')
('male', 'female', 'Gabri

### References
http://www.nltk.org/howto/corpus.html