In [1]:
import numpy as np
from __future__ import division
import nltk, re, pprint
from nltk import word_tokenize


In [2]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])

In [3]:
import random

random.shuffle(labeled_names)


In [4]:
len(labeled_names)

7944

In [5]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [6]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:],
            'prefix1': word[:1],
            'prefix1': word[:2]}

In [7]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]


In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.815


In [10]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
    

In [11]:
for (tag, guess, name) in sorted (errors):
    print ('correct={} guess={} name={}'.format(tag, guess, name))

correct=female guess=male name=Abbey
correct=female guess=male name=Adey
correct=female guess=male name=Adrien
correct=female guess=male name=Aeriell
correct=female guess=male name=Alexis
correct=female guess=male name=Anais
correct=female guess=male name=Ariel
correct=female guess=male name=Bamby
correct=female guess=male name=Barbey
correct=female guess=male name=Bev
correct=female guess=male name=Blondell
correct=female guess=male name=Brandais
correct=female guess=male name=Brett
correct=female guess=male name=Brit
correct=female guess=male name=Brook
correct=female guess=male name=Calypso
correct=female guess=male name=Carol-Jean
correct=female guess=male name=Cherish
correct=female guess=male name=Christan
correct=female guess=male name=Cleo
correct=female guess=male name=Cordey
correct=female guess=male name=Corey
correct=female guess=male name=Courtenay
correct=female guess=male name=Cybill
correct=female guess=male name=Dael
correct=female guess=male name=Damaris
correct=femal

In [12]:
def vowel_ratio(word):
    if len(re.findall(r'[aeiouy]', word))/len(word)>= 0.8:
        return ".8+"
    elif len(re.findall(r'[aeiouy]', word))/len(word)>= 0.6:
        return "0.6 - 0.8"
    elif len(re.findall(r'[aeiouy]', word))/len(word)>= 0.4:
        return "0.4 - 0.6"
    elif len(re.findall(r'[aeiouy]', word))/len(word)>= 0.2:
        return "0.2 - 0.4"
    else: 
        return "< 0.2"

### find the length of the names in our training set to establish what constitute extreme lengths (top 10% or bottom 10%)

In [13]:
import pandas as pd

#create a dataframe
names = []
for (name, gender) in train_names:
    names.append(name)

df = pd.DataFrame(names)



In [14]:
#loop through and create a new column
length = []
for row in df[0]:
    length.append(len(row))

df['length'] = length


In [15]:
top_10p = df.quantile(0.9)
bottom_10p = df.quantile(0.1)
print bottom_10p, top_10p

length    4
dtype: float64 length    8
dtype: float64


In [16]:
def word_length(word):
    if len(word) > 8:
        return ">8"
    elif len(word) < 4:
        return "<4"
    else: 
        return "4-8"

In [17]:
def gender_features(word):
    return {'vowels' : word_length(word)}



In [18]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [19]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.63


In [20]:
#I didn't dfind this to improve the model so it is excluded from below
print classifier.show_most_informative_features(5)

Most Informative Features
                  vowels = '<4'             male : female =      2.0 : 1.0
                  vowels = '>8'           female : male   =      1.3 : 1.0
                  vowels = '4-8'          female : male   =      1.0 : 1.0
None


In [21]:
#This was my best model so far

def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:],
            'suffix3': word[-3:],
            'prefix1': word[:1],
            'prefix2': word[:2],
            'prefix3': word[:3],
            'vowels' : vowel_ratio(word)}




In [22]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [23]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.848


In [24]:
print classifier.show_most_informative_features(50)

Most Informative Features
                 suffix2 = u'na'          female : male   =    149.6 : 1.0
                 suffix1 = u'k'             male : female =     68.4 : 1.0
                 suffix2 = u'la'          female : male   =     65.6 : 1.0
                 suffix2 = u'ia'          female : male   =     48.5 : 1.0
                 suffix1 = u'a'           female : male   =     41.5 : 1.0
                 suffix2 = u'ta'          female : male   =     38.2 : 1.0
                 suffix2 = u'us'            male : female =     34.2 : 1.0
                 suffix2 = u'ra'          female : male   =     32.6 : 1.0
                 suffix3 = u'nne'         female : male   =     30.9 : 1.0
                 suffix2 = u'sa'          female : male   =     30.7 : 1.0
                 suffix2 = u'rd'            male : female =     28.7 : 1.0
                 suffix3 = u'ard'           male : female =     26.2 : 1.0
                 suffix2 = u'io'            male : female =     24.9 : 1.0

In [25]:
#error check
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
        
for (tag, guess, name) in sorted (errors):
    print ('correct={} guess={} name={}'.format(tag, guess, name))

correct=female guess=male name=Abbey
correct=female guess=male name=Adorne
correct=female guess=male name=Adrien
correct=female guess=male name=Alexis
correct=female guess=male name=Ariel
correct=female guess=male name=Bamby
correct=female guess=male name=Barbee
correct=female guess=male name=Barbey
correct=female guess=male name=Berny
correct=female guess=male name=Bev
correct=female guess=male name=Blondell
correct=female guess=male name=Brandais
correct=female guess=male name=Brandy
correct=female guess=male name=Brett
correct=female guess=male name=Brook
correct=female guess=male name=Calypso
correct=female guess=male name=Cherish
correct=female guess=male name=Christan
correct=female guess=male name=Clemence
correct=female guess=male name=Cleo
correct=female guess=male name=Constance
correct=female guess=male name=Courtenay
correct=female guess=male name=Dael
correct=female guess=male name=Damaris
correct=female guess=male name=Deb
correct=female guess=male name=Del
correct=female