In [27]:
import nltk
import random
import numpy as np
from itertools import repeat
from sklearn.model_selection import train_test_split
from nltk.corpus import names
import math 

In [28]:
# confirm male and female txt files exist
names.fileids()

['female.txt', 'male.txt']

In [30]:
# load male and female  name files from nltk.names; store in people list
males = [n for n in names.words('male.txt')] 
females = [n for n in names.words('female.txt')] 
people = males + females

# make gender list
gender = list(repeat('male',len(males))) + \
list(repeat('female',len(females)))


In [31]:
# produce features
def gender_features(word, *args):
    """
    function returns dictionary of features
        word: name to extract features from
        args:  one or more strings to specify desired features, including:
                'length','first','first2','first3', 'last', 'last2', 'last3',
                'every_other2_beg','every_other3_beg', 'every_other2_end', 'every_other3_end',
                'vowel_ct', 'round_cons_ct', 'sharp_cons_ct','round_vowel_ct',
                trad_female_end'
               
    """
    
    gf = {}
    
    # word length
    gf['length'] = len(word)
   
    # first letters
    gf['first'] = word[0].lower()
    gf['first2'] = word[0:2].lower()
    gf['first3'] = word[0:3].lower() if gf['length'] >2  else word[0:2].lower()
    
    # last letters
    gf['last'] = word[-1].lower()
    gf['last2'] = word[-2:].lower()
    gf['last3'] = word[-3:].lower() if gf['length'] >2  else word[-2:].lower()
    
    # every other beg
    gf['every_other2_beg'] = word[0]+word[2] if gf['length'] > 2 else word[0]
    gf['every_other3_beg'] = gf['every_other2_beg']+word[4]  if gf['length'] > 4 else \
    gf['every_other2_beg']
    
    # every other end
    gf['every_other2_end'] = word[-3]+word[-1] if gf['length'] > 2 else word[-1]
    gf['every_other3_end'] = word[-5]+gf['every_other2_end']  if gf['length'] > 4 else \
    gf['every_other2_end']
    
    # count: vowels, rounded consonants, sharp consonants
    for letter in word:
        # count vowels
        if letter in 'aeiou':
            gf['vowel_ct'] = gf.get('vowel_ct',0) + 1
        # count rounded consonants
        if letter in 'bmln':
            gf['round_cons_ct'] = gf.get('round_cons_ct',0) + 1
        # count sharp consonants
        if letter in 'k,p,t':
            gf['sharp_cons_ct'] = gf.get('sharp_cons_ct',0) + 1
        # count rounded vowels
        if letter in 'uo':
            gf['round_vowel_ct'] = gf.get('round_vowel_ct',0) + 1
            
    # traditional feminine ending, 'y' or 'n'
    gf['trad_female_end'] = 'y' if gf['last2'] in ['ie','ah'] or \
    gf['last'] in ['a','y'] else 'n'
    
    # generate dictionary subset
    return(dict((k, gf[k]) for k in args if k in gf))
    
       

In [32]:
# specify which features to use
myargs = ['length','first','first2','first3', 'last', 'last2', 'last3', \
          'every_other2_beg','every_other3_beg', 'every_other2_end', 'every_other3_end', \
          'vowel_ct', 'round_cons_ct', 'sharp_cons_ct','round_vowel_ct', \
          'trad_female_end']

# specify name, and argument list 
gender_features('Sandy', *myargs)

{'every_other2_beg': 'Sn',
 'every_other2_end': 'ny',
 'every_other3_beg': 'Sny',
 'every_other3_end': 'Sny',
 'first': 's',
 'first2': 'sa',
 'first3': 'san',
 'last': 'y',
 'last2': 'dy',
 'last3': 'ndy',
 'length': 5,
 'round_cons_ct': 1,
 'trad_female_end': 'y',
 'vowel_ct': 1}

In [10]:
# split into test and train, with test file containing 1000 samples
people_train, people_test, gender_train, gender_test =  \
train_test_split(people, gender, test_size=1000, random_state=4)

# split test into two separate components of 500 each: test and devtest
people_test, people_devtest, gender_test, gender_devtest = \
train_test_split(people_test, gender_test, test_size=500, random_state=4)

# list of tuples, gender features, gender
train_set = list(zip(map(lambda d: gender_features(d, *myargs), people_train),gender_train))
devtest_set = list(zip(map(lambda d: gender_features(d, *myargs), people_devtest),gender_devtest))
test_set = list(zip(map(lambda d: gender_features(d, *myargs), people_test),gender_test))


# list of tuples, names, gender
train_names = list(zip(people_train,gender_train))
devtest_names = list(zip(people_devtest,gender_devtest))
test_names = list(zip(people_test, gender_test))





Let's  calculate the entropy of the labels in our dataset. The higher the entropy the better our classification algorithm

In [33]:

def entropy(labels):    
    freqdist = nltk.FreqDist(labels)    
    probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)]    
    return -sum([p * math.log(p,2) for p in probs])

print (entropy(gender))


0.951030970454714


We have a 95% entropy which very good

Let's now build a maximum entropy classifier based on the features. By default it will run for 100 iterations

In [34]:
classifier = nltk.classify.MaxentClassifier.train(
                         train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.368
             2          -0.42676        0.822
             3          -0.34445        0.877
             4          -0.29924        0.891
             5          -0.27015        0.903
             6          -0.24939        0.908
             7          -0.23355        0.914
             8          -0.22087        0.919
             9          -0.21039        0.923
            10          -0.20150        0.926
            11          -0.19381        0.929
            12          -0.18707        0.931
            13          -0.18109        0.933
            14          -0.17572        0.935
            15          -0.17087        0.936
            16          -0.16646        0.939
            17          -0.16243        0.940
            18          -0.15871        0.940
            19          -0.15528        0.942
 

In [35]:
# look at most informative features
classifier.show_most_informative_features(50)

   6.521 every_other3_end=='mtr' and label is 'female'
   6.361 every_other3_beg=='Nkt' and label is 'male'
   6.058 first3=='eti' and label is 'male'
   6.006 every_other3_end=='Gyn' and label is 'male'
  -5.671 first3=='ros' and label is 'male'
   5.507 last3=='ela' and label is 'male'
   5.451 every_other3_end=='Tmr' and label is 'female'
   4.982 every_other3_end=='bil' and label is 'male'
   4.915 last3=='rko' and label is 'male'
   4.859 every_other3_end=='tin' and label is 'male'
   4.792 last3=='rbe' and label is 'female'
  -4.774 last2=='os' and label is 'female'
   4.714 every_other3_end=='ir' and label is 'female'
   4.641 every_other3_end=='hra' and label is 'male'
   4.574 last3=='nly' and label is 'male'
  -4.546 last2=='na' and label is 'male'
  -4.517 every_other2_beg=='Nn' and label is 'male'
   4.430 every_other3_beg=='Ban' and label is 'male'
  -4.347 last3=='nne' and label is 'male'
   4.333 every_other3_end=='ea' and label is 'male'
   4.308 every_other3_end=='im' 

In [38]:
# classifer accuracy on validation set
print(nltk.classify.accuracy(classifier, devtest_set))

0.812


In [37]:
# look at names that were mis-classified
errors = []
for (name, tag) in devtest_names:
    #print(name)
    guess = classifier2.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

print('actual, guess, name: \n')
for x in errors:
    print(x)
        
        

actual, guess, name: 

('female', 'male', 'Winonah')
('female', 'male', 'Eada')
('female', 'male', 'Jennette')
('female', 'male', 'Kylila')
('female', 'male', 'Sonni')
('female', 'male', 'Karita')
('female', 'male', 'Rorie')
('female', 'male', 'Flora')
('female', 'male', 'Mechelle')
('female', 'male', 'Damita')
('female', 'male', 'Deloria')
('female', 'male', 'Rhody')
('female', 'male', 'Abbey')
('female', 'male', 'Bonnie')
('female', 'male', 'Leesa')
('female', 'male', 'Vanessa')
('female', 'male', 'Rubia')
('female', 'male', 'Shaylyn')
('female', 'male', 'Tessie')
('female', 'male', 'Monika')
('female', 'male', 'Susie')
('female', 'male', 'Larina')
('female', 'male', 'Margaretta')
('female', 'male', 'Erminia')
('female', 'male', 'Vickie')
('female', 'male', 'Seana')
('female', 'male', 'Tandie')
('female', 'male', 'Lira')
('female', 'male', 'Elise')
('female', 'male', 'Alexis')
('female', 'male', 'Hyacinthe')
('female', 'male', 'Leoline')
('female', 'male', 'Joann')
('female', 'male',

Applying the classifer on the test set to see accuracy

In [39]:
# classifer accuracy on validation set
print(nltk.classify.accuracy(classifier, test_set))

0.816


Conclusion
Maximum entropy classifier: We can see that it uses an iterative method to maximize the performance of the training corpus classification. In this case the default number of iteration was 100, which is reasonable for our dataset.
This is why it takes a long time to train a huge dataset and could also explain why it is not as popular.
Another drawback with this classifer is that can only answer the questions about the conditional probabilities of the feature and label compared to the naive bayes.
 
However, this classifier gives us options to associate more than one feature with a given label and vice versa. Another advantage is that the classifier does not require a vast amount of data like decision trees as long as the dataset has a high entropy.



### References
http://www.nltk.org/howto/corpus.html