In [403]:
### import pandas as pd
import nltk
from nltk.corpus import names
from nltk.metrics.scores import (precision, recall)
import random
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier

## Dataset

In [404]:
# female names usually end in a,i,e and male names are k,o,r,s,t
#return the last letter in a word
names = pd.DataFrame(([(name.lower(), 'male') for name in names.words('male.txt')] + 
        [(name.lower(), 'female') for name in names.words('female.txt')]),columns = ['name','gender'])
names.head()

Unnamed: 0,name,gender
0,aamir,male
1,aaron,male
2,abbey,male
3,abbie,male
4,abbot,male


## Features

In [405]:
# create features such as suffix with one letter, suffix with two letters, prefix with two letters, name length etc.  
names['suffix1'] = names['name'].str[-1]
names['suffix2'] = names['name'].str[-2:]
names['suffix3'] = names['name'].str[-3:]  
names['suffix4'] = names['name'].str[-4:] 
names['prefix2'] = names['name'].str[0:2]
names['prefix3'] = names['name'].str[0:3]
names['prefix4'] = names['name'].str[0:4]
names['length'] = names['name'].str.len()
names['n_vowels'] = (names['name'].str.count("a") + names['name'].str.count("e") + names['name'].str.count("i") + names['name'].str.count("o") + names['name'].str.count("u"))
names.head()    

Unnamed: 0,name,gender,suffix1,suffix2,suffix3,suffix4,prefix2,prefix3,prefix4,length,n_vowels
0,aamir,male,r,ir,mir,amir,aa,aam,aami,5,3
1,aaron,male,n,on,ron,aron,aa,aar,aaro,5,3
2,abbey,male,y,ey,bey,bbey,ab,abb,abbe,5,2
3,abbie,male,e,ie,bie,bbie,ab,abb,abbi,5,3
4,abbot,male,t,ot,bot,bbot,ab,abb,abbo,5,2


In [406]:
#function that allows to select set of features 
def gender_features_custom(dataset,features_list):
    features = []
    
    for index, row in dataset.iterrows():
        dict = {item:row[item] for item in features_list}
        features.append(dict)
    
    return features

Create different sets of features.

In [407]:
#for example
features_set1 = ['suffix2','length']
features_set2 = ['suffix2','n_vowels','prefix2']
features_set3 = ['n_vowels','suffix3']

## Training dataset and testing dataset

In [408]:
#shuffle dataset
random.seed(4)
#random.shuffle(names)

#split intitial dataset
train_names = names[1000:]
test_names = names[:500]
dev_test_names = names[500:1000]

def create_set(dataset,features_list):
    return list(zip(gender_features_custom(dataset,features_list),dataset['gender']))

## Naive Bayes

Run Naive Bayes Classifier for all features

In [409]:
all_features = list(names.columns)[2:]
all_features

['suffix1',
 'suffix2',
 'suffix3',
 'suffix4',
 'prefix2',
 'prefix3',
 'prefix4',
 'length',
 'n_vowels']

In [410]:
# train dataset
train_set = create_set(train_names,all_features)
classifier = nltk.NaiveBayesClassifier.train(create_set(train_names,all_features))
print (classifier.show_most_informative_features(20))

Most Informative Features
                 suffix2 = 'na'           female : male   =    119.4 : 1.0
                 suffix2 = 'ia'           female : male   =     66.6 : 1.0
                 suffix1 = 'a'            female : male   =     37.4 : 1.0
                 suffix1 = 'k'              male : female =     30.4 : 1.0
                 suffix2 = 'us'             male : female =     30.1 : 1.0
                 prefix2 = 'hu'             male : female =     27.4 : 1.0
                 suffix2 = 'ld'             male : female =     27.2 : 1.0
                 suffix2 = 'ra'           female : male   =     26.4 : 1.0
                 suffix2 = 'sa'           female : male   =     25.4 : 1.0
                 suffix2 = 'rd'             male : female =     23.9 : 1.0
                 suffix2 = 'do'             male : female =     23.9 : 1.0
                 suffix2 = 'ta'           female : male   =     23.4 : 1.0
                 prefix3 = 'tha'            male : female =     21.3 : 1.0

In [411]:
print ("Accuracy dev-test data: ", nltk.classify.accuracy(classifier, create_set(dev_test_names,all_features)))

Accuracy dev-test data:  0.454


In [412]:
# See what was misclassified:
#errors = []
#for (name, tag) in dev_test_names:
    #guess = classifier.classify(gender_features_custom(name))
    #if guess != tag:
        #errors.append((tag, guess, name))
        
#for (tag, guess, name) in sorted(errors):
    #print ('correct=', tag, 'guess=', guess, 'name=', name)

1. Run Naive Bayes Classifier for features set 1.

In [413]:
train_set = create_set(train_names,features_set1)
classifier = nltk.NaiveBayesClassifier.train(create_set(train_names,features_set1))
print ("Accuracy dev-test data: ", nltk.classify.accuracy(classifier, create_set(dev_test_names,features_set1)))

Accuracy dev-test data:  0.54


2. Run Naive Bayes Classifier for features set 2.

In [414]:
train_set = create_set(train_names,features_set2)
classifier = nltk.NaiveBayesClassifier.train(create_set(train_names,features_set2))
print ("Accuracy dev-test data: ", nltk.classify.accuracy(classifier, create_set(dev_test_names,features_set2)))

Accuracy dev-test data:  0.116


3. Run Naive Bayes Classifier for features set 3.

In [415]:
train_set = create_set(train_names,features_set3)
classifier = nltk.NaiveBayesClassifier.train(create_set(train_names,features_set3))
print ("Accuracy dev-test data: ", nltk.classify.accuracy(classifier, create_set(dev_test_names,features_set3)))

Accuracy dev-test data:  0.608


## Decision Tree Classifier

In [237]:
Decision_Tree = DecisionTreeRegressor(random_state = 0)

## Random Forests