Gender Name Classification

# Load the libraries

In [93]:
import pandas as pd
import nltk
from nltk.corpus import names
from nltk.metrics.scores import (precision, recall)
import random
import numpy as np

# Load the whole data

In [94]:
names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])

Shuffle the rows

In [95]:
random.seed(123)
random.shuffle(names)

Print first few rows

In [96]:
names[1:10]

[('Peggie', 'female'),
 ('Solange', 'female'),
 ('Rana', 'female'),
 ('Jessy', 'female'),
 ('Lelia', 'female'),
 ('Dorothy', 'female'),
 ('Ulrick', 'male'),
 ('Roshelle', 'female'),
 ('Caitrin', 'female')]

# Split the data set into train test set, test-dev set and training set

In [97]:
len(names)

7944

In [98]:
test_set = names[0:500]
dev_test_set = names[501:1001]
training_set = names[1001:]

In [99]:
len(test_set)

500

In [100]:
len(dev_test_set)

500

In [101]:
len(training_set)

6943

# Example Name Gender Classifier

In [102]:
def gender_features(word):
    return {'name_length':len(word),'first_letter':word[1],'last_letter': word[-1]}

In [103]:
featuresets = [(gender_features(n), g) for (n,g) in names]

In [104]:
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]

In [105]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, dev_test_set))

0.774


Accuracy with only name length, first letter and last letter is 0.774.

In [106]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.3 : 1.0
             last_letter = 'k'              male : female =     29.2 : 1.0
             last_letter = 'p'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     15.2 : 1.0
             last_letter = 'v'              male : female =      9.8 : 1.0


In [107]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s) "%letter] = name.lower().count(letter)
        features["has(%s)"%letter] = (letter in name.lower())
    return features

In [108]:
dev_test_set[1:10]

[({'name_length': 8, 'first_letter': 'e', 'last_letter': 'd'}, 'male'),
 ({'name_length': 6, 'first_letter': 'l', 'last_letter': 'e'}, 'female'),
 ({'name_length': 7, 'first_letter': 'n', 'last_letter': 'l'}, 'female'),
 ({'name_length': 8, 'first_letter': 'a', 'last_letter': 'e'}, 'male'),
 ({'name_length': 5, 'first_letter': 'a', 'last_letter': 'y'}, 'female'),
 ({'name_length': 3, 'first_letter': 'a', 'last_letter': 'm'}, 'female'),
 ({'name_length': 9, 'first_letter': 'a', 'last_letter': 'e'}, 'female'),
 ({'name_length': 5, 'first_letter': 'e', 'last_letter': 'a'}, 'female'),
 ({'name_length': 6, 'first_letter': 'u', 'last_letter': 'n'}, 'male')]

In [109]:
featuresets = [(gender_features2(n), g) for (n,g) in names]
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print( nltk.classify.accuracy(classifier, dev_test_set))

0.784


In [110]:
def gender_features(word):
    return {'suffix1': word[-1:],'suffix2': word[-2:]}

In [111]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

In [112]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, devtest_set))

0.791


 # Incremental improvements.

## Naive Bayes Classifiers

In [113]:
### With first last letter prefix and suffix

In [114]:
def naive_features1(name):
    features = {}
    name_holder = name
    features["first_letter"] = name[0].lower() 
    features["last_letter"] = name[-1].lower() 
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() 
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()
    return features

In [115]:
featuresets = [(naive_features1(n), g) for (n,g) in names]
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print( nltk.classify.accuracy(classifier, dev_test_set))

0.834


### Adding last vowels with previous model


In [116]:
def naive_features2(name):
    features = {}
    name_holder = name
    features["first_letter"] = name[0].lower() 
    features["last_letter"] = name[-1].lower() 
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() 
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()
    features['last_vowel']= (name[-1] in 'aeiou')

    return features

In [117]:
featuresets = [(naive_features2(n), g) for (n,g) in names]
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print( nltk.classify.accuracy(classifier, dev_test_set))

0.828


Accuracy with first letter, last letter, prefix , suffix and last vowel is 0.828 which is less than the previous naive bayes classifier model

Last vowel doesn't help much .

### Taking clustering into account

In [118]:
def naive_features3(name):
    features = {}
    name_holder = name
    eng_constants = ["bl", "br", "ch", "cl", "cr", "dr", "fl", "fr", "gl", "gr", "pl", "pr", "sc", "sh", "sk", "sl", "sm", "sn", "sp", "st", "sw", "th", "tr", "tw", "wh", "wr", "sch", "scr", "shr", "sph", "spl", "spr", "squ", "str", "thr"]
    features["first_letter"] = name[0].lower() 
    features["last_letter"] = name[-1].lower() 
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() 
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()
    clusters = []
    for cluster in eng_constants[::-1]:
        if cluster in name_holder:
            name_holder = name_holder.replace(cluster, "")
            clusters.append(cluster)
    features["clusters1"] = clusters[0] if len(clusters) > 0 else None
    features["clusters2"] = clusters[1] if len(clusters) > 1 else None
    features["clusters3"] = clusters[2] if len(clusters) > 2 else None
    return features

In [119]:
featuresets = [(naive_features3(n), g) for (n,g) in names]
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print( nltk.classify.accuracy(classifier, dev_test_set))

0.836


Accuracy with first letter, last letter, prefix , suffix and constant clustering is 0.836 which is higher than the previous naive bayes classifier model

We'll use constant clustering in the final model as a feature

## Adding number of syllabal feature

In [120]:
def count_syllable(word):
    word = word.lower()
    vowels = 'aeiouy'

    count = 0
    if word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i-1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    elif word.endswith('le'):
        count += 1
    elif word.endswith('bile'):
        count -= 1
    elif count == 0:
        count += 1
    return count

In [121]:
def naive_features4(name):
    features = {}
    name_holder = name
    eng_constants = ["bl", "br", "ch", "cl", "cr", "dr", "fl", "fr", "gl", "gr", "pl", "pr", "sc", "sh", "sk", "sl", "sm", "sn", "sp", "st", "sw", "th", "tr", "tw", "wh", "wr", "sch", "scr", "shr", "sph", "spl", "spr", "squ", "str", "thr"]
    features["first_letter"] = name[0].lower() 
    features["last_letter"] = name[-1].lower() 
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() 
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()
    clusters = []
    for cluster in eng_constants[::-1]:
        if cluster in name_holder:
            name_holder = name_holder.replace(cluster, "")
            clusters.append(cluster)
    features["clusters1"] = clusters[0] if len(clusters) > 0 else None
    features["clusters2"] = clusters[1] if len(clusters) > 1 else None
    features["clusters3"] = clusters[2] if len(clusters) > 2 else None
    features['syllable_count'] = count_syllable(name)
    return features

In [122]:
featuresets = [(naive_features4(n), g) for (n,g) in names]
test_set = featuresets[0:500]
dev_test_set = featuresets[501:1001]
train_set = featuresets[1001:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print( nltk.classify.accuracy(classifier, dev_test_set))

0.842


### Most informative features

In [123]:
classifier.most_informative_features(10)

[('last_letter', 'a'),
 ('last_letter', 'k'),
 ('suffix', 'ard'),
 ('suffix', 'tta'),
 ('suffix', 'ita'),
 ('suffix', 'nne'),
 ('last_letter', 'p'),
 ('suffix', 'old'),
 ('last_letter', 'f'),
 ('suffix', 'na')]

Counting syllable improves accuracy. So we'll consider number of syllable a important feature.

Now I'm satisifed with the accuracy of the model. The features i've selected are first_letter, last_letter, prefix, suffix, clustering of constants, syllable count

### Test accuracy

In [125]:
print( nltk.classify.accuracy(classifier, test_set))

0.842


# How does the performance on the test set compare to the performance on the dev-test set? Is this what you'd expect? 

The accuracy of test and training set are same. The model performs same on the dev-test and test set. We can say that the model is a good fitted model.
This is exactly what I've expected.