In [1]:
from nltk.corpus import names
import random

In [2]:
len(names.words('male.txt'))

2943

In [3]:
len(names.words('female.txt'))

5001

In [4]:
male=names.words('male.txt')
female=names.words('female.txt')

In [5]:
male[:5]

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot']

In [6]:
female[:5]

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi']

Gender Identification
The first step in creating a classifier is deciding what features of the input are relevant,
and how to encode those features.Names ending in a, e, and i are likely to be female, while names ending in k, o, r, s, andt are likely to be male.

In [7]:
def gender_feature(word):
    return {'last_letter':word[-1]}

In [8]:
gender_feature('prashant')

{'last_letter': 't'}

In [9]:
random.shuffle(male)

In [10]:
random.shuffle(female)

In [11]:
male[:5]

['Warren', 'Terry', 'Maury', 'Basil', 'Travers']

In [12]:
female[:5]

['Brunhilde', 'Nova', 'Claudina', 'Lise', 'Ricky']

In [13]:
l1=[(name, 'male') for name in names.words('male.txt')]

In [14]:
l1[:5]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male')]

In [15]:
names=([(name, 'male') for name in names.words('male.txt')] +[(name, 'female') for name in names.words('female.txt')])

In [16]:
names[:5]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male')]

In [17]:
random.shuffle(names)

In [18]:
names[:5]

[('Verena', 'female'),
 ('Rollo', 'male'),
 ('Elnora', 'female'),
 ('Dimitris', 'male'),
 ('Ruthann', 'female')]

In [19]:
feature_set=[(gender_feature(n),g) for (n,g) in names]

In [20]:
feature_set

[({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'i'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'k'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'g'}, 'male'),
 ({'last_letter': 'd'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_

In [21]:
len(feature_set)

7944

In [22]:
train_set, test_set = feature_set[500:], feature_set[:500]

In [23]:
import nltk

In [24]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [25]:
classifier.classify(gender_feature('Prashant'))

'male'

In [26]:
classifier.classify(gender_feature('Yash'))

'female'

In [27]:
classifier.classify(gender_feature('Dhruv'))

'male'

In [28]:
classifier.classify(gender_feature('Alok'))

'male'

In [29]:
classifier.classify(gender_feature('Sohan'))

'male'

In [30]:
print(nltk.classify.accuracy(classifier, test_set))

0.74


In [32]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     36.7 : 1.0
             last_letter = 'k'              male : female =     32.5 : 1.0
             last_letter = 'p'              male : female =     18.8 : 1.0
             last_letter = 'f'              male : female =     17.5 : 1.0
             last_letter = 'v'              male : female =     11.3 : 1.0


Selecting relevant features and deciding how to encode them for a learning method can
have an enormous impact on the learning method’s ability to extract a good model.

Once an initial set of features has been chosen, a very productive method for refining
the feature set is error analysis. First, we select a development set, containing the
corpus data for creating the model. This development set is then subdivided into the
training set and the dev-test set.

In [33]:
names[:5]

[('Verena', 'female'),
 ('Rollo', 'male'),
 ('Elnora', 'female'),
 ('Dimitris', 'male'),
 ('Ruthann', 'female')]

In [34]:
len(names)

7944

In [35]:
random.shuffle(names)

In [69]:
train=names[2000:]
dev_test=names[1000:2000]
test=names[0:1000]

In [37]:
train_set=[(gender_feature(n),g) for (n,g) in train]
dev_set=[(gender_feature(n),g) for (n,g) in dev_test]
test=[(gender_feature(n),g) for (n,g) in test ]

In [41]:
classifier=nltk.NaiveBayesClassifier.train(train_set)

In [43]:
print(nltk.classify.accuracy(classifier,dev_set))

0.763


Using the dev_set, we can generate a list of the errors that the classifier makes when
predicting name genders

In [54]:
error=[]
for (name, tag) in dev_test:
    guess=classifier.classify(gender_feature(name))
    if tag!=guess:
        error.append((tag,guess,name))

In [56]:
len(error)

237

In [58]:
error

[('male', 'female', 'Boniface'),
 ('male', 'female', 'Carleigh'),
 ('female', 'male', 'Wren'),
 ('male', 'female', 'Jule'),
 ('female', 'male', 'Lauryn'),
 ('female', 'male', 'Jessalyn'),
 ('female', 'male', 'Melicent'),
 ('male', 'female', 'Sky'),
 ('female', 'male', 'Thomasin'),
 ('female', 'male', 'Bridget'),
 ('male', 'female', 'Jimmy'),
 ('male', 'female', 'Terence'),
 ('male', 'female', 'Ferdy'),
 ('male', 'female', 'Tedie'),
 ('male', 'female', 'Edie'),
 ('male', 'female', 'Ambrose'),
 ('male', 'female', 'Perceval'),
 ('male', 'female', 'Mika'),
 ('female', 'male', 'Alix'),
 ('male', 'female', 'Karel'),
 ('male', 'female', 'Toby'),
 ('female', 'male', 'Rosaleen'),
 ('male', 'female', 'Bucky'),
 ('male', 'female', 'Anatole'),
 ('female', 'male', 'Katheleen'),
 ('male', 'female', 'Toddy'),
 ('female', 'male', 'Doloritas'),
 ('male', 'female', 'Reube'),
 ('female', 'male', 'Greer'),
 ('male', 'female', 'Arel'),
 ('male', 'female', 'Shea'),
 ('female', 'male', 'Joleen'),
 ('female',

Looking through this list of errors makes it clear that some suffixes that are more than
one letter can be indicative of name genders. For example, names ending in yn appear
to be predominantly female, despite the fact that names ending in n tend to be male;
and names ending in ch are usually male, even though names that end in h tend to be
female.

In [61]:
def gender_feature(word):
    return {'suffix1':word[-1:],
            'suffix2':word[-2:]}

In [62]:
train_set=[(gender_feature(n),g) for (n,g) in train]
dev_set=[(gender_feature(n),g) for (n,g) in dev_test]

In [64]:
classifier=nltk.NaiveBayesClassifier.train(train_set)

In [67]:
print(nltk.classify.accuracy(classifier,dev_set))

0.779


This error analysis procedure can then be repeated, checking for patterns in the errors
that are made by the newly improved classifier

In [70]:
test_set=[(gender_feature(n),g) for (n,g) in test ]

In [71]:
print(nltk.classify.accuracy(classifier,test_set))

0.783


It's giving us a pretty good result , we can also modify  our gender_feature to look for more patterns in error 

In [72]:
classifier.classify(gender_feature('Yash'))

'male'

In [73]:
classifier.classify(gender_feature('Dhruv'))

'male'

In [74]:
classifier.classify(gender_feature('Prashant'))

'male'

In [76]:
classifier.classify(gender_feature('Arshad'))

'male'

In [77]:
classifier.classify(gender_feature('Ankita'))

'female'

In [78]:
classifier.classify(gender_feature('sheetal'))

'female'

In [79]:
classifier.classify(gender_feature('pavan'))

'male'

In [81]:
classifier.classify(gender_feature('pradeep'))

'male'

In [82]:
classifier.classify(gender_feature('anjali'))

'female'