In [1]:
>>> def gender_features(word):
...     return {'last_letter': word[-1]}
>>> gender_features('Shrek')
{'last_letter': 'k'}

{'last_letter': 'k'}

In [2]:
>>> from nltk.corpus import names
>>> labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
... [(name, 'female') for name in names.words('female.txt')])
>>> import random
>>> random.shuffle(labeled_names)

In [4]:
from nltk import *
>>> featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> classifier = NaiveBayesClassifier.train(train_set)

In [5]:
>>> classifier.classify(gender_features('Neo'))


'male'

In [6]:
>>> classifier.classify(gender_features('Trinity'))


'female'

In [8]:
>>> print(classify.accuracy(classifier, test_set))


0.774


In [9]:
>>> classifier.show_most_informative_features(5)


Most Informative Features
             last_letter = 'a'            female : male   =     34.4 : 1.0
             last_letter = 'k'              male : female =     31.1 : 1.0
             last_letter = 'f'              male : female =     13.8 : 1.0
             last_letter = 'p'              male : female =     11.8 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0


In [10]:
>>> from nltk.classify import apply_features
>>> train_set = apply_features(gender_features, labeled_names[500:])
>>> test_set = apply_features(gender_features, labeled_names[:500])

In [11]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [12]:
>>> gender_features2('John') 


{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [16]:
>>> featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
>>> train_set, test_set = featuresets[500:], featuresets[:500]
>>> classifier = NaiveBayesClassifier.train(train_set)
>>> print(classify.accuracy(classifier, test_set))

0.778


In [17]:
>>> train_names = labeled_names[1500:]
>>> devtest_names = labeled_names[500:1500]
>>> test_names = labeled_names[:500]

In [19]:
>>> train_set = [(gender_features(n), gender) for (n, gender) in train_names]
>>> devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
>>> test_set = [(gender_features(n), gender) for (n, gender) in test_names]
>>> classifier = NaiveBayesClassifier.train(train_set)
>>> print(classify.accuracy(classifier, devtest_set))

0.746


In [20]:
>>> errors = []
>>> for (name, tag) in devtest_names:
...     guess = classifier.classify(gender_features(name))
...     if guess != tag:
...         errors.append( (tag, guess, name) )

In [21]:
>>> for (tag, guess, name) in sorted(errors):
...     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Aleen                         
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Allis                         
correct=female   guess=male     name=Allison                       
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Anais                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=April                         
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Arden                         
correct=female   guess=male     name=Arlyn                         
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Austin     

In [22]:
>>> def gender_features(word):
...     return {'suffix1': word[-1:],
...             'suffix2': word[-2:]}

In [24]:
>>> train_set = [(gender_features(n), gender) for (n, gender) in train_names]
>>> devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
>>> classifier = NaiveBayesClassifier.train(train_set)
>>> print(classify.accuracy(classifier, devtest_set))

0.76


In [25]:
>>> from nltk.corpus import movie_reviews
>>> documents = [(list(movie_reviews.words(fileid)), category)
...              for category in movie_reviews.categories()
...              for fileid in movie_reviews.fileids(category)]
>>> random.shuffle(documents)

In [29]:
all_words = FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000] [1]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [30]:
>>> print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 


{'contains(t)': False, 'contains(l)': False, 'contains(g)': False, 'contains(c)': False, 'contains(i)': False, 'contains(n)': False, 'contains(a)': True, 'contains(u)': False}


In [32]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = NaiveBayesClassifier.train(train_set)

In [33]:
>>> print(classify.accuracy(classifier, test_set))

0.56


In [34]:
>>> classifier.show_most_informative_features(5)

Most Informative Features
             contains(t) = False             pos : neg    =      1.5 : 1.0
             contains(c) = True              neg : pos    =      1.4 : 1.0
             contains(i) = False             pos : neg    =      1.2 : 1.0
             contains(u) = True              pos : neg    =      1.2 : 1.0
             contains(g) = True              pos : neg    =      1.2 : 1.0
