In [1]:
import nltk

In [3]:
def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: A non-empty string. An empty string raises IndexError
            because the last character is indexed directly.

    Returns:
        A dict with the single key 'last_letter'.
    """
    return {'last_letter': word[-1]}


gender_features('Shrek')

{'last_letter': 'k'}

In [4]:
from nltk.corpus import names
import random

# One (name, label) pair per entry of the two corpus word lists.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)
# Shuffle in place so the index-based splits taken later mix both labels.
# NOTE(review): no seed is set in this cell, so the order (and every score
# derived from it) changes from run to run.
random.shuffle(labeled_names)

In [5]:
# Pair each name's feature dict with its label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
# The first 500 pairs are held out for testing; the remainder trains the model.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [6]:
# Classify the name 'Neo' using the features returned by gender_features.
classifier.classify(gender_features('Neo'))


'male'

In [7]:
# Classify the name 'Trinity' using the features returned by gender_features.
classifier.classify(gender_features('Trinity'))


'female'

In [8]:
# Score the classifier on test_set, the 500 pairs excluded from training.
print(nltk.classify.accuracy(classifier, test_set))


0.778


In [9]:
# Print the classifier's five most informative features.
classifier.show_most_informative_features(5)


Most Informative Features
             last_letter = 'a'            female : male   =     35.7 : 1.0
             last_letter = 'k'              male : female =     33.3 : 1.0
             last_letter = 'f'              male : female =     15.3 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


In [10]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: A non-empty string. An empty string raises IndexError
            because the first and last characters are indexed directly.

    Returns:
        A dict with 'first_letter' and 'last_letter' (lower-cased), plus
        'count(x)' (int) and 'has(x)' (bool) for every letter a-z.
    """
    # Lower-case once instead of on every loop iteration.
    lowered = name.lower()
    features = {
        "first_letter": lowered[0],
        "last_letter": lowered[-1],
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

In [11]:
# Rebuild the labelled feature sets with the richer gender_features2 extractor.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
# Same split as before: first 500 pairs for testing, the rest for training.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [12]:
# Three disjoint slices of the shuffled names, listed in index order:
#   [0, 500)     -> final test
#   [500, 1500)  -> dev-test, used for error analysis
#   [1500, end)  -> training
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [13]:
# Featurise each split with the same extractor.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
# Train on the training split only, then score on the dev-test split.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.739


In [14]:
# Collect every dev-test name whose predicted label differs from its true
# label, stored as (true label, predicted label, name).
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

In [16]:
# Print the misclassified names, ordered by (true label, guess, name).
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abagael                       
correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Annabell                      
correct=female   guess=male     name=Arabel                        
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Brenn                         
correct=female   guess=male     name=Bryn                          
correct=female   guess=male     name=Caril                         
correct=female   guess=male     name=Carmen                        
correct=female   guess=male     name=Carroll                       
correct=female   guess=male     name=Charmion   

In [17]:
def gender_features(word):
    """Return the one- and two-character suffixes of a name as features.

    This redefines the earlier gender_features; cells run after this one
    use this version.

    Args:
        word: Any string. Slicing is used, so short or empty strings do
            not raise; the suffixes are simply shorter.

    Returns:
        A dict with keys 'suffix1' and 'suffix2'.
    """
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [18]:
# Re-featurise the training and dev-test splits with the current gender_features.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
# Retrain and score on the dev-test split.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.763


In [19]:
from nltk.corpus import movie_reviews

# One (word list, category) pair per corpus file, then shuffled in place.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [20]:
# Frequency of every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 2000 most frequent words. Plain list(all_words) follows the
# distribution's iteration order, which is not sorted by count, so slicing
# it would not select the most frequent words.
word_features = [word for (word, _count) in all_words.most_common(2000)]

def document_features(document):
    """Map each word in word_features to whether it occurs in the document.

    Args:
        document: An iterable of word tokens.

    Returns:
        A dict with one 'contains(<word>)' boolean per entry of the
        module-level word_features list, in that list's order.
    """
    # A set gives constant-time membership tests for the lookups below.
    present = set(document)
    return {
        'contains({})'.format(term): (term in present)
        for term in word_features
    }

In [21]:
# Show the feature dict produced for a single review file.
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))




In [None]:
# Featurise every (document, category) pair.
featuresets = [(document_features(doc), category) for doc, category in documents]
# First 100 pairs are held out for testing; the rest train the classifier.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Score the classifier on the 100 held-out documents.
print(nltk.classify.accuracy(classifier, test_set))


In [None]:
# Print the classifier's five most informative features.
classifier.show_most_informative_features(5)


In [None]:
from nltk.corpus import brown

# Count the 1-, 2- and 3-character suffixes of every lower-cased word.
# Words shorter than a suffix length contribute the whole word again for
# that length, because the slice simply returns what is available.
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    for length in (1, 2, 3):
        suffix_fdist[word[-length:]] += 1

In [None]:
# The 100 most frequent suffixes, most frequent first.
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

In [None]:
def pos_features(word, suffixes=None):
    """Describe a word by which candidate suffixes it ends with.

    Args:
        word: The word to featurise; compared case-insensitively by
            lower-casing it first.
        suffixes: Optional iterable of suffix strings to test. Defaults
            to the module-level common_suffixes list.

    Returns:
        A dict with one 'endswith(<suffix>)' boolean per suffix.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lower-case once instead of once per suffix.
    lowered = word.lower()
    return {
        'endswith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in suffixes
    }

In [None]:
# (word, tag) pairs from the 'news' category, featurised by suffix.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [None]:
# Hold out the first 10% of the feature sets for testing.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [None]:
# Train a decision tree on the training split; the last expression displays
# its accuracy on the held-out split.
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

In [None]:
# Predict a tag for the word 'cats' from its suffix features.
classifier.classify(pos_features('cats'))