In [69]:
def gender_features(word):
...     return {'last_letter': word[-1]}

In [70]:
gender_features('Shrek')

{'last_letter': 'k'}

In [71]:
import nltk
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [72]:
import random
random.shuffle(labeled_names)

In [73]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [74]:
classifier.classify(gender_features('Neo'))

'male'

In [75]:
classifier.classify(gender_features('Trinity'))

'female'

In [76]:
print(nltk.classify.accuracy(classifier, test_set))

0.758


In [77]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     34.4 : 1.0
             last_letter = u'k'             male : female =     32.3 : 1.0
             last_letter = u'f'             male : female =     16.0 : 1.0
             last_letter = u'p'             male : female =     11.9 : 1.0
             last_letter = u'd'             male : female =     10.7 : 1.0


In [78]:
def gender_features2(word):
...     return {'last_letter': word[-1], 'length_name': len(word)}

In [79]:
gender_features2('Shrek')

{'last_letter': 'k', 'length_name': 5}

In [80]:
labeled_names2 = ([(name, 'male', len(name)) for name in names.words('male.txt')] + [(name, 'female', len(name)) for name in names.words('female.txt')])

random.shuffle(labeled_names2)

featuresets2 = [(gender_features2(n), gender, length) for (n, gender, length) in labeled_names2]

from nltk.classify import apply_features

train_set2 = apply_features(gender_features2, labeled_names2[500:])
test_set2 = apply_features(gender_features2, labeled_names2[:500])

classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

In [81]:
gender_features2('Shrek')

{'last_letter': 'k', 'length_name': 5}

In [82]:
classifier2.classify(gender_features2('Neo'))

'male'

In [83]:
classifier2.show_most_informative_features(50)

Most Informative Features
             last_letter = u'a'           female : male   =     34.1 : 1.0
             last_letter = u'k'             male : female =     32.8 : 1.0
             last_letter = u'f'             male : female =     27.7 : 1.0
             last_letter = u'p'             male : female =     11.9 : 1.0
             last_letter = u'v'             male : female =     10.5 : 1.0
             last_letter = u'd'             male : female =      9.6 : 1.0
             last_letter = u'o'             male : female =      8.8 : 1.0
             last_letter = u'm'             male : female =      8.0 : 1.0
             last_letter = u'r'             male : female =      6.7 : 1.0
             last_letter = u'w'             male : female =      6.6 : 1.0
             last_letter = u'u'             male : female =      4.7 : 1.0
             last_letter = u'g'             male : female =      4.6 : 1.0
             last_letter = u's'             male : female =      4.1 : 1.0

In [84]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [85]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.774


In [86]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [87]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.755


In [88]:
errors = []
for (name, tag) in devtest_names:
     guess = classifier.classify(gender_features(name))
     if guess != tag:
         errors.append( (tag, guess, name) )

In [89]:
for (tag, guess, name) in sorted(errors):
     print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Adelind                       
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Annabell                      
correct=female   guess=male     name=Avis                          
correct=female   guess=male     name=Beryl                         
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Carol-Jean                    
correct=female   guess=male     name=Cathyleen                     
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Chandal                       
correct=female   guess=male     name=Charil     

In [90]:
def gender_features(word):
     return {'suffix1': word[-1:],
             'suffix2': word[-2:]}

In [91]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.79


In [92]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [93]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [94]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 



In [95]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [96]:
print(nltk.classify.accuracy(classifier, test_set))

0.62


In [97]:
classifier.show_most_informative_features(15)

Most Informative Features
          contains(sans) = True              neg : pos    =      8.2 : 1.0
    contains(cronenberg) = True              pos : neg    =      7.1 : 1.0
    contains(mediocrity) = True              neg : pos    =      6.9 : 1.0
   contains(overwhelmed) = True              pos : neg    =      6.4 : 1.0
   contains(bruckheimer) = True              neg : pos    =      6.2 : 1.0
         contains(wires) = True              neg : pos    =      6.2 : 1.0
        contains(doubts) = True              pos : neg    =      5.9 : 1.0
          contains(wits) = True              pos : neg    =      5.8 : 1.0
       contains(topping) = True              pos : neg    =      5.8 : 1.0
        contains(fabric) = True              pos : neg    =      5.8 : 1.0
          contains(lang) = True              pos : neg    =      5.8 : 1.0
           contains(ugh) = True              neg : pos    =      5.7 : 1.0
     contains(uplifting) = True              pos : neg    =      5.7 : 1.0

In [98]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
     word = word.lower()
     suffix_fdist[word[-1:]] += 1
     suffix_fdist[word[-2:]] += 1
     suffix_fdist[word[-3:]] += 1

In [99]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

[u'e', u',', u'.', u's', u'd', u't', u'he', u'n', u'a', u'of', u'the', u'y', u'r', u'to', u'in', u'f', u'o', u'ed', u'nd', u'is', u'on', u'l', u'g', u'and', u'ng', u'er', u'as', u'ing', u'h', u'at', u'es', u'or', u're', u'it', u'``', u'an', u"''", u'm', u';', u'i', u'ly', u'ion', u'en', u'al', u'?', u'nt', u'be', u'hat', u'st', u'his', u'th', u'll', u'le', u'ce', u'by', u'ts', u'me', u've', u"'", u'se', u'ut', u'was', u'for', u'ent', u'ch', u'k', u'w', u'ld', u'`', u'rs', u'ted', u'ere', u'her', u'ne', u'ns', u'ith', u'ad', u'ry', u')', u'(', u'te', u'--', u'ay', u'ty', u'ot', u'p', u'nce', u"'s", u'ter', u'om', u'ss', u':', u'we', u'are', u'c', u'ers', u'uld', u'had', u'so', u'ey']


In [100]:
def pos_features(word):
...     features = {}
...     for suffix in common_suffixes:
...         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
...     return features

In [101]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [102]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [103]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [104]:
classifier.classify(pos_features('cats'))

u'NNS'

In [105]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return u'.'
      if endswith(.) == True: return u'.'
    if endswith(s) == True: 
      if endswith(is) == False: return u'PP$'
      if endswith(is) == True: return u'BEZ'
  if endswith(,) == True: return u','
if endswith(the) == True: return u'AT'



In [106]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [107]:
pos_features(brown.sents()[0], 8)

{'prev-word': u'an',
 'suffix(1)': u'n',
 'suffix(2)': u'on',
 'suffix(3)': u'ion'}

In [108]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
     untagged_sent = nltk.tag.untag(tagged_sent)
     for i, (word, tag) in enumerate(tagged_sent):
         featuresets.append( (pos_features(untagged_sent, i), tag) )

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [109]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [110]:
def pos_features(sentence, i, history):
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

In [111]:
class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [112]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.798052851182


In [113]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [114]:
def punct_features(tokens, i):
...     return {'next-word-capitalized': tokens[i+1][0].isupper(),
...             'prev-word': tokens[i-1].lower(),
...             'punct': tokens[i],
...             'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [115]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
...                for i in range(1, len(tokens)-1)
...                if tokens[i] in '.?!']

In [116]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [117]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [118]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [119]:
def dialogue_act_features(post):
...     features = {}
...     for word in nltk.word_tokenize(post):
...         features['contains({})'.format(word.lower())] = True
...     return features

In [120]:
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668


In [121]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [147]:
from nltk.corpus import rte

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)

set([''])


In [148]:
print(extractor.hyp_words)

set([''])


In [149]:
print(extractor.overlap('word'))

set([''])


In [150]:
print(extractor.overlap('ne'))

set([])


In [151]:
print(extractor.hyp_extra('word'))

set([])


In [152]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [153]:
import random
from nltk.corpus import brown
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [160]:
file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1)
train_set = brown.tagged_sents(file_ids[size:])
test_set = brown.tagged_sents(file_ids[:size])

In [177]:
train_set = brown.tagged_sents(categories='news')[:2]
test_set = brown.tagged_sents(categories='fiction')[:2]

In [183]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print('Accuracy: {:4.2f}'.format(nltk.classify.accuracy(classifier, test_set))) 

ValueError: too many values to unpack

In [162]:
brown.tagged_sents(categories='news')

[[(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u"Atlanta's", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u"''", u"''"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')], [(u'The', u'AT'), (u'jury', u'NN'), (u'further', u'RBR'), (u'said', u'VBD'), (u'in', u'IN'), (u'term-end', u'NN'), (u'presentments', u'NNS'), (u'that', u'CS'), (u'the', u'AT'), (u'City', u'NN-TL'), (u'Executive', u'JJ-TL'), (u'Committee', u'NN-TL'), (u',', u','), (u'which', u'WDT'), (u'had', u'HVD'), (u'over-all', u'JJ'), (u'charge', u'NN'), (u'of', u'IN'), (u'the', u'AT'), (u'election', u'NN'), (u',', u','), (u'``', u'``'), (u'deserves', u'VBZ'), (u'the', u'AT'), (u'praise', u'NN'), (u'and'

In [182]:
brown.tagged_sents(categories='fiction')[:2]

[[(u'Thirty-three', u'CD-HL')],
 [(u'Scotty', u'NP'),
  (u'did', u'DOD'),
  (u'not', u'*'),
  (u'go', u'VB'),
  (u'back', u'RB'),
  (u'to', u'IN'),
  (u'school', u'NN'),
  (u'.', u'.')]]

In [184]:
def tag_list(tagged_sents):
...     return [tag for sent in tagged_sents for (word, tag) in sent]

In [185]:
def apply_tagger(tagger, corpus):
...     return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [186]:
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
cm = nltk.ConfusionMatrix(gold, test)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

NameError: name 't2' is not defined

In [187]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [188]:
print(entropy(['male', 'male', 'male', 'male'])) 

-0.0


In [189]:
print(entropy(['male', 'female', 'male', 'male']))

0.811278124459


In [190]:
print(entropy(['female', 'male', 'female', 'male']))

1.0


In [191]:
print(entropy(['female', 'female', 'male', 'female']))

0.811278124459


In [192]:
print(entropy(['female', 'female', 'female', 'female'])) 

-0.0
