In [1]:
import nltk
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Shrek')

{'last_letter': 'k'}

In [4]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo'))

'male'

In [5]:
classifier.classify(gender_features('Trinity'))

'female'

In [6]:
print(nltk.classify.accuracy(classifier, test_set))

0.752


In [7]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.6 : 1.0
             last_letter = 'k'              male : female =     31.2 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     11.8 : 1.0
             last_letter = 'v'              male : female =      9.8 : 1.0


In [8]:
def gender_features(word):
    return {'last_letter': word[-1],
           'length': len(word),
           'first_letter': word[0]}
#nltk structure that acts like list but does not store all objects
#in memory
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.802


In [9]:
#using too many features can lead to overfitting the training data
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [10]:
#error analysis
#devtest set is used to perform error analysis
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.755


In [12]:
errors = []
for (name, tag) in devtest_names:
     guess = classifier.classify(gender_features(name))
     if guess != tag:
         errors.append( (tag, guess, name) )
            
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ardeen                        
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Austin                        
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bev                           
correct=female   guess=male     name=Brandais                      
correct=female   guess=male     name=Brier                         
correct=female   guess=male     name=Brigid                        
correct=female   guess=male     name=Brit                          
correct=female   guess=male     name=Cal                           
correct=female   guess=male     name=Charis                        
correct=female   guess=male     name=Charlot    

In [13]:
'''names ending in yn are predominantly female,
despite names ending in n being prefominantly male'''
def gender_features(word):
     return {'suffix1': word[-1:],
             'suffix2': word[-2:]}
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.778


In [15]:
#Document Classification
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

{'contains(borderlines)': False, 'contains(exotically)': False, 'contains(downcast)': False, 'contains(turn)': False, 'contains(cheating)': False, 'contains(eludes)': False, 'contains(geologist)': False, 'contains(herbal)': False, 'contains(happening)': False, 'contains(infallibility)': False, 'contains(tradiational)': False, 'contains(cares)': False, 'contains(scholl)': False, 'contains(spaceflight)': False, 'contains(orange)': False, 'contains(kaczynski)': False, 'contains(lanced)': False, 'contains(prevert)': False, 'contains(sayles)': False, 'contains(&#)': False, 'contains(collectibles)': False, 'contains(verbalized)': False, 'contains(36th)': False, 'contains(propoganda)': False, 'contains(matalin)': False, 'contains(richness)': False, 'contains(devastation)': False, 'contains(certain)': False, 'contains(roster)': False, 'contains(stonily)': False, 'contains(rossellinia)': False, 'contains(proposing)': False, 'contains(undefeatable)': False, 'contains(synthesizer)': False, 'conta

In [16]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))

0.6


In [17]:
classifier.show_most_informative_features(5)

Most Informative Features
     contains(insulting) = True              neg : pos    =     10.6 : 1.0
    contains(accessible) = True              pos : neg    =      9.7 : 1.0
          contains(sans) = True              neg : pos    =      9.0 : 1.0
      contains(fairness) = True              neg : pos    =      9.0 : 1.0
       contains(detract) = True              pos : neg    =      8.4 : 1.0


In [25]:
#Part of speech tagging
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
     word = word.lower()
     suffix_fdist[word[-1:]] += 1
     suffix_fdist[word[-2:]] += 1
     suffix_fdist[word[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(5)]
print(common_suffixes)

['e', ',', '.', 's', 'd']


In [26]:
def pos_features(word):
     features = {}
     for suffix in common_suffixes:
         features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
     return features
pos_features("England")

{'endswith(,)': False,
 'endswith(.)': False,
 'endswith(d)': True,
 'endswith(e)': False,
 'endswith(s)': False}

In [27]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.3177523620089508

In [28]:
print(classifier.pseudocode(depth=4))

if endswith(,) == False: 
  if endswith(s) == False: 
    if endswith(e) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(e) == True: return 'AT'
  if endswith(s) == True: return 'NNS'
if endswith(,) == True: return ','



In [30]:
#Exploiting Context
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [31]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
     untagged_sent = nltk.tag.untag(tagged_sent)
     for i, (word, tag) in enumerate(tagged_sent):
         featuresets.append( (pos_features(untagged_sent, i), tag) )
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [36]:
#Sequence classification
#Decision of the next tag is based on history of decisions
#Commit to every decision that it makes
def pos_features(sentence, i, history):
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

class ConsecutivePosTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
list(tagger.tag(["I","love","me","some","sunshine"]))

[('I', 'PPSS'),
 ('love', 'VB'),
 ('me', 'PPO'),
 ('some', 'DTI'),
 ('sunshine', 'NN')]

In [37]:
print(tagger.evaluate(test_sents))

0.7980528511821975


In [38]:
'''Hidden markov models - assign scores to all of the possible
sequences of part-of-speech tags and choose the sequence whose
overall score is highest

All possible sequences generate a very large number of permutations,
limit to the last word or n words.

This approach is used on Maximum Entropy Markov Models
and Linear-Chain Conditional Random Field Models
'''

'Hidden markov models - assign scores to all of the possible\nsequences of part-of-speech tags and choose the sequence whose\noverall score is highest\n\nAll possible sequences generate a very large number of permutations,\nlimit to the last word or n words.\n\nThis approach is used on Maximum Entropy Markov Models\nand Linear-Chain Conditional Random Field Models\n'

In [48]:
#Sentence segmentation, can be viewed as a classification task 
#for punctuation.

sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)
    
def punct_features(tokens, i):
    features = {'punct': tokens[i]}
    if i+1 >= len(tokens):
        features['next-word-capitalized'] = False
    else:
        features['next-word-capitalized'] = tokens[i+1][0].isupper()
    if i == 0:
        features['prev-word'] = '<START>'
        features['prev-word-is-one-char'] = False
    else:
        features['prev-word'] = tokens[i-1]
        features['prev-word-is-one-char'] = len(tokens[i-1]) == 1
    return features

featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.9377104377104377

In [49]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

segment_sentences(["Hello","mister",".","How","you","doin","?"])

[['Hello', 'mister', '.'], ['How', 'you', 'doin', '?']]

In [50]:
#Indentifying dialogue act types
#greetings, questions, answers, assertions, and clarifications
#can all be thought of as types of speech-based actions

posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668


In [53]:
classifier.classify(dialogue_act_features("ahah you are funny lol"))

'Statement'

In [54]:
#Recognize textual entailment
#Say if statement H (Hypothesis) is true or false based on text T

def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)

{'at', 'Soviet', 'fight', 'Iran', 'four', 'Asia', 'Davudi', 'operation', 'together', 'Shanghai', 'Parviz', 'republics', 'China', 'that', 'was', 'fledgling', 'SCO', 'central', 'Russia', 'association', 'Organisation', 'representing', 'binds', 'terrorism.', 'Co', 'former', 'meeting'}


In [55]:
print(extractor.hyp_words)

{'China', 'member', 'SCO.'}


In [56]:
print(extractor.overlap('word'))

set()


In [57]:
print(extractor.overlap('ne'))

{'China'}


In [58]:
print(extractor.hyp_extra('word'))

{'member'}


In [60]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)
print(entropy(['male', 'male', 'male', 'male'])) 

-0.0


In [61]:
print(entropy(['female', 'female', 'female', 'female'])) 

-0.0


In [62]:
print(entropy(['female', 'male', 'female', 'male']))

1.0
