# gender classifier

In [6]:
from __future__ import print_function
import nltk
from nltk.corpus import brown 

In [1]:
def gender_features(word): 
        return {'last_letter': word[-1]} 

gender_features('Shrek') 

{'last_letter': 'k'}

In [2]:
from nltk.corpus import names
import random

names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) 
random.shuffle(names) 
featuresets = [(gender_features(n), g) for (n,g) in names] 
train_set, test_set = featuresets[500:], featuresets[:500]  

# Naive Bayes Classifier

In [5]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Neo')) 

'male'

In [6]:
classifier.classify(gender_features('Trinity')) 

'female'

In [9]:
print( nltk.classify.accuracy(classifier, test_set))

0.762


In [10]:
classifier.show_most_informative_features(5) 

Most Informative Features
             last_letter = 'a'            female : male   =     34.2 : 1.0
             last_letter = 'k'              male : female =     31.4 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0


In [12]:
from nltk.classify import apply_features 
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])
train_set

[({'last_letter': 'l'}, 'male'), ({'last_letter': 'y'}, 'male'), ...]

In [14]:
def gender_features2(name): 
    features = {} 
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower() 
    for letter in 'abcdefghijklmnopqrstuvwxyz': 
        features["count(%s)" % letter] = name.lower().count(letter) 
        features["has(%s)" % letter] = (letter in name.lower()) 
    return features  

gender_features2('John') 

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'firstletter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'lastletter': 'n'}

In [16]:
featuresets = [(gender_features2(n), g) for (n,g) in names] 
train_set, test_set = featuresets[500:], featuresets[:500] 

classifier = nltk.NaiveBayesClassifier.train(train_set)  
print(nltk.classify.accuracy(classifier, test_set))

0.79


In [17]:
train_names = names[1500:]
devtest_names = names[500:1500] 
test_names = names[:500] 

train_set = [(gender_features(n), g) for (n,g) in train_names] 
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names] 
test_set = [(gender_features(n), g) for (n,g) in test_names] 

classifier = nltk.NaiveBayesClassifier.train(train_set) 
print(nltk.classify.accuracy(classifier, devtest_set))

0.762


In [19]:
errors = [] 
for (name, tag) in devtest_names: 
    guess = classifier.classify(gender_features(name)) 
    if guess != tag: 
        errors.append( (tag, guess, name) ) 
        
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

correct=female   guess=male     name=Abagail                       
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Aeriel                        
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Averil                        
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Bel                           
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bev                           
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brandais   

### new feature----name with last 2 letters

In [20]:
def gender_features(word): 
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}

train_set = [(gender_features(n), g) for (n,g) in train_names] 
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names] 

classifier = nltk.NaiveBayesClassifier.train(train_set) 
print(nltk.classify.accuracy(classifier, devtest_set))

0.788


# document classification

In [21]:
from nltk.corpus import movie_reviews 
documents = [(list(movie_reviews.words(fileid)), category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)] 

random.shuffle(documents) 

In [23]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) 
word_features = list(all_words.keys())[:2000] 

def document_features(document): 
    document_words = set(document) 
    features = {} 
    for word in word_features: 
        features['contains(%s)' % word] = (word in document_words) 
        
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))



In [24]:
featuresets = [(document_features(d), c) for (d,c) in documents] 
train_set, test_set = featuresets[100:], featuresets[:100] 
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print(nltk.classify.accuracy(classifier, test_set))

0.64


In [25]:
classifier.show_most_informative_features(5) 

Most Informative Features
  contains(refreshingly) = True              pos : neg    =      8.4 : 1.0
    contains(weaknesses) = True              pos : neg    =      8.4 : 1.0
        contains(denial) = True              pos : neg    =      7.7 : 1.0
          contains(moss) = True              pos : neg    =      7.0 : 1.0
       contains(insipid) = True              neg : pos    =      6.6 : 1.0


In [30]:
from nltk.corpus import brown 
suffix_fdist = nltk.FreqDist() 
for word in brown.words(): 
    word = word.lower() 
    suffix_fdist[word[-1:]] 
    suffix_fdist[word[-2:]]
    suffix_fdist[word[-3:]]
    
common_suffixes = list(suffix_fdist.keys())[:100] 
print(common_suffixes)

[]


In [31]:
def pos_features(word):
    features = {} 
    for suffix in common_suffixes: 
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix) 
    return features 

# decision tree classifier

In [32]:
tagged_words = brown.tagged_words(categories='news') 
featuresets = [(pos_features(n), g) for (n,g) in tagged_words] 
size = int(len(featuresets) * 0.1) 
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.DecisionTreeClassifier.train(train_set) 
nltk.classify.accuracy(classifier, test_set) 

0.14221780208851317

In [33]:
classifier.classify(pos_features('cats')) 

'NN'

In [34]:
 print(classifier.pseudocode(depth=4))

return 'NN'



# context features classifier

In [35]:
def pos_features(sentence, i): 
    features = {"suffix(1)": sentence[i][-1:], 
                "suffix(2)": sentence[i][-2:], 
                "suffix(3)": sentence[i][-3:]} 
    if i == 0: 
        features["prev-word"] = "<START>" 
    else: 
        features["prev-word"] = sentence[i-1]
    return features 

pos_features(brown.sents()[0], 8) 

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [36]:
tagged_sents = brown.tagged_sents(categories='news') 
featuresets = []  
for tagged_sent in tagged_sents: 
    untagged_sent = nltk.tag.untag(tagged_sent) 
    for i, (word, tag) in enumerate(tagged_sent):
         featuresets.append((pos_features(untagged_sent, i), tag)) 
            
size = int(len(featuresets) * 0.1) 
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set) 
nltk.classify.accuracy(classifier, test_set) 

0.7891596220785678

# sequential classifier

In [38]:
def pos_features(sentence, i, history): 
    features = {"suffix(1)": sentence[i][-1:], 
                "suffix(2)": sentence[i][-2:], 
                "suffix(3)": sentence[i][-3:]}
    if i == 0: 
        features["prev-word"] = "<START>" 
        features["prev-tag"] = "<START>" 
    else: 
        features["prev-word"] = sentence[i-1] 
        features["prev-tag"] = history[i-1] 
    return features 

class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents): 
        train_set = [] 
        for tagged_sent in train_sents: 
            untagged_sent = nltk.tag.untag(tagged_sent) 
            history = [] 
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history) 
                train_set.append((featureset, tag)) 
                history.append(tag) 
        self.classifier = nltk.NaiveBayesClassifier.train(train_set) 
    def tag(self, sentence): 
        history = [] 
        for i, word in enumerate(sentence): 
            featureset = pos_features(sentence, i, history) 
            tag = self.classifier.classify(featureset) 
            history.append(tag)   
        return zip(sentence, history) 

tagged_sents = brown.tagged_sents(categories='news') 
size = int(len(tagged_sents) * 0.1) 
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] 

tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


# sentence segmentation

In [39]:
sents = nltk.corpus.treebank_raw.sents() 
tokens = [] 
boundaries = set() 
offset = 0 
for sent in nltk.corpus.treebank_raw.sents(): 
    tokens.extend(sent) 
    offset += len(sent) 
    boundaries.add(offset-1) 
    
def punct_features(tokens, i): 
    return {'next-word-capitalized': tokens[i+1][0].isupper(), 
                'prevword': tokens[i-1].lower(),
                 'punct': tokens[i], 
                 'prev-word-is-one-char': len(tokens[i-1]) == 1} 

featuresets = [(punct_features(tokens, i), (i in boundaries)) 
               for i in range(1, len(tokens)-1) if tokens[i] in '.?!'] 

size = int(len(featuresets) * 0.1) 
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set) 
nltk.classify.accuracy(classifier, test_set) 

0.936026936026936

In [40]:
def segment_sentences(words): 
    start = 0 
    sents = [] 
    for i, word in words: 
        if word in '.?!' and classifier.classify(words, i) == True: 
            sents.append(words[start:i+1]) 
            start = i+1 
    if start < len(words): 
        sents.append(words[start:])

# dialogue class

In [1]:
import nltk
posts = nltk.corpus.nps_chat.xml_posts()[:10000] 

In [2]:
def dialogue_act_features(post): 
    features = {} 
    for word in nltk.word_tokenize(post): 
        features['contains(%s)' % word.lower()] = True 
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts] 

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size] 
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print(nltk.classify.accuracy(classifier, test_set))

0.668


# Recognizing textual entailment

In [3]:
def rte_features(rtepair): 
    extractor = nltk.RTEFeatureExtractor(rtepair) 
    features = {}
    features['word_overlap'] = len(extractor.overlap('word')) 
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne')) 
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne')) 
    return features 

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33] 
extractor = nltk.RTEFeatureExtractor(rtepair) 
print(extractor.text_words)

LookupError: 
**********************************************************************
  Resource 'corpora/rte' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - 'C:\\Users\\jet/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'd:\\programfiles\\python35\\nltk_data'
    - 'd:\\programfiles\\python35\\lib\\nltk_data'
    - 'C:\\Users\\jet\\AppData\\Roaming\\nltk_data'
**********************************************************************

In [10]:
from nltk.corpus import brown

file_ids = brown.fileids(categories='news')
size = int(len(file_ids) * 0.1) 
train_set = brown.tagged_sents(file_ids[size:]) 
test_set = brown.tagged_sents(file_ids[:size]) 

classifier = nltk.NaiveBayesClassifier.train(train_set)
print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set))

ValueError: too many values to unpack (expected 2)

# confusion matrix

In [24]:
def tag_list(tagged_sents): 
    return [tag for sent in tagged_sents for (word, tag) in sent] 

def apply_tagger(tagger, corpus): 
        return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus] 

from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
fd = nltk.FreqDist(brown.words())
word_list = fd.keys()
word_mapping = dict((w,w) if fd[w] > 3  else (w,'UNK')
                            for w in word_list)
brown_tagged_sent = brown.tagged_sents(categories='news')
training_size = int(len(brown_tagged_sents) * 0.9)
new_brown_tagged_sent = [[(word_mapping[wd],tag) for (wd,tag) in sent] for sent in brown_tagged_sent]
train_sents = new_brown_tagged_sent[:training_size]

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

gold = tag_list(brown.tagged_sents(categories='editorial')) 
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial'))) 
cm = nltk.ConfusionMatrix(gold, test)
print(cm)

           |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

# decision tree

In [25]:
import math 
def entropy(labels): 
    freqdist = nltk.FreqDist(labels) 
    probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)] 
    return -sum([p * math.log(p,2) for p in probs]) 

print(entropy(['male', 'male', 'male', 'male']))

-0.0


In [26]:
print(entropy(['male', 'female', 'male', 'male']))

0.8112781244591328


In [27]:
print(entropy(['female', 'male', 'female', 'male']))

1.0
