In [1]:
import nltk 

In [2]:
from nltk.corpus import sentence_polarity
import random

In [3]:
documents = [(sent, cat) for cat in sentence_polarity.categories() for sent in sentence_polarity.sents(categories=cat)]

random.shuffle(documents)

In [4]:
all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

21401


In [6]:
word_items = all_words.most_common(1500)
word_features = [word for (word,count) in word_items]

In [9]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_()'.format(word)] = (word in document_words)
    return features

In [10]:
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

In [11]:
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.504

In [12]:
from nltk.collocations import * 
bigram_measures = nltk.collocations.BigramAssocMeasures()

print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['will', 'probably', 'stay', 'in', 'the', 'shadow', 'of', 'its', 'two', 'older', ',', 'more', 'accessible', 'qatsi', 'siblings', '.', 'brainy', ',', 'artistic', 'and', 'muted', ',', 'almost', 'to', 'the', 'point', 'of', 'suffocation', '.', 'simplistic', 'fluff-ball', 'of', 'whimsy', '.', "isn't", 'as', 'sharp', 'as', 'the', 'original', '.', '.', '.', 'despite', 'some', 'visual', 'virtues', ',', "'blade", "ii'"]


In [13]:
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])

[("''independent", "film''"), ("'60s-homage", 'pokepie'), ("'[the", 'cockettes]'), ("'ace", "ventura'"), ("'alternate", "reality'"), ("'aunque", 'recurre'), ("'black", "culture'"), ("'blue", "crush'"), ("'chan", "moment'"), ("'chick", "flicks'"), ("'date", "movie'"), ("'ethnic", 'cleansing'), ("'face", "value'"), ("'fully", "experienced'"), ("'jason", "x'"), ("'juvenile", "delinquent'"), ("'laugh", "therapy'"), ("'masterpiece", "theatre'"), ("'nicholas", "nickleby'"), ("'old", "neighborhood'"), ("'opening", "up'"), ("'rare", "birds'"), ("'sacre", 'bleu'), ("'science", "fiction'"), ("'shindler's", "list'"), ("'snow", "dogs'"), ("'some", "body'"), ("'special", "effects'"), ("'terrible", "filmmaking'"), ("'time", "waster'"), ("'true", "story'"), ("'unfaithful'", 'cheats'), ("'very", "sneaky'"), ("'we're", '-doing-it-for'), ("'who's", "who'"), ('-after', 'spangle'), ('-as-it-', 'thinks-it-is'), ('-as-nasty', '-as-it-'), ('-doing-it-for', "-the-cash'"), ('10-course', 'banquet'), ('10-year',

In [15]:
sent = ["Arthur", "carefully", "rode", "the", "brown", "horse", "around", "the", "castle"]
sentbigrams = list(nltk.bigrams(sent))
print(sentbigrams)

[('Arthur', 'carefully'), ('carefully', 'rode'), ('rode', 'the'), ('the', 'brown'), ('brown', 'horse'), ('horse', 'around'), ('around', 'the'), ('the', 'castle')]


In [16]:
bigram = ('brown', 'horse')
print(bigram in sentbigrams)
print('B_{}_{}'.format(bigram[0], bigram[1]))

True
B_brown_horse


In [18]:
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_words)
    return features

bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d,c) in documents]

In [20]:
print(len(bigram_featuresets[0][0].keys()))
print(bigram_featuresets[0][0])

2000
{'V_.': True, 'V_the': True, 'V_,': True, 'V_a': False, 'V_and': False, 'V_of': True, 'V_to': False, 'V_is': False, 'V_in': True, 'V_that': False, 'V_it': False, 'V_as': False, 'V_but': False, 'V_with': False, 'V_film': False, 'V_this': False, 'V_for': False, 'V_its': True, 'V_an': False, 'V_movie': False, "V_it's": False, 'V_be': False, 'V_on': False, 'V_you': False, 'V_not': False, 'V_by': False, 'V_about': False, 'V_more': True, 'V_one': False, 'V_like': False, 'V_has': False, 'V_are': False, 'V_at': False, 'V_from': False, 'V_than': False, 'V_"': False, 'V_all': False, 'V_--': False, 'V_his': False, 'V_have': False, 'V_so': False, 'V_if': False, 'V_or': False, 'V_story': False, 'V_i': False, 'V_too': False, 'V_just': False, 'V_who': False, 'V_into': False, 'V_what': False, 'V_most': False, 'V_out': False, 'V_no': False, 'V_much': False, 'V_even': False, 'V_good': False, 'V_up': False, 'V_will': True, 'V_comedy': False, 'V_time': False, 'V_can': False, 'V_some': False, 'V_chara

In [21]:
train_set, test_set = bigram_featuresets[1000:],bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.72

In [22]:
print(sent)
print(nltk.pos_tag(sent))

['Arthur', 'carefully', 'rode', 'the', 'brown', 'horse', 'around', 'the', 'castle']
[('Arthur', 'NNP'), ('carefully', 'RB'), ('rode', 'VBD'), ('the', 'DT'), ('brown', 'JJ'), ('horse', 'NN'), ('around', 'IN'), ('the', 'DT'), ('castle', 'NN')]


In [23]:
def POS_features(document, word_features): 
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features: 
        features['V_{}'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [24]:
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
print(len(POS_featuresets[0][0].keys()))

1504


In [26]:
print(documents[0])
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(['will', 'probably', 'stay', 'in', 'the', 'shadow', 'of', 'its', 'two', 'older', ',', 'more', 'accessible', 'qatsi', 'siblings', '.'], 'neg')
num nouns 2
num verbs 1
num adjectives 3
num adverbs 2


In [27]:
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.72

In [28]:
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size: {}'.format(subset_size))
    accuracy_list = []
    for i in range(num_folds): 
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)]+featuresets[((i+1)*subset_size):]
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print(i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    print('mean accuracy', sum(accuracy_list)/num_folds)

In [41]:
num_folds = 5
cross_validation_accuracy(num_folds, POS_featuresets)

Each fold size: 2132
0 0.7274859287054409
1 0.7265478424015009
2 0.7593808630393997
3 0.7420262664165104
4 0.7406191369606003
mean accuracy 0.7392120075046904


In [30]:
goldlist = []
predictedlist = []
for (features, label) in test_set: 
    goldlist.append(label)
    predictedlist.append(classifier.classify(features))

In [31]:
print(goldlist[:30])
print(predictedlist[:30])

['neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos']
['pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg']


In [32]:
cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

    |   p   n |
    |   o   e |
    |   s   g |
----+---------+
pos |<356>148 |
neg | 132<364>|
----+---------+
(row = reference; col = test)



In [34]:
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      p      n |
    |      o      e |
    |      s      g |
----+---------------+
pos | <35.6%> 14.8% |
neg |  13.2% <36.4%>|
----+---------------+
(row = reference; col = test)



In [36]:
def eval_measures(gold, predicted):
    labels = list(set(gold))
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels: 
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab: TP += 1
            if val == lab and predicted[i] != lab: FN += 1
            if val != lab and predicted[i] == lab: FP += 1
            if val != lab and predicted[i] != lab: TN += 1
        recall = TP/(TP + FP)
        precision = TP/(TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append(2*(recall*precision)/(recall+precision))
    print('\tPrecision\tRecall\t\tF1')
    for i, lab in enumerate(labels):
        print(lab, '\t' "{:10.3f}".format(precision_list[i]), "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [37]:
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
neg 	     0.734      0.711      0.722
pos 	     0.706      0.730      0.718
