In [11]:
#!/usr/bin/env python3
import nltk
from nltk.corpus import movie_reviews
import sys

documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]

import random
random.shuffle(documents)

In [2]:
print(len(documents), 'documents')

2000 documents


In [3]:

# count all words, regardless of case (by mapping everythign to lower case)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Get the 2000 most frequent words, and make this a "potential features" list
word_features = all_words.most_common(2000)

print('top words:', word_features[:20])

top words: [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595), (')', 11781), ('(', 11664), ('as', 11378), ('with', 10792), ('for', 9961)]


In [4]:
# Feature extractor - include only words that are popular
# this method takes a list of words,
#   i.e. document looks like ['the', 'big', 'fish']

def document_features(document):
    # make this list unique; 
    document_words = set(document)
    features = {}
    for word in word_features:
        # We only apply this for words contained in our list of potential features
        features['contains({})'.format(word[0])] = (word[0] in document_words)

    return features

In [6]:
# test it out!
print(movie_reviews.words('pos/cv957_8737.txt'))
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

['capsule', ':', 'the', 'best', 'place', 'to', 'start', ...]
{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)'

In [None]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [13]:
classifier = nltk.NaiveBayesClassifier.train(featuresets)

print(classifier.show_most_informative_features(10))

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.5 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
        contains(seagal) = True              neg : pos    =      8.2 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.0 : 1.0
         contains(damon) = True              pos : neg    =      6.3 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.5 : 1.0
          contains(lame) = True              neg : pos    =      5.5 : 1.0
         contains(awful) = True              neg : pos    =      5.3 : 1.0
        contains(poorly) = True              neg : pos    =      5.1 : 1.0
None


In [14]:
train_set, test_set = featuresets[200:], featuresets[:200]

# evaluate against the test set
classifier = nltk.NaiveBayesClassifier.train(train_set)

print('accuracy:', nltk.classify.accuracy(classifier, test_set))

accuracy: 0.78


In [19]:
# Complete this function, to return all 1...n-grams in a document

# this method takes a list of words, i.e. document looks like ['the', 'big', 'fish']
# it also takes a number k, which is the maximum n in n-gram
def document_all_ngram_features(document, k):
    features = {}
    for n in range(1, k+1):
        for ngram in nltk.ngrams(document, n):
            features[ngram] = True
    return features

print(document_all_ngram_features(movie_reviews.words('pos/cv957_8737.txt'), 3)) 

{('capsule',): True, (':',): True, ('the',): True, ('best',): True, ('place',): True, ('to',): True, ('start',): True, ('if',): True, ('you',): True, ("'",): True, ('re',): True, ('a',): True, ('jackie',): True, ('chan',): True, ('newcomer',): True, ('.',): True, ('roars',): True, ('along',): True, (',',): True, ('never',): True, ('stops',): True, ('for',): True, ('breath',): True, ('and',): True, ('frequently',): True, ('hilarious',): True, ('talk',): True, ('about',): True, ('as',): True, ('"',): True, ('stuntman',): True, ('is',): True, ('miss',): True, ('million',): True, ('things',): True, ('not',): True, ('only',): True, ('one',): True, ('of',): True, ('most',): True, ('accomplished',): True, ('fearsomely',): True, ('unafraid',): True, ('physical',): True, ('adepts',): True, ('on',): True, ('screen',): True, ('today',): True, ('but',): True, ('also',): True, ('very',): True, ('funny',): True, ('guy',): True, ('he',): True, ('reminds',): True, ('me',): True, ('way',): True, ('arno

In [20]:
# use this feature extractor on whole dataset
featuresets = [(document_all_ngram_features(d, 3), c) for (d,c) in documents]
train_set, test_set = featuresets[200:], featuresets[:200]

# train a classifier using the results
classifier = nltk.NaiveBayesClassifier.train(train_set)
print('accuracy:', nltk.classify.accuracy(classifier, test_set))

# what are the top positive and negative n-grams indicating a classification?
print(classifier.show_most_informative_features(10))


accuracy: 0.785
Most Informative Features
  ('to', 'work', 'with') = True              neg : pos    =     23.8 : 1.0
              ('sucks',) = True              neg : pos    =     17.1 : 1.0
   ('to', 'be', 'funny') = True              neg : pos    =     14.4 : 1.0
      ('is', 'terrific') = True              pos : neg    =     14.3 : 1.0
       ('and', 'boring') = True              neg : pos    =     13.7 : 1.0
        ('insult', 'to') = True              neg : pos    =     13.1 : 1.0
  ('everything', 'from') = True              pos : neg    =     12.9 : 1.0
('perhaps', 'the', 'most') = True              pos : neg    =     12.9 : 1.0
          ('our', 'own') = True              pos : neg    =     12.9 : 1.0
         ('schumacher',) = True              neg : pos    =     12.4 : 1.0
None
