In [1]:
import random

import nltk
from nltk.corpus import movie_reviews

In [2]:
# form a document with all reviews and their categories(class labels)
documents = [(list(movie_reviews.words(file_id)), category)
             for category in movie_reviews.categories()
             for file_id in movie_reviews.fileids(category)]
random.shuffle(documents)

In [3]:
# find the frequency of all words in complete documents and take top 2000 words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [4]:
# extract feature vector for a document.(Feature for each word is indication whether the document contains that word)
def document_features(document):
    document_word = set(document)  # set of words in a document for faster computation
    features = {}  # size = (2000,)
    for word in word_features:
        features['contains({})'.format(word)] = word in document_word  # check if each word in word_features is present in a document
    return features

In [5]:
# Train Naive Bayes classifier
feature_sets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = feature_sets[100:], feature_sets[:100]

In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Classifier accuracy: {}\n".format(nltk.classify.accuracy(classifier, test_set)))

Classifier accuracy: 0.79



In [7]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.6 : 1.0
         contains(mulan) = True              pos : neg    =      9.1 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
         contains(damon) = True              pos : neg    =      8.0 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.6 : 1.0
