In [1]:
import nltk
from nltk import word_tokenize

from nltk.corpus import stopwords
from nltk.corpus import movie_reviews

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import precision, f_measure, recall

import collections

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
print("POSITIVE:")
print(movie_reviews.sents(categories='pos'))
print("NEGATIVE:")
print(movie_reviews.sents(categories='neg'))

POSITIVE:
[['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(', 'casper', ')', 'or', 'the', 'arthouse', 'crowd', '(', 'ghost', 'world', ')', ',', 'but', 'there', "'", 's', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '.'], ['for', 'starters', ',', 'it', 'was', 'created', 'by', 'alan', 'moore', '(', 'and', 'eddie', 'campbell', ')', ',', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', "'", '80s', 'with', 'a', '12', '-', 'part', 'series', 'called', 'the', 'watchmen', '.'], ...]
NEGATIVE:
[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]


In [4]:
print(movie_reviews.fileids())

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt', 'neg/cv010_29063.txt', 'neg/cv011_13044.txt', 'neg/cv012_29411.txt', 'neg/cv013_10494.txt', 'neg/cv014_15600.txt', 'neg/cv015_29356.txt', 'neg/cv016_4348.txt', 'neg/cv017_23487.txt', 'neg/cv018_21672.txt', 'neg/cv019_16117.txt', 'neg/cv020_9234.txt', 'neg/cv021_17313.txt', 'neg/cv022_14227.txt', 'neg/cv023_13847.txt', 'neg/cv024_7033.txt', 'neg/cv025_29825.txt', 'neg/cv026_29229.txt', 'neg/cv027_26270.txt', 'neg/cv028_26964.txt', 'neg/cv029_19943.txt', 'neg/cv030_22893.txt', 'neg/cv031_19540.txt', 'neg/cv032_23718.txt', 'neg/cv033_25680.txt', 'neg/cv034_29446.txt', 'neg/cv035_3343.txt', 'neg/cv036_18385.txt', 'neg/cv037_19798.txt', 'neg/cv038_9781.txt', 'neg/cv039_5963.txt', 'neg/cv040_8829.txt', 'neg/cv041_22364.txt', 'neg/cv042_11927.txt', 'neg/cv043_16808.t

In [5]:
positive_raw = movie_reviews.raw("pos/cv262_12649.txt")
print("EXAMPLE POSITIVE")
print(positive_raw)

EXAMPLE POSITIVE
on seeing the outrageous previews for bulworth one wonders what plot could possibly allow beatty get away with making those statements ( in case you missed it , warren beatty plays a politician on the campaign trail . 
he says to a black congregation " . . . if you can't cut down on malt liquor and chicken wings and get behind someone other than a running back who stabs his wife , you're never gonna get rid of me . " ) 
well , there is such a plot , and it works very well in this comedy . 
beatty plays jay billington bulworth , a long-time democratic senator from california . 
the movie opens on a painfully repetitious montage of bulworth's latest commercials condemning affirmative action . 
the montage is ironically intercut with pictures of martin luther king and bulworth in his youth working with jack kennedy . 
when we finally lay eyes on bulworth he is morosely weeping in front of his tv , having gone without food or sleep for days . 
his campaign is entering the 

In [6]:
neg_raw = movie_reviews.raw("neg/cv151_17231.txt")
print("EXAMPLE NEGATIVE")
print(neg_raw)

EXAMPLE NEGATIVE
in " twilight , " a ex-alcoholic , ex-cop , ex-husband , ex-private-eye , harry ross ( paul newman ) , works for a pair of aging hollywood actors , catherine ( susan sarandon ) and jack ames ( gene hackman ) . 
jack is being blackmailed , and he asks harry to deliver the payoff . 
instead of the blackmailers , harry finds a dying ex-cop ( m . 
emmet walsh ) . 
as more bodies begin to pile up , harry realizes that he will have to solve the disappearance of catherine's first husband twenty years earlier to find out who's willing to kill to keep that secret buried . 
newman . sarandon . 
hackman . 
with an a-list cast of oscar laureates like that , " twilight " would seem very promising . 
however , the script is tired and predictable . 
it would serve well as a tv-movie-of-the-week , possibly with some 1970's detective hero reprising his role . 
the appeal of the project for its stars and its director , robert benton , is nostalgia . 
 " twilight " wants very much to be 

In [7]:
'''Feature Extraction'''
#Bag of Words feature extraction
def word_feats(words):
    return dict([(word, True) for word in words])

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

stopwords = nltk.corpus.stopwords.words('english') #delete stopwords from data
 
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids if f not in stopwords]
print(len(negfeats))
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids if f not in stopwords]
#print(posfeats)

1000


In [8]:
'''Make training and testing sets'''

negcutoff = int(len(negfeats)*3/4)
print(negcutoff)
poscutoff = int(len(posfeats)*3/4)
 
trainfeats = negfeats[0:negcutoff] + posfeats[0:poscutoff]
#print(trainfeats)
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

750
train on 1500 instances, test on 500 instances


In [9]:
'''Naive Bayes Classifier'''

classifier = NaiveBayesClassifier.train(trainfeats)
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


In [10]:
'''Precision and Recall'''
#Precision measures the exactness of a classifier. A higher precision means less false positives.
#Recall measures the completeness, or sensitivity, of a classifier. Higher recall means less false negatives.
#F-measure is a weighted harmonic mean of precision and recall

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))


pos precision: 0.651595744680851
pos recall: 0.98
pos F-measure: 0.7827476038338657
neg precision: 0.9596774193548387
neg recall: 0.476
neg F-measure: 0.6363636363636364


In [17]:
'''Using our classifier'''

sent1 = "I have never seen such an amazing film since I saw The Shawshank Redemption" #pos
sent2 = "History will in fact judge 'The Two Towers' as the greatest of the three Rings" #pos
sent3 = "it was an okay film" #neutral
sent4 = "It was an OK film" #neutral
sent5 = "This movie is beyond description" #neg
sent6 = "'Monster A Go Go' is gloriously, mind shatteringly awful." #neg
sent7 = "Dracula 3000 is the epitome of painfully cheesy cinema." #neg

def classify_sent(sentence):
    tokens = nltk.word_tokenize(sentence)
    print("TOKENS: "+str(tokens))
    words = [w for w in tokens if w.lower() not in stopwords]
    print("REMOVE STOPWORDS: "+str(words))
    print(classifier.classify(word_feats(words)))

#classify_sent(sent1)
#classify_sent(sent2)
#classify_sent(sent3)
#classify_sent(sent4)
#classify_sent(sent5)
#classify_sent(sent6)
classify_sent(sent7)

TOKENS: ['Dracula', '3000', 'is', 'the', 'epitome', 'of', 'painfully', 'cheesy', 'cinema', '.']
REMOVE STOPWORDS: ['Dracula', '3000', 'epitome', 'painfully', 'cheesy', 'cinema', '.']
neg
