In [None]:
import nltk
from nltk import word_tokenize

from nltk.corpus import stopwords
from nltk.corpus import movie_reviews

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import nltk.metrics
import collections

In [None]:
# List the category labels defined for the movie_reviews corpus; as the
# cell's last expression, the result is displayed as the cell output.
movie_reviews.categories()

In [None]:
# Preview the sentence view of each sentiment category, one heading per class.
for heading, category in (("POSITIVE:", 'pos'), ("NEGATIVE:", 'neg')):
    print(heading)
    print(movie_reviews.sents(categories=category))

In [None]:
# Printing every file id floods the cell output, so report how many files
# the corpus has and show a short sample from each end of the list instead.
all_fileids = movie_reviews.fileids()
print(len(all_fileids), "files")
print(all_fileids[:5])
print(all_fileids[-5:])

In [None]:
# Load the raw text of one positive review and show it under a heading.
positive_raw = movie_reviews.raw("pos/cv262_12649.txt")
print("EXAMPLE POSITIVE", positive_raw, sep="\n")

In [None]:
# Load the raw text of one negative review and show it under a heading.
neg_raw = movie_reviews.raw("neg/cv151_17231.txt")
print("EXAMPLE NEGATIVE", neg_raw, sep="\n")

In [None]:
'''Feature Extraction'''
# Bag-of-words features: only the presence of a word matters, not its count.
def word_feats(words):
    """Map every distinct word in ``words`` to ``True``.

    Args:
        words: any iterable of hashable tokens (list, generator, corpus view).

    Returns:
        dict: ``{word: True}`` for each distinct word, in first-seen order.
    """
    return dict.fromkeys(words, True)

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# English stopword list. The name is kept (and kept as a list) because later
# cells test membership against `stopwords`. Note that this rebinding shadows
# the `stopwords` corpus object imported at the top of the notebook, which is
# why the corpus is reached through `nltk.corpus.stopwords` here.
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests while filtering every word of every review.
stopword_set = set(stopwords)


def content_words(fileid):
    """Return a generator over the words of one review, skipping stopwords.

    The previous filter (`if f not in stopwords`) compared the *file id*
    with the stopword list, which is always true, so no stopword was ever
    removed from the training data. The filter is applied to the words here.

    Args:
        fileid: a movie_reviews file id such as the entries of `negids`.
    """
    # Compare case-insensitively, like the stopword check used when
    # classifying new sentences further down the notebook.
    return (
        word
        for word in movie_reviews.words(fileids=[fileid])
        if word.lower() not in stopword_set
    )


# One (bag-of-words features, label) pair per review.
negfeats = [(word_feats(content_words(f)), 'neg') for f in negids]
print(len(negfeats))
posfeats = [(word_feats(content_words(f)), 'pos') for f in posids]

In [None]:
'''Make training and testing sets'''

# The first three quarters of each class train the model; the remaining
# quarter of each class is held out for evaluation.
negcutoff = len(negfeats) * 3 // 4
print(negcutoff)
poscutoff = len(posfeats) * 3 // 4

trainfeats = [*negfeats[:negcutoff], *posfeats[:poscutoff]]
testfeats = [*negfeats[negcutoff:], *posfeats[poscutoff:]]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

In [None]:
'''Naive Bayes Classifier'''

# Fit the model on the training pairs.
classifier = NaiveBayesClassifier.train(trainfeats)

# Report accuracy on the held-out pairs.
print(
    'accuracy:',
    nltk.classify.util.accuracy(classifier, testfeats),
)

# Show the features the model found most discriminative (10 is the default).
classifier.show_most_informative_features(n=10)

In [None]:
'''Precision and Recall'''

# For each label, collect the indices of test items that truly carry it
# (refsets) and the indices the classifier assigned to it (testsets).
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for index, (feats, gold) in enumerate(testfeats):
    refsets[gold].add(index)
    testsets[classifier.classify(feats)].add(index)

# (caption, metric function, label) rows, printed in this order.
report_rows = (
    ('pos precision:', nltk.metrics.precision, 'pos'),
    ('pos recall:', nltk.metrics.recall, 'pos'),
    ('pos F-measure:', nltk.metrics.f_measure, 'pos'),
    ('neg precision:', nltk.metrics.precision, 'neg'),
    ('neg recall:', nltk.metrics.recall, 'neg'),
    ('neg F-measure:', nltk.metrics.f_measure, 'neg'),
)
for caption, metric, sentiment in report_rows:
    print(caption, metric(refsets[sentiment], testsets[sentiment]))


In [None]:
'''Using our classifier'''

# Hand-written example sentences to run through the classifier. The trailing
# comment on each line is the sentiment the author expects for it, not a
# label that the code reads.
sent1 = "I have never seen such an amazing film since I saw The Shawshank Redemption" # expected: pos
sent2 = "History will in fact adjudge 'The Two Towers' as the greatest of the three Rings" # expected: pos
sent3 = "it was an okay film" # expected: neutral
sent4 = "It was an OK film" # expected: neutral
sent5 = "This movie is beyond description" # expected: neg
sent6 = "'Monster A Go Go' is gloriously, mind shatteringly awful." # expected: neg
sent7 = "Dracula 3000 is the epitome of painfully cheesy cinema." # expected: neg

def classify_sent(sentence):
    """Tokenize a sentence, drop stopwords, and print the predicted label.

    Prints three lines: the raw tokens, the lower-cased tokens left after
    stopword removal, and the label returned by the trained `classifier`.

    Args:
        sentence: the text to classify, as a single string.

    Returns:
        None. The result is printed, not returned.
    """
    tokens = nltk.word_tokenize(sentence)
    print("TOKENS: "+str(tokens))
    # Feature names are case-sensitive dict keys, and the training features
    # come from the movie_reviews corpus, whose text is lower-cased
    # (NOTE(review): confirm against the corpus README). Without lower-casing,
    # tokens such as "OK" or "History" never match a trained feature.
    lowered = [token.lower() for token in tokens]
    words = [w for w in lowered if w not in stopwords]
    print("REMOVE STOPWORDS: "+str(words))
    print(classifier.classify(word_feats(words)))

# Run every example sentence through the classifier, in order.
for sentence in (sent1, sent2, sent3, sent4, sent5, sent6, sent7):
    classify_sent(sentence)