In [1]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.metrics import confusion_matrix
import pandas as pd

# Sentiment Analysis on Movie Reviews

We will be working with sentiment analysis on movie reviews. This dataset is originally taken from http://www.cs.cornell.edu/people/pabo/movie-review-data/ but NLTK has loaded into the list of corpus (under `nltk.corpus.movie_reviews`). We will use the one from NLTK for ease of use.

In [2]:
#the original dataset has 2000 reviews (1000 negative reviews & 1000 positive reviews)
#each in a separate file with different filename
print(len(movie_reviews.fileids()))
print(movie_reviews.fileids()[:10])

2000
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [3]:
#this will get all the fileids for the 1000 negative reviews
print(len(movie_reviews.fileids("neg")))
print(movie_reviews.fileids("neg")[:10])

1000
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [4]:
#this will get all the fileids for the 1000 positive reviews
print(len(movie_reviews.fileids("pos")))
print(movie_reviews.fileids("pos")[:10])

1000
['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt', 'pos/cv003_11664.txt', 'pos/cv004_11636.txt', 'pos/cv005_29443.txt', 'pos/cv006_15448.txt', 'pos/cv007_4968.txt', 'pos/cv008_29435.txt', 'pos/cv009_29592.txt']


In [5]:
#get one review (raw text)
pos_review1 = movie_reviews.raw("pos/cv000_29590.txt")
pos_review1

'films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this seem

In [6]:
#NLTK has tokenize the reviews before hand. So, we will use the list of words instead
pos_review_words1 = movie_reviews.words("pos/cv000_29590.txt")
pos_review_words1[:20]

['films',
 'adapted',
 'from',
 'comic',
 'books',
 'have',
 'had',
 'plenty',
 'of',
 'success',
 ',',
 'whether',
 'they',
 "'",
 're',
 'about',
 'superheroes',
 '(',
 'batman',
 ',']

## Split the data into Training and Testing set

In [7]:
#we will do 90%/10% training/test split
training_set_size = 900

#### ❓Exercise 1. Create the training data by combining the first 900 rows of the positive reviews and the first 900 rows of negative reviews

In [8]:
training_pos_fields = movie_reviews.fileids("pos")[:training_set_size]
training_neg_fields = movie_reviews.fileids("neg")[:training_set_size]

training_data = [(movie_reviews.words(fileid), "pos") for fileid in training_pos_fields] + [(movie_reviews.words(fileid), "neg") for fileid in training_neg_fields]

===

In [9]:
#there should be 900 + 900 = 1800 rows in the training set
print(len(training_data))
training_data

1800


[(['films', 'adapted', 'from', 'comic', 'books', 'have', ...], 'pos'),
 (['every', 'now', 'and', 'then', 'a', 'movie', 'comes', ...], 'pos'),
 (['you', "'", 've', 'got', 'mail', 'works', 'alot', ...], 'pos'),
 (['"', 'jaws', '"', 'is', 'a', 'rare', 'film', 'that', ...], 'pos'),
 (['moviemaking', 'is', 'a', 'lot', 'like', 'being', ...], 'pos'),
 (['on', 'june', '30', ',', '1960', ',', 'a', 'self', ...], 'pos'),
 (['apparently', ',', 'director', 'tony', 'kaye', 'had', ...], 'pos'),
 (['one', 'of', 'my', 'colleagues', 'was', 'surprised', ...], 'pos'),
 (['after', 'bloody', 'clashes', 'and', 'independence', ...], 'pos'),
 (['the', 'american', 'action', 'film', 'has', 'been', ...], 'pos'),
 (['after', 'watching', '"', 'rat', 'race', '"', 'last', ...], 'pos'),
 (['i', "'", 've', 'noticed', 'something', 'lately', ...], 'pos'),
 (['synopsis', ':', 'bobby', 'garfield', '(', 'yelchin', ...], 'pos'),
 (['synopsis', ':', 'in', 'this', 'movie', ',', 'steven', ...], 'pos'),
 (['the', 'police', 'nego

#### ❓Exercise 2. Create the testing data by combining the remaining 100 rows of the positive reviews and the remaining 100 rows of negative reviews

In [10]:
testing_pos_fields = movie_reviews.fileids("pos")[training_set_size:]
testing_neg_fields = movie_reviews.fileids("neg")[training_set_size:]

testing_data = [(movie_reviews.words(fileid), "pos") for fileid in testing_pos_fields] + [(movie_reviews.words(fileid),"neg") for fileid in testing_neg_fields]

===

In [11]:
#there should be 100 + 100 = 200 rows in the testing set
print(len(testing_data))
testing_data

200


[(['in', '1912', ',', 'a', 'ship', 'set', 'sail', 'on', ...], 'pos'),
 (['the', 'start', 'of', 'this', 'movie', 'reminded', ...], 'pos'),
 (['note', ':', 'some', 'may', 'consider', 'portions', ...], 'pos'),
 (['robert', 'altman', "'", 's', 'cookie', "'", 's', ...], 'pos'),
 (['well', 'i', "'", 'll', 'be', 'damned', '.', '.', '.', ...], 'pos'),
 (['hedwig', '(', 'john', 'cameron', 'mitchell', ')', ...], 'pos'),
 (['niagara', 'niagara', '(', 'r', ')', 'bob', 'gosse', ...], 'pos'),
 (['notice', ':', 'this', 'is', 'a', 'review', 'and', ...], 'pos'),
 (['i', 'actually', 'am', 'a', 'fan', 'of', 'the', ...], 'pos'),
 (['gere', ',', 'willis', ',', 'poitier', 'chase', ...], 'pos'),
 (['it', "'", 's', 'terribly', 'unfortunate', 'that', ...], 'pos'),
 (['usually', 'when', 'a', 'blockbuster', 'comes', 'out', ...], 'pos'),
 (['review', '-', 'peter', 'jackson', "'", 's', 'the', ...], 'pos'),
 (['before', 'you', 'read', 'my', 'review', ',', 'you', ...], 'pos'),
 (['susan', 'granger', "'", 's', 'revie

## Generate the Feature Vector  

We will just create Bag of Word (BoW) model. I.e. using the words as the features. We will adopt the **term presence** approach (i.e. capture whether a term/word exist).

In [12]:
def extract_unigram_feature(words):
    #each word is a feature
    #there are many entries so need to use dict() instead of {} literal
    return dict((word, True) for word in words)

In [13]:
extract_unigram_feature(training_data[0][0])

{'films': True,
 'adapted': True,
 'from': True,
 'comic': True,
 'books': True,
 'have': True,
 'had': True,
 'plenty': True,
 'of': True,
 'success': True,
 ',': True,
 'whether': True,
 'they': True,
 "'": True,
 're': True,
 'about': True,
 'superheroes': True,
 '(': True,
 'batman': True,
 'superman': True,
 'spawn': True,
 ')': True,
 'or': True,
 'geared': True,
 'toward': True,
 'kids': True,
 'casper': True,
 'the': True,
 'arthouse': True,
 'crowd': True,
 'ghost': True,
 'world': True,
 'but': True,
 'there': True,
 's': True,
 'never': True,
 'really': True,
 'been': True,
 'a': True,
 'book': True,
 'like': True,
 'hell': True,
 'before': True,
 '.': True,
 'for': True,
 'starters': True,
 'it': True,
 'was': True,
 'created': True,
 'by': True,
 'alan': True,
 'moore': True,
 'and': True,
 'eddie': True,
 'campbell': True,
 'who': True,
 'brought': True,
 'medium': True,
 'to': True,
 'whole': True,
 'new': True,
 'level': True,
 'in': True,
 'mid': True,
 '80s': True,
 '

In [14]:
training_feature_vector = [(extract_unigram_feature(words), label) for (words, label) in training_data]
print(training_feature_vector[:1])

[({'films': True, 'adapted': True, 'from': True, 'comic': True, 'books': True, 'have': True, 'had': True, 'plenty': True, 'of': True, 'success': True, ',': True, 'whether': True, 'they': True, "'": True, 're': True, 'about': True, 'superheroes': True, '(': True, 'batman': True, 'superman': True, 'spawn': True, ')': True, 'or': True, 'geared': True, 'toward': True, 'kids': True, 'casper': True, 'the': True, 'arthouse': True, 'crowd': True, 'ghost': True, 'world': True, 'but': True, 'there': True, 's': True, 'never': True, 'really': True, 'been': True, 'a': True, 'book': True, 'like': True, 'hell': True, 'before': True, '.': True, 'for': True, 'starters': True, 'it': True, 'was': True, 'created': True, 'by': True, 'alan': True, 'moore': True, 'and': True, 'eddie': True, 'campbell': True, 'who': True, 'brought': True, 'medium': True, 'to': True, 'whole': True, 'new': True, 'level': True, 'in': True, 'mid': True, '80s': True, 'with': True, '12': True, '-': True, 'part': True, 'series': Tru

In [15]:
testing_feature_vector = [(extract_unigram_feature(words), label) for (words, label) in testing_data]
print(testing_feature_vector[:1])

[({'in': True, '1912': True, ',': True, 'a': True, 'ship': True, 'set': True, 'sail': True, 'on': True, 'her': True, 'maiden': True, 'voyage': True, 'across': True, 'the': True, 'atlantic': True, 'for': True, 'america': True, '.': True, 'this': True, 'was': True, 'built': True, 'to': True, 'be': True, 'largest': True, 'world': True, 'and': True, 'she': True, 'also': True, 'build': True, 'one': True, 'of': True, 'most': True, 'luxurious': True, 'that': True, 'finally': True, 'unsinkable': True, 'unfortunately': True, 'not': True, 'get': True, 'ticket': True, 'you': True, 'either': True, ':': True, 'spent': True, 'life': True, "'": True, 's': True, 'savings': True, 'start': True, 'anew': True, 'were': True, 'part': True, 'upper': True, 'class': True, 'had': True, 'money': True, 'spare': True, 'or': True, 'lucky': True, 'enough': True, 'have': True, 'full': True, 'house': True, 'poker': True, 'match': True, 'by': True, 'docks': True, 'like': True, 'jack': True, 'dawson': True, 'makes': Tr

## Train The Naive Bayes Classifier

In [16]:
training_feature_vector[:1]

[({'films': True,
   'adapted': True,
   'from': True,
   'comic': True,
   'books': True,
   'have': True,
   'had': True,
   'plenty': True,
   'of': True,
   'success': True,
   ',': True,
   'whether': True,
   'they': True,
   "'": True,
   're': True,
   'about': True,
   'superheroes': True,
   '(': True,
   'batman': True,
   'superman': True,
   'spawn': True,
   ')': True,
   'or': True,
   'geared': True,
   'toward': True,
   'kids': True,
   'casper': True,
   'the': True,
   'arthouse': True,
   'crowd': True,
   'ghost': True,
   'world': True,
   'but': True,
   'there': True,
   's': True,
   'never': True,
   'really': True,
   'been': True,
   'a': True,
   'book': True,
   'like': True,
   'hell': True,
   'before': True,
   '.': True,
   'for': True,
   'starters': True,
   'it': True,
   'was': True,
   'created': True,
   'by': True,
   'alan': True,
   'moore': True,
   'and': True,
   'eddie': True,
   'campbell': True,
   'who': True,
   'brought': True,
   'm

#### ❓Exercise 3. Create the Naive Bayes classifier using the training data

Take note that the training process might take a while.

In [17]:
#
nb_classifier = nltk.NaiveBayesClassifier.train(training_feature_vector)


## Evaluation

In [18]:
#predict the testing data
prediction = nb_classifier.classify(testing_feature_vector[0][0])
actual_class = testing_feature_vector[0][1]
print("Prediction:", prediction, " Actual class:", actual_class)

Prediction: pos  Actual class: pos


#### ❓Exercise 4. Determine the accuracy of the `nb_classifier`

In [19]:
# this is the basic train of processes for sentiment analysis
tokens = nltk.word_tokenize("i am not sad")
features = extract_unigram_feature(tokens)
nb_classifier.classify(features)
# notice that this model is not able to detect the negation 'not'


'neg'

In [20]:
nltk.classify.accuracy(nb_classifier, testing_feature_vector)

0.73

#### ❓Exercise 5. Investigate to see whether there is any changes to the accuracy if you have a different train/test split %

===

## Confusion Matrix

To properly evaluate the performance of a classifier, it is a good idea to generate the confusion matrix so that we can better assess how good/bad the classification results are. In this case, we want the classifier to predict `pos` as `pos` and `neg` as `neg`

In [21]:
predictions = [nb_classifier.classify(feature_vect[0]) for feature_vect in testing_feature_vector]
actual = [feature_vect[1] for feature_vect in testing_feature_vector]

In [22]:
labels = ["pos","neg"]

In [23]:
cm = confusion_matrix(actual, predictions, labels=labels)
cm

array([[97,  3],
       [51, 49]], dtype=int64)

In [24]:
#predicted value is the row
#actual value is the column
#the accuracy = (sum of the diagram) / (total number of testing data)
pd.DataFrame(cm,
             index = labels, 
             columns = labels)

Unnamed: 0,pos,neg
pos,97,3
neg,51,49
