# Project Overview

For this project we'll use the Cornell University Movie Review polarity dataset v2.0 obtained from http://www.cs.cornell.edu/people/pabo/movie-review-data/


We will try to predict whether a review is considered negative or positive

# Importing Basic Libraries

In [1]:
# These are the libraries I typically use in my analysis so I find it easier to import them all at once
# If I need more libraries I will import them as needed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline

# Importing the Dataset

In [2]:
# Our dataset is moviereviews.tsv, where "tsv" stands for "tab-separated values"
# Hence in order to import the file correctly we need to add delimiter = "\t"
# We will name the dataframe "movies"

movies =  pd.read_csv('moviereviews.tsv', delimiter = '\t')

In [3]:
movies.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
# There are 2000 movie reviews in our dataset

movies.shape

(2000, 2)

In [5]:
# We do not have any missing values in the label column
# We do however have 35 missing values in the review column

movies.isnull().sum()

label      0
review    35
dtype: int64

In [6]:
# Drop every row that contains a missing value and rebind "movies" to the cleaned frame

movies = movies.dropna()

In [7]:
# Just confirming that we no longer have any null values

movies.isnull().sum()

label     0
review    0
dtype: int64

In [8]:
# As an extra check, look for reviews that are not null but consist only of whitespace

# .str.isspace() is True only for strings that are non-empty and made up entirely of whitespace
# Working on the 'review' column by name avoids depending on the number or order of columns
# (null reviews were already dropped in the earlier dropna step, so this is a plain boolean mask)
whitespace_only = movies['review'].str.isspace()

# Collect the index labels of the whitespace-only reviews as a regular Python list
blanks = movies.index[whitespace_only].tolist()

In [9]:
# Here are all the indices where the review is just empty space

blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [10]:
# Remove the whitespace-only reviews: blanks holds their index labels, so drop those rows

movies = movies.drop(index=blanks)

In [11]:
# Looks like we have a perfectly symmetrical breakdown of positive and negative reviews after the cleaning

movies['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

# Creating Our Classification Model Part One

In [12]:
# Features (X) are the review texts; the target (y) is the label column

X, y = movies['review'], movies['label']

In [13]:
# Splitting the data between a training and test set
# test_size=0.2 holds out 20% of the rows for testing
# random_state=42 fixes the shuffle seed so the same split is produced on every run

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Let's build a pipeline to vectorize the data

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# First model: a Multinomial Naive Bayes classifier trained on TF-IDF features

from sklearn.naive_bayes import MultinomialNB

text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),   # vectorize the raw review text
    ('clf', MultinomialNB()),       # classify the vectorized reviews
])

In [16]:
# Fitting the data to the NB Classifier

text_clf_nb.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [17]:
# Form a prediction set

y_pred = text_clf_nb.predict(X_test)

In [18]:
# Report the confusion matrix

from sklearn import metrics
print(metrics.confusion_matrix(y_test,y_pred))

[[174  14]
 [ 65 135]]


In [19]:
# Print a classification report

print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         neg       0.73      0.93      0.81       188
         pos       0.91      0.68      0.77       200

    accuracy                           0.80       388
   macro avg       0.82      0.80      0.79       388
weighted avg       0.82      0.80      0.79       388



In [20]:
# Print the overall accuracy

print(metrics.accuracy_score(y_test,y_pred))

0.7963917525773195


In [21]:
# A hand-written review that we will use to spot-check what each model predicts
# Adjacent string literals inside the parentheses are joined into one single-line string

myreview = (
    "A movie I really wanted to love was terrible. "
    "I'm sure the producers had the best intentions, but the execution was lacking."
)

In [22]:
# Here we will run our model on a new review
# Be sure to put "myreview" inside square brackets
# The review is considered negative by our model
print(text_clf_nb.predict([myreview])) 

['neg']


# Creating Our Classification Model Part Two

In [23]:
# Second model: a linear Support Vector Machine classifier trained on TF-IDF features

from sklearn.svm import LinearSVC

text_clf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),   # vectorize the raw review text
    ('clf', LinearSVC()),           # classify the vectorized reviews
])

In [24]:
# Fitting the data to the SVC Classifier

text_clf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [25]:
# Form a prediction set

y_pred = text_clf_lsvc.predict(X_test)

In [26]:
# Report the confusion matrix

from sklearn import metrics
print(metrics.confusion_matrix(y_test,y_pred))

[[162  26]
 [ 26 174]]


In [27]:
# Print a classification report

print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         neg       0.86      0.86      0.86       188
         pos       0.87      0.87      0.87       200

    accuracy                           0.87       388
   macro avg       0.87      0.87      0.87       388
weighted avg       0.87      0.87      0.87       388



In [28]:
# Print the overall accuracy

print(metrics.accuracy_score(y_test,y_pred))

0.865979381443299


In [29]:
# Here we will run our model on a new review
# Be sure to put "myreview" inside square brackets
# The review is considered negative by our model

print(text_clf_lsvc.predict([myreview]))

['neg']


# Adding Stopwords to CountVectorizer

By default, CountVectorizer and TfidfVectorizer do not filter stopwords. However, they offer some optional settings, including passing in your own stopword list.

The CountVectorizer class accepts the following arguments:

CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)

TfidfVectorizer supports the same arguments and more. Under stop_words we have the following options:

stop_words : string {'english'}, list, or None (default)

That is, we can run TfidfVectorizer(stop_words='english') to accept scikit-learn's built-in list,
or TfidfVectorizer(stop_words=['a', 'and', 'the']) to filter these three words. In practice we would assign our list to a variable and pass that in instead.

Scikit-learn's built-in list contains 318 stopwords:

from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)
['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 
'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']

In [30]:
# Our own, much shorter stopword list: just 60 very common words
# (no line-continuation backslashes are needed inside the square brackets)

stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can',
    'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his',
    'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or',
    'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this',
    'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you',
]

In [31]:
# Same Linear SVC pipeline, except the vectorizer now filters out our custom stopwords

text_clf_lsvc2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),   # vectorize, skipping the stopwords
    ('clf', LinearSVC()),                               # classify the vectorized reviews
])

# Train on the same training split used for the earlier models
text_clf_lsvc2.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['a', 'about', 'an', 'and', 'are',
                                             'as', 'at', 'be', 'been', 'but',
                                             'by', 'can', 'even', 'ever', 'for',
                                             'from', 'get', 'had', 'has',
                                             'have', 'he', 'her', 'hers', 'his',
                                             'how', 'i', 'if', 'in', 'into',
                                             'is', ...])),
                ('clf', LinearSVC())])

In [32]:
y_pred = text_clf_lsvc2.predict(X_test)

print(metrics.confusion_matrix(y_test,y_pred))

[[160  28]
 [ 33 167]]


In [33]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         neg       0.83      0.85      0.84       188
         pos       0.86      0.83      0.85       200

    accuracy                           0.84       388
   macro avg       0.84      0.84      0.84       388
weighted avg       0.84      0.84      0.84       388



In [34]:
# Accuracy dropped slightly: about 0.866 without stopword filtering vs about 0.843 with it.
# Keep in mind that 2000 movie reviews is a relatively small dataset.
# The usual practical gain from stripping stopwords is improved processing speed.
# Depending on the size of the corpus, it might save hours.

print(metrics.accuracy_score(y_test,y_pred))

0.8427835051546392


In [35]:
# Here we will run our model on a new review
# Be sure to put "myreview" inside square brackets so it is passed as a list containing one document
# The review is considered negative by our model
# So all three models we created predicted that our review was negative.
# That agreement is reassuring, but one hand-written example says little about overall accuracy

print(text_clf_lsvc2.predict([myreview]))

['neg']
