# Sentiment Analysis of Movie Reviews

Jonathan Ortiz

#Imports and Data Upload

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
train = 'https://drive.google.com/file/d/1La56QDc3XPPvNJixmSqlxC4NrUMq0e_O/view?usp=sharing'
test = 'https://drive.google.com/file/d/1N62Qwif5BuEsW5iKzlh2JSoar8vSomyf/view?usp=sharing'

In [None]:
id = train.split("/")[-2]
 
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train.csv') 
 
train = pd.read_csv('train.csv')
print(train)

In [None]:
id = test.split("/")[-2]
 
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('test.csv') 
 
test = pd.read_csv('test.csv')
print(test)

In [None]:
test.head()

In [None]:
train.head()

Tokenizing each word vectorizing unigram counts.

#Unigram

In [None]:
#Create vectors using ngram_range(1,1) to specify unigrams

token = RegexpTokenizer(r'[a-sA-Z0-9]+')
cv = CountVectorizer(ngram_range=(1,1), tokenizer = token.tokenize)
train_counts = cv.fit_transform(train['text'])
test_counts = cv.fit_transform(test['text'])

#[:5000]

In [None]:
train_counts.shape

In [None]:
test_counts.shape

In [None]:
train.label.shape

In [None]:
#Switching the frequency unigram counts in the vectors to Tf-idf values

tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.fit_transform(test_counts)

In [None]:
train_tfidf.shape

In [None]:
test_tfidf.shape

In [None]:
#Splitting the data into parameters to be passed into the prediction model later

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    train_tfidf, train.label, test_size = 0.20, random_state = 2)

Compile model, import SKLearn's Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
#Fitting the model

MNB = MultinomialNB()
MNB.fit(x_train, y_train)

In [None]:
#Evaluate the model by quantifying its quality.

from sklearn import metrics
predicted = MNB.predict(x_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

In [None]:
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

Utilizing MultinomialNB Classifier with the TF-IDF values, the accuracy score for the trained model was 84.99%. 

Will now go back to vectors of frequency counts to see the accuracy differency frrom the TF-IDF values.

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    train_counts, train.label, test_size = 0.20, random_state = 2)

In [None]:
MNB = MultinomialNB()
MNB.fit(x_train, y_train)

In [None]:
predicted = MNB.predict(x_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

The accuracy slightly decreased to 83.21% when using unigram frequency rather then TF-IDF values.

Using a different Naive Bayes Classifier to compare accuracy on the unigram TF-IDF values.

Refill parameters to have TF-IDF values:

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    train_tfidf, train.label, test_size = 0.20, random_state = 2)



Using Complement Naive Bayes.

Instead of predicting whether an item belongs to a certain class, Complement Naive Bayes predicts the probability of the item belonging to all classes.

In [None]:
from sklearn.naive_bayes import ComplementNB

In [None]:
#Fit TF-IDF Vectors into CNB Model

CNB = ComplementNB()
CNB.fit(x_train, y_train)

In [None]:
#Evaluate model quality by quantifying the accuracy

accuracy_score = metrics.accuracy_score(CNB.predict(x_test), y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

Using Unigram TF-IDF Complement Naive Bayes produced an 85.81% Accuracy Score

#Bigrams

In [None]:
#Switching ngram_range=(2,2) to specify Bigrams

cv = CountVectorizer(ngram_range=(2,2), tokenizer = token.tokenize)
train_counts = cv.fit_transform(train['text'])
test_counts = cv.fit_transform(test['text'])

In [None]:
#Split into passable parameters to pass to model

x_train, x_test, y_train, y_test = train_test_split(
    train_counts, train.label, test_size = 0.20, random_state = 2)

Passing bigrams into MultinominalNB first using just bigram frequency vectors.

In [None]:
#Fitting the Model

MNB = MultinomialNB()
MNB.fit(x_train, y_train)

In [None]:
predicted = MNB.predict(x_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

Using Bigrams with MultinomialNB the accuracy of the classifier is 87.40%. Previously using Unigrams frequency counts the accuracy was 83.45%. There was rougly a 4% increase in accuracy when using Bigrams over Unigrams in this instance (frequency rather then TF-IDF).

In [None]:
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.fit_transform(test_counts)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    train_tfidf, train.label, test_size = 0.20, random_state = 2)

In [None]:
MNB = MultinomialNB()
MNB.fit(x_train, y_train)

In [None]:
predicted = MNB.predict(x_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

When using TF-IDF from the bigrams with the MNB the accuracy is 88.36%. This is an increase from the 87.40% from the frequency counts and an increase from the 84.99% from the Unigram TF-IDF MNB score.

Next the CNB on the bigrams:

In [None]:
#Fit CNB model with tf-idf vectors

CNB = ComplementNB()
CNB.fit(x_train, y_train)

In [None]:
#Evaluate accuracy

accuracy_score = metrics.accuracy_score(CNB.predict(x_test), y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

In [None]:
#Switch data to counts, not tf-idf

x_train, x_test, y_train, y_test = train_test_split(
    train_counts, train.label, test_size = 0.20, random_state = 2)

CNB = ComplementNB()
CNB.fit(x_train, y_train)

accuracy_score = metrics.accuracy_score(CNB.predict(x_test), y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

With the Complement NB algorithm, TF-IDF Bigrams scored 88.39% while Bigram Frequency Counts scored slightly lower at 87.48% Accuracy at predicting the proper class

#Other Classifiers (LinearSVC and SGDC) with Bigrams

Linear Support Vector attempts to find a hyperplane which divides the data in order to predict/classify the items

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [None]:
LSVC = LinearSVC()
SGDC = SGDClassifier()

Realized every time I switched between tf-idf and counts I kept resampling the test/train data. The variables should have been set and not changed throughout. I have now switched to x,X and y,Y to differentiate between counts vs tf-idf

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    train_tfidf, train.label, test_size = 0.20, random_state = 2)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
    train_counts, train.label, test_size = 0.20, random_state = 2)

In [None]:
LSVC.fit(x_train, y_train)
accuracy_score_lsvc = metrics.accuracy_score(LSVC.predict(x_test), y_test)
print(str('{:04.2f}'.format(accuracy_score_lsvc*100))+'%')

In [None]:
LSVC.fit(X_train, Y_train)
accuracy_score = metrics.accuracy_score(LSVC.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

Utilizing LinearSVC with Bigrams:

TF-IDF : 91.06% Accuracy

Counts : 89.21% Accuracy

SGDC is a linear classifier which utilizes the minimal cost function to predict classes

In [None]:
SGDC.fit(x_train, y_train)
accuracy_score = metrics.accuracy_score(SGDC.predict(x_test), y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

In [None]:
SGDC.fit(X_train, Y_train)
accuracy_score = metrics.accuracy_score(SGDC.predict(X_test), Y_test)
print(str('{:04.2f}'.format(accuracy_score*100))+'%')

SGDC with Bigrams:

TF-IDF : 89.98%

Counts : 89.21%