In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import spacy
import pickle

In [2]:
df = pd.read_csv('cleaned_holdout_set')

In [3]:
df.head()

Unnamed: 0,reviews,sentiment,cleaned_reviews
0,Alan Rickman & Emma Thompson give good perform...,0,alan rickman emma thompson give good performan...
1,I have seen this movie and I did not care for ...,0,i have see this movie and i do not care for th...
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0,in los angeles the alcoholic and lazy hank chi...
3,"This film is bundled along with ""Gli fumavano ...",0,this film be bundle along with gli fumavano le...
4,I only comment on really very good films and o...,0,i only comment on really very good film and on...


In [4]:
stopwords_list = stopwords.words('english')+['pron', '-PRON-']
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Import vectorizers/models

In [6]:
#BOW vectorizer
pickle_in = open("BOW_vect.pickle","rb")
BOW_vect = pickle.load(pickle_in)

In [7]:
#TFIDF vectorizer
pickle_in = open("TFIDF_vect.pickle","rb")
TFIDF_vect = pickle.load(pickle_in)

In [8]:
# logistic regresion model utilizing CountVectorizer
pickle_in = open("log_reg_mod.pickle","rb")
log_reg_mod = pickle.load(pickle_in)

In [9]:
# Naive Bayes model utilizing TfidfVectorizer
pickle_in = open("nb_mod.pickle","rb")
nb_mod = pickle.load(pickle_in)

## Logistic Regression Model

In [5]:
# choose features/target
predictors = df['cleaned_reviews']
target = df['sentiment']

In [10]:
# use vector from trained model
BOW_predictors = BOW_vect.transform(predictors)

In [11]:
# create dataframe with features as columns
BOW_predictors = pd.DataFrame(BOW_predictors.toarray(), columns=BOW_vect.get_feature_names())

In [12]:
#predict on holdout data
y_pred_log = log_reg_mod.predict(BOW_predictors)


# checking accuracy
print('Holdout Accuracy score: ', accuracy_score(y_pred_log, target))
print()
# checking F1
print('Holdout F1 score: ', f1_score(y_pred_log, target))
print()
# print confusion matrix
print('Confusion matrix Holdout set: \n', confusion_matrix(target, y_pred_log)/len(target))

Test Accuracy score:  0.87604
Test F1 score:  0.8770091677580664
Confusion matrix test set: 
 [[0.43408 0.05804]
 [0.06592 0.44196]]


## Naive Bayes Model

In [16]:
# choose features/target
predictors = df['cleaned_reviews']
target = df['sentiment']

In [17]:
# use vector from trained model
TFIDF_predictors=TFIDF_vect.transform(predictors)

In [18]:
# create dataframe with features as columns
TFIDF_predictors = pd.DataFrame(TFIDF_predictors.toarray(), columns=TFIDF_vect.get_feature_names())

In [19]:
#predict on holdout data
y_pred_nb = nb_mod.predict(predictors)

# checking accuracy
print('Holdout Accuracy score: ', accuracy_score(y_pred_nb, target))
print()
# checking F1
print('Holdout F1 score: ', f1_score(y_pred_nb, target))
print()
# print confusion matrix
print('Confusion matrix Holdout set: \n', confusion_matrix(target, y_pred_nb)/len(target))

Naive Bayes Test Accuracy Score : 0.85188
Test data f1 score:0.8502567835335031
confusion_matrix:[[0.43136 0.06864]
 [0.07948 0.42052]]


## Spacy TextCategorizer model

In [21]:
# load the saved model
output_dir=%pwd
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

Loading from /Users/jason/Flatiron/MOD4


In [22]:
# test that model is correctly loaded
test_text = "This movie is horrible"
doc = nlp2(test_text)
print(test_text, max(doc.cats, key=lambda key: doc.cats[key]))

This movie is horrible NEGATIVE


In [23]:
# test model one more time
test_text = "while there are good points in this movie overall its pretty meh"
doc = nlp2(test_text)
print(test_text, max(doc.cats, key=lambda key: doc.cats[key]))

while there are good points in this movie overall its pretty meh NEGATIVE


In [24]:
# create a list of reviews for model evaluation
test = df['reviews'].tolist()
len(test)

25000

In [25]:
# classify each review in the above list and append its calssification to a list
predictions = []
for review in test:
    doc = nlp2(review)
    predictions.append(max(doc.cats, key=lambda key: doc.cats[key]))

In [29]:
# creat a column based on the classified list then change to 0 and 1 for evaluation
df['predictions'] = predictions
df['predictions'] = np.where(df['predictions'] == 'NEGATIVE', 0, 1)

In [30]:
df.head()

Unnamed: 0,reviews,sentiment,cleaned_reviews,predictions
0,Alan Rickman & Emma Thompson give good perform...,0,alan rickman emma thompson give good performan...,0
1,I have seen this movie and I did not care for ...,0,i have see this movie and i do not care for th...,0
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0,in los angeles the alcoholic and lazy hank chi...,0
3,"This film is bundled along with ""Gli fumavano ...",0,this film be bundle along with gli fumavano le...,0
4,I only comment on really very good films and o...,0,i only comment on really very good film and on...,0


In [32]:
# get the column of predictions to evaluate model
spacy_predictions = df['predictions']

# checking accuracy
print('Holdout Accuracy score: ', accuracy_score(spacy_predictions, target))
print()
# checking F1
print('Holdout F1 score: ', f1_score(spacy_predictions, target))
print()
# print confusion matrix
print('Confusion matrix Holdout set: \n', confusion_matrix(target, spacy_predictions)/len(target))

Spacy Model accuracy : 0.89548
Test data f1 score:0.895834163842934
confusion_matrix:[[0.44604 0.05396]
 [0.05056 0.44944]]
