In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
import matplotlib.pyplot as plt
from functions import *
import nltk
from nltk.corpus import gutenberg, stopwords
from nltk.collocations import *
from nltk import FreqDist
from nltk import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the reviews table from CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs unchanged on a machine that has this exact location.
df = pd.read_csv(
    filepath_or_buffer='/Users/kevinmacmat/Desktop/mod_4_nlp_project/reviews_dataframe'
)

In [3]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,reviews,sentiment
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0
...,...,...
24995,About a year ago I finally gave up on American...,1
24996,When I saw the elaborate DVD box for this and ...,1
24997,"Last November, I had a chance to see this film...",1
24998,Great movie -I loved it. Great editing and use...,1


### Create feature and target variables and train-test split

In [5]:
# Model input is the 'reviews' column; the label to predict is the 'sentiment' column.
X, y = df['reviews'], df['sentiment']

In [6]:
# Hold out a test split.
# test_size=0.25 is scikit-learn's default, written out explicitly.
# random_state pins the shuffle so a fresh Restart & Run All reproduces the same
# split, and therefore the same downstream scores, on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Instantiate the count vectorizer

In [8]:
# Build the bag-of-words vectorizer.
# stop_words='english' selects scikit-learn's built-in English stop-word list (the
# same set exposed as ENGLISH_STOP_WORDS). Passing the frozenset object itself is
# rejected by newer scikit-learn releases, which accept only 'english', a list, or None.
# min_df=10 drops terms that occur in fewer than 10 documents.
vect = CountVectorizer(stop_words='english', min_df=10)
# Other options worth trying:
#   ngram_range=(1, 2)  -> unigram + bigram features
#   max_df=200          -> ignore terms that occur in more than 200 documents
#   min_df=50           -> ignore terms that occur in fewer than 50 documents

### Fit, transform, and get a document-term-matrix

In [11]:
# Learn the vocabulary from the training reviews only and encode them as a
# document-term matrix.
# NOTE: X_train / X_test are rebound from raw text to matrices here, so this cell
# cannot be re-run on its own; re-run the train/test split cell first.
X_train = vect.fit_transform(X_train)
# Encode the held-out reviews with the vocabulary fitted above (no refitting).
X_test = vect.transform(X_test)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
# Dense bag-of-words view of the *test* split, one column per vocabulary term.
X_df = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [22]:
# Show the dense document-term DataFrame built from the test split.
X_df

Unnamed: 0,aag,aaron,abandon,abandoned,abandoning,abandonment,abandons,abbey,abbot,abbott,...,zombie,zombies,zone,zoo,zoom,zooms,zorro,zu,zulu,zuniga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6247,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6248,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Classify

In [14]:
# Two baseline classifiers to be trained on the same bag-of-words features.
nb_classifier = MultinomialNB()
# random_state fixes the forest's internal randomness so that its predictions and
# accuracy are reproducible from one run to the next.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [15]:
# Train Naive Bayes on the training matrix, then predict labels for the training
# split and the held-out split (in that order).
nb_classifier.fit(X_train, y_train)
nb_train_preds, nb_test_preds = (
    nb_classifier.predict(features) for features in (X_train, X_test)
)

In [16]:
# Train the random forest on the training matrix, then predict labels for the
# training split and the held-out split (in that order).
rf_classifier.fit(X_train, y_train)
rf_train_preds, rf_test_preds = [
    rf_classifier.predict(features) for features in (X_train, X_test)
]

In [19]:
# Accuracy of each model on its training split and on the held-out split.
nb_train_score, nb_test_score = (
    accuracy_score(y_train, nb_train_preds),
    accuracy_score(y_test, nb_test_preds),
)
rf_train_score, rf_test_score = (
    accuracy_score(y_train, rf_train_preds),
    accuracy_score(y_test, rf_test_preds),
)

# Emit the whole report with one print; joining the lines with newlines yields
# the same text as printing each line separately.
print("\n".join([
    "Multinomial Naive Bayes",
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score),
    "",
    '-'*70,
    "",
    'Random Forest',
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score),
]))

Multinomial Naive Bayes
Training Accuracy: 0.8803 		 Testing Accuracy: 0.8515

----------------------------------------------------------------------

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.8467
