In [110]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.model_selection import GridSearchCV

from sklearn.utils import shuffle


In [111]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import re

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use("ggplot")
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)
from os import path
#from PIL import image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [112]:
#import data set as dataframe and use smaller subset
#for simple modeling purposes
df=pd.read_csv("data/rotten_tomatoes_reviews.csv")
df = shuffle(df)
df_quick = df[:10000]
df_holdout = df[400000:]
df_quick.head()

Unnamed: 0,Freshness,Review
620,1,"Buoyant and lightweight... except for Brando,..."
176092,0,"Years from now, chances are that when people ..."
316086,0,For those of us who want movies to be about s...
396557,1,"A good-natured, if inconsequential, thrill ri..."
371579,1,Quentin Tarantino's latest film is a deliciou...


In [113]:
def cleaned_dframe(df, col_name = None):
    """Purpose: Take in a text based Dataframe and return a cleaned text 
    dataframe by using regex, lowercasing, stripping stop words and lemmatizing
    
    Input: Dataframe with only text column
    Output: Dataframe with only cleaned text column"""
    
    #using regexp notation to get rid of numbers in reviews
    df[col_name] = df[col_name].str.replace(r'([^a-zA-Z\s]+?)'," ")

    # 1. Create a set of documents.
    df[col_name] = df[col_name].apply(lambda x : x.lower())

    # 2. Create a set of tokenized documents.
    docs_tokenized = [word_tokenize(content) for content in df[col_name].values]
    

    # 3. Strip out stop words from each tokenized document.

    stop = set(stopwords.words('english'))
#    new_stopwords = set(["film","movie","like","feel","time","little","adject", "adds",
#                        "bestloved","agonizingly","bantamweight"])
#    stop.update(new_stopwords)
    docs_stop = [[word for word in words if word not in stop] for words in docs_tokenized]

    # Stemming / Lemmatization

    # 1. Stem using lemmatizer
    wordnet = WordNetLemmatizer()
    docs_wordnet = [[wordnet.lemmatize(word) for word in words] for words in docs_stop]

    new_element =[]
    for element in docs_wordnet:
        test = " ".join(element)
        new_element.append(test) 
    new_series = pd.Series(new_element)
    col = "text"
    cleaned_df = pd.DataFrame(new_series,columns = [col])
    return cleaned_df



In [123]:
col= "Review"
cleaned_df = cleaned_dframe(df_quick.copy(), col)
cleaned_df

Unnamed: 0,text
0,buoyant lightweight except brando fact rather ...
1,year chance people sit around talk enthusiasti...
2,u want movie something empty action minimal sh...
3,good natured inconsequential thrill ride shepa...
4,quentin tarantino latest film deliciously ente...
5,perhaps maclachlan visual vocabulary limited f...
6,redford honorable job emulating author unsenti...
7,blomkamp far better observer technological beh...
8,akin achieves peaceful balance alongside death...
9,light silly instantly forgettable comedy peppe...


In [124]:
def text2num(cleaned_df, col= None, train=True, cv=None, tfidf=None):
    """Purpose: receive a df and a column for text and turn text into CountVectorized Data and tfidf
    data
    
    Input: DataFrame with string column to be numerically vectorized
    Output: A doc word count matrix and a tfidf matrix as well"""
    
    str_data = cleaned_df[col].values
    if train == True:
        X_counts = cv.fit_transform(str_data)
        X_counts_tfidf_arr = tfidf.fit_transform(X_counts).toarray()
    else:
        X_counts = cv.transform(str_data)
        X_counts_tfidf_arr = tfidf.transform(X_counts).toarray()
    return X_counts, X_counts_tfidf_arr
    
    

In [227]:
cv = CountVectorizer(lowercase=True, tokenizer=None, strip_accents= "ascii", stop_words="english",
                             analyzer='word', max_df=1.0, min_df=2,ngram_range=(1,1),
                             max_features=4500)
tfidf = TfidfTransformer(use_idf=True)


In [228]:
col2= "text"
train = True
X_counts, X_counts_tfidf_arr = text2num(cleaned_df.copy(), col2, train, cv, tfidf)
X_counts, X_counts_tfidf_arr

(<10000x4500 sparse matrix of type '<class 'numpy.int64'>'
 	with 87893 stored elements in Compressed Sparse Row format>,
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [229]:
df_train = df[:30000]
df_test = df[30000:37500]
y_train = df_train.Freshness
y_test = df_test.Freshness
df_train_clean = cleaned_dframe(df_train.copy(),"Review")
df_test_clean = cleaned_dframe(df_test.copy(),"Review")
X_counts_train, X_counts_tfidf_arr_train = text2num(df_train_clean.copy(),"text",True,cv,tfidf)
X_counts_test, X_counts_tfidf_arr_test = text2num(df_test_clean.copy(),"text",False,cv,tfidf)
X_counts_test,X_counts_tfidf_arr_test
df_holdout = df[37500:45000]
y_holdout = df_holdout.Freshness
df_holdout_clean = cleaned_dframe(df_holdout.copy(),"Review")
X_counts_holdout, X_counts_tfidf_arr_holdout = text2num(df_holdout_clean.copy(),"text",False,cv,tfidf)

In [230]:
nb_model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [231]:
nb_model.fit(X_counts_tfidf_arr_train, y_train)
nb_model.score(X_counts_tfidf_arr_train,y_train), nb_model.score(X_counts_tfidf_arr_test,y_test),nb_model.score(X_counts_tfidf_arr_holdout,y_holdout)

(0.7926, 0.744, 0.7470666666666667)

In [122]:
# Problem is that i changed the stop words due to misclassifications in my 
# test set.  This could be data leakage.  Let me check on a holdout set.

df_holdout2 = df[30000:35000]
y_holdout2 = df_test.Freshness
df_holdout_clean2 = cleaned_dframe(df_holdout2.copy(),"Review")
X_counts_holdout2, X_counts_tfidf_arr_holdout2 = text2num(df_holdout_clean2.copy(),"text",False,cv,tfidf)
nb_model.score(X_counts_tfidf_arr_holdout2,y_holdout2)

#It is unfortunately confirmed.  I do not have enough features or data in my
#model to get useful information.  Let's use gridsearch to arrive at a good
#small model. Let's start by going back and taking the new stop words out of 
#stop words.

0.4988

In [86]:
# This is really weird.  My test score varies from .72 down to .5 based on
# the sample chose.  I suspect that I do not have enough data rows or features.

In [87]:
# parameter_candidates = [
#   {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
#   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
# ]
parameter_candidates = [{"max_features":[5000,10000,20000,100000]}]

In [88]:
# Create a classifier object with the classifier and parameter candidates
clf = GridSearchCV(estimator=nb_model, param_grid=parameter_candidates, cv=5, n_jobs=-1)

# Train the classifier on data1's feature and target data
clf.fit(data1_features, data1_target) 

NameError: name 'data1_features' is not defined

In [None]:
# View the accuracy score
print('Best score for data1:', clf.best_score_)

In [None]:
# View the best parameters for the model found using grid search
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)