In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

import re
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from scipy import stats


In [5]:
df = pd.read_csv('./review_process.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Helpfulness_ratio,avg_score,normalized_score,positive_review
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1.0,5.0,0.0,True
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,,1.0,0.0,False
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1.0,4.333333,-0.333333,True
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,1.0,2.0,0.0,False
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,,5.0,0.0,True


In [8]:
#create document and labels to train the model
docs = df['Text']
labels = df['positive_review']

# Create bag of words
A bag of words representation is to encode language or text into numerical vectors. This happens by first creating a list of all vocabularies. Then for each data point, the above list is initialized with all zeros, then the corresponding index's entry is incremented for each word in the data point. 

For example, consider the sentence 'I eat an apple' with a vocabulary list of 'I', 'eat', 'an', 'apple', 'orange'. The cardinality of the vocabulary list is five, so there will be a 1x5 vector representation of each datapoint. In the above example's case, the representation will be [1,1,1,1,0]. Similarly, for the sentence 'I eat eat an orange,' the representaiton would be [1,2,1,0,1].

The benefit of this representation is that document similarities can be calculated via cosine similarity defined as below:

$$similarity = \frac{dot\_product(d_1, d_2)}{||d_1||*||d_2||}$$

In [9]:
stop_words = set(stopwords.words("English"))
def remove_stopwords(text):
    text = re.sub("[^a-zA-z]"," ",text)
    text = [w.lower() for w in text.split()]
    #words = [w for w in text if not w in cached_stop_words]
    words = [w for w in text if not w in stop_words]
    return ''.join(words)
    

In [10]:
clean_text = docs.apply(remove_stopwords)
clean_text.head()

0    boughtseveralvitalitycanneddogfoodproductsfoun...
1    productarrivedlabeledjumbosaltedpeanutspeanuts...
2    confectionaroundcenturieslightpillowycitrusgel...
3    lookingsecretingredientrobitussinbelievefoundg...
4    greattaffygreatpricewideassortmentyummytaffyde...
Name: Text, dtype: object

In [11]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None) 

In [12]:
train_features = vectorizer.fit_transform(clean_text)
train_features.shape

(568452, 398196)

# Creating the baseline models
Now that we have the bag of words, the goal will be to fit a logistic regression and Random Forest model to measure the accuracy, to set the baseline. The processed data will be split into a train/test split, with 70% of the data as the training set. Then the models will be fit with the training data, and accuracy will be measured on the testing set. Hyperparameter tuning will be done using gridsearch. Once the optimal hyperparamers are found, a confidence interval of accuracy will be measured to statistically prove the accuracies measured in the neural network in the next notebook is significantly better or not.

In [14]:
#Logistic Regression

xtrain, xtest, ytrain, ytest = train_test_split(train_features, labels, test_size = 0.3)

In [28]:
lr_parameters = [{'penalty':['l1','l2'], 'max_iter':[10,100], 'C':[0.1,1,10]}]
lr = LogisticRegression()
grid = GridSearchCV(lr, lr_parameters, scoring='accuracy')
grid.fit(train_features, labels)
best_params = grid.best_params_
print('The best parameters for using this model is ', best_params)

The best parameters for using this model is  {'C': 10, 'max_iter': 100, 'penalty': 'l2'}


In [33]:
from sklearn.model_selection import cross_val_score
lr_model = LogisticRegression(C = 10, penalty='l2')
print("logistic Regression performance: %f" % (cross_val_score(lr_model, train_features, labels, cv=5,scoring='accuracy').mean()))

logistic Regression performance: 0.864803


In [46]:
# Random Forest model
rf_parameters = { 'n_estimators': [30, 50],
              'max_depth' : [3,5],
              'criterion' :['gini', 'entropy']}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_parameters)
rf_grid.fit(train_features, labels)
best_params = rf_grid.best_params_
print('The best parameters for RandomForest model are ', best_params)

The best parameters for RandomForest model are  {'criterion': 'gini', 'max_depth': 3, 'n_estimators': 30}


In [47]:
rf_model = RandomForestClassifier(criterion = 'gini', max_depth=3, n_estimators=30)
print(" RandomForest performance: %f" % (cross_val_score(rf_model, train_features, labels, cv=5,scoring='accuracy').mean()))

 RandomForest performance: 0.780673


In [41]:
#SVM Model
svm_parameters = [{'penalty':['l1','l2'], 'loss':['hinge', 'log', 'squared_hinge']}]

svm_grid = GridSearchCV(SGDClassifier(), svm_parameters)
svm_grid.fit(train_features, labels)
best_params = svm_grid.best_params_
print('The best parameters for SVM model are ', best_params)





The best parameters for SVM model are  {'loss': 'squared_hinge', 'penalty': 'l2'}


In [42]:
from sklearn.model_selection import cross_val_score
svm_model = SGDClassifier(penalty='l2', loss='squared_hinge')
print("SVM performance: %f" % (cross_val_score(svm_model, train_features, labels, cv=5,scoring='accuracy').mean()))



SVM performance: 0.783677


After cross validation
The accuracy of the Logistic regression model is 0.86
The accuracy of the Random Forest model is 0.78
The accuracy of the SVM is 0.78
Therefore we need to utilize the LSTM RNNs or CNN to improve the predictive model