Data Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_curve, auc

import re
import string
from nltk.corpus import stopwords

from tqdm import tqdm
import os

In [2]:
# using SQLite Table to read data.
con = sqlite3.connect('database.sqlite') 

# filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3

filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000""", con) 

# Give reviews with Score>3 a positive rating(1), and reviews with a score<3 a negative rating(0).
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(3)

Number of data points in our data (5000, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [3]:
filtered_data['Score'].value_counts(normalize=True)

1    0.8374
0    0.1626
Name: Score, dtype: float64

In [4]:
display = pd.read_sql_query("""
SELECT UserId, ProductId, ProfileName, Time, Score, Text, COUNT(*)
FROM Reviews
GROUP BY UserId
HAVING COUNT(*)>1
""", con)

In [5]:
print(display.shape)
display.head()

(80668, 7)


Unnamed: 0,UserId,ProductId,ProfileName,Time,Score,Text,COUNT(*)
0,#oc-R115TNMSPFT9I7,B005ZBZLT4,Breyton,1331510400,2,Overall its just OK when considering the price...,2
1,#oc-R11D9D7SHXIJB9,B005HG9ESG,"Louis E. Emory ""hoppy""",1342396800,5,"My wife has recurring extreme muscle spasms, u...",3
2,#oc-R11DNU2NBKQ23Z,B005ZBZLT4,Kim Cieszykowski,1348531200,1,This coffee is horrible and unfortunately not ...,2
3,#oc-R11O5J5ZVQE25C,B005HG9ESG,Penguin Chick,1346889600,5,This will be the bottle that you grab from the...,3
4,#oc-R12KPBODL2B5ZD,B007OSBEV0,Christopher P. Presta,1348617600,1,I didnt like this coffee. Instead of telling y...,2


In [6]:
#It was inferred after analysis that reviews with same parameters other than ProductId belonged to the same product,
#just having different flavour or quantity.
#Hence in order to reduce redundancy it was decided to eliminate the rows having same parameters.

In [7]:
sorted_data=filtered_data.sort_values('Id', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [8]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(4986, 10)

In [9]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [10]:
final['Text'] = final['Summary'].astype(str) + "\n" + \
                 final['Text'].astype(str)

In [11]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [12]:
# https://gist.github.com/sebleier/554280
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [13]:
from tqdm import tqdm
from bs4 import BeautifulSoup

preprocessed_reviews = []
# tqdm is for printing the status bar
for sentence in tqdm(final['Text'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentence.strip())

100%|██████████| 4986/4986 [00:02<00:00, 2148.21it/s]


In [14]:
final['Text']=preprocessed_reviews

In [15]:
final['Word_count']=final['Text'].str.split().str.len()

In [16]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Word_count
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,good quality dog food bought several vitality ...,27
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,not advertised product arrived labeled jumbo s...,21
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",delight says confection around centuries light...,43
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,cough medicine looking secret ingredient robit...,20
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,great taffy great taffy great price wide assor...,15


## Lemmatization or Stemming

In [17]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem=WordNetLemmatizer()
por=PorterStemmer()

In [18]:
tokens=[word_tokenize(word) for word in final['Text']]

In [19]:
stemmed_tokens=[[lem.lemmatize(word) for word in sentence] for sentence in tokens]

In [20]:
tokens

[['good',
  'quality',
  'dog',
  'food',
  'bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'products',
  'found',
  'good',
  'quality',
  'product',
  'looks',
  'like',
  'stew',
  'processed',
  'meat',
  'smells',
  'better',
  'labrador',
  'finicky',
  'appreciates',
  'product',
  'better'],
 ['not',
  'advertised',
  'product',
  'arrived',
  'labeled',
  'jumbo',
  'salted',
  'peanuts',
  'peanuts',
  'actually',
  'small',
  'sized',
  'unsalted',
  'not',
  'sure',
  'error',
  'vendor',
  'intended',
  'represent',
  'product',
  'jumbo'],
 ['delight',
  'says',
  'confection',
  'around',
  'centuries',
  'light',
  'pillowy',
  'citrus',
  'gelatin',
  'nuts',
  'case',
  'filberts',
  'cut',
  'tiny',
  'squares',
  'liberally',
  'coated',
  'powdered',
  'sugar',
  'tiny',
  'mouthful',
  'heaven',
  'not',
  'chewy',
  'flavorful',
  'highly',
  'recommend',
  'yummy',
  'treat',
  'familiar',
  'story',
  'c',
  'lewis',
  'lion',
  'witch',
 

In [21]:
stemmed_tokens

[['good',
  'quality',
  'dog',
  'food',
  'bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'product',
  'found',
  'good',
  'quality',
  'product',
  'look',
  'like',
  'stew',
  'processed',
  'meat',
  'smell',
  'better',
  'labrador',
  'finicky',
  'appreciates',
  'product',
  'better'],
 ['not',
  'advertised',
  'product',
  'arrived',
  'labeled',
  'jumbo',
  'salted',
  'peanut',
  'peanut',
  'actually',
  'small',
  'sized',
  'unsalted',
  'not',
  'sure',
  'error',
  'vendor',
  'intended',
  'represent',
  'product',
  'jumbo'],
 ['delight',
  'say',
  'confection',
  'around',
  'century',
  'light',
  'pillowy',
  'citrus',
  'gelatin',
  'nut',
  'case',
  'filbert',
  'cut',
  'tiny',
  'square',
  'liberally',
  'coated',
  'powdered',
  'sugar',
  'tiny',
  'mouthful',
  'heaven',
  'not',
  'chewy',
  'flavorful',
  'highly',
  'recommend',
  'yummy',
  'treat',
  'familiar',
  'story',
  'c',
  'lewis',
  'lion',
  'witch',
  'wardrobe'

In [22]:
ab=[" ".join(i) for i in stemmed_tokens]

In [23]:
final['Text']=ab

In [24]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Word_count
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,good quality dog food bought several vitality ...,27
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,not advertised product arrived labeled jumbo s...,21
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",delight say confection around century light pi...,43
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,cough medicine looking secret ingredient robit...,20
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,great taffy great taffy great price wide assor...,15


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(final[["Text",'Word_count']].reset_index().drop('index',axis=1), final['Score'].reset_index().drop('index',axis=1), random_state = 0,shuffle=False)

In [26]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams

count_vect = CountVectorizer(ngram_range=(1,2),max_features=5000)
final_bigram_counts = count_vect.fit_transform(X_train['Text'])
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the shape of out text BOW vectorizer  (3739, 5000)
the number of unique words including both unigrams and bigrams  5000


In [27]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2),max_features=5000)

final_tf_idf = tf_idf_vect.fit_transform(X_train['Text'])
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

the shape of out text TFIDF vectorizer  (3739, 5000)
the number of unique words including both unigrams and bigrams  5000


In [28]:
a=pd.DataFrame(final_bigram_counts.toarray(),columns=count_vect.get_feature_names())

In [29]:
b=pd.DataFrame(final_tf_idf.toarray(),columns=tf_idf_vect.get_feature_names())

In [30]:
a.shape

(3739, 5000)

In [31]:
b.shape

(3739, 5000)

In [32]:
X_train_count=pd.concat([a,X_train[['Word_count']]],axis=1)
X_train_tf=pd.concat([b,X_train[['Word_count']]],axis=1)

In [33]:
X_train["Word_count"].shape

(3739,)

In [34]:
X_train_count.head()

Unnamed: 0,ability,able,able buy,able find,able get,absolute,absolute favorite,absolutely,absolutely delicious,absolutely love,...,yuk,yum,yum yum,yummy,yummy love,zero,zero star,zip,zip lock,Word_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,43
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,15


In [35]:
X_train_tf.head()

Unnamed: 0,ability,able,able buy,able find,able get,absolute,absolute favorite,absolutely,absolutely delicious,absolutely love,...,yuk,yum,yum yum,yummy,yummy love,zero,zero star,zip,zip lock,Word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.138978,0.0,0.0,0.0,0.0,0.0,43
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.123157,0.0,0.0,0.0,0.0,0.0,15


In [36]:
# Readying test data

In [37]:
test_count = count_vect.transform(X_test['Text'])
test_tf = tf_idf_vect.transform(X_test['Text'])

In [38]:
a=pd.DataFrame(test_count.toarray(),columns=count_vect.get_feature_names())
b=pd.DataFrame(test_tf.toarray(),columns=tf_idf_vect.get_feature_names())

In [39]:
X_test_count=pd.concat([a,X_test[['Word_count']].reset_index().drop('index',axis=1)],axis=1)
X_test_tf=pd.concat([b,X_test[['Word_count']].reset_index().drop('index',axis=1)],axis=1)

In [40]:
X_test_count.head()

Unnamed: 0,ability,able,able buy,able find,able get,absolute,absolute favorite,absolutely,absolutely delicious,absolutely love,...,yuk,yum,yum yum,yummy,yummy love,zero,zero star,zip,zip lock,Word_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,63
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,155
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,157


In [41]:
from sklearn.preprocessing import MinMaxScaler
mm=MinMaxScaler()
X_train_count=mm.fit_transform(X_train_count)
X_test_count=mm.transform(X_test_count)

In [42]:
mm1=MinMaxScaler()
X_train_tf=mm1.fit_transform(X_train_tf)
X_test_tf=mm1.fit_transform(X_test_tf)

In [43]:
#Converting back to sparse matrix

In [44]:
import scipy

In [45]:
X_train_count=scipy.sparse.csr_matrix(X_train_count)
X_train_tf=scipy.sparse.csr_matrix(X_train_tf)

In [46]:
X_test_count=scipy.sparse.csr_matrix(X_test_count)
X_test_tf=scipy.sparse.csr_matrix(X_test_tf)

In [47]:
# Modeling

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score

In [49]:
def get_auc_score(X_train,X_test,y_train,y_test,classifier,vectorizer):
    cls=classifier()
    scores=cross_val_score(cls,X_train,y_train,scoring='roc_auc',cv=5)
    if vectorizer==count_vect:
        print('For countvectorizer:')
    else:
        print('For tf_idf')
    print('AUC: for ',classifier,' ', scores.mean())

In [50]:
get_auc_score(X_train_count,X_test_count,y_train,y_test,LogisticRegression,count_vect)
get_auc_score(X_train_tf,X_test_tf,y_train,y_test,LogisticRegression,tf_idf_vect)

For countvectorizer:
AUC: for  <class 'sklearn.linear_model._logistic.LogisticRegression'>   0.9366478662635052
For tf_idf
AUC: for  <class 'sklearn.linear_model._logistic.LogisticRegression'>   0.9403533460651948


In [51]:
get_auc_score(X_train_count,X_test_count,y_train,y_test,MultinomialNB,count_vect)
get_auc_score(X_train_tf,X_test_tf,y_train,y_test,MultinomialNB,tf_idf_vect)

For countvectorizer:
AUC: for  <class 'sklearn.naive_bayes.MultinomialNB'>   0.9097813573697252
For tf_idf
AUC: for  <class 'sklearn.naive_bayes.MultinomialNB'>   0.9090631603035038


In [52]:
get_auc_score(X_train_count,X_test_count,y_train,y_test,RandomForestClassifier,count_vect)
get_auc_score(X_train_tf,X_test_tf,y_train,y_test,RandomForestClassifier,tf_idf_vect)

For countvectorizer:
AUC: for  <class 'sklearn.ensemble._forest.RandomForestClassifier'>   0.8983349965147062
For tf_idf
AUC: for  <class 'sklearn.ensemble._forest.RandomForestClassifier'>   0.9034914603615423


In [53]:
get_auc_score(X_train_count,X_test_count,y_train,y_test,XGBClassifier,count_vect)
get_auc_score(X_train_tf,X_test_tf,y_train,y_test,XGBClassifier,tf_idf_vect)

For countvectorizer:
AUC: for  <class 'xgboost.sklearn.XGBClassifier'>   0.9221446798320075
For tf_idf
AUC: for  <class 'xgboost.sklearn.XGBClassifier'>   0.9253238617481655


In [54]:
# Logistic Regression yields best result so far.

In [55]:
#SCORE ON TEST DATA

In [56]:
def auc_test(X_train,X_test,y_train,y_test,classifier,vectorizer):
    cls=classifier()
    cls.fit(X_train,y_train)
    if vectorizer==count_vect:
        print('For countvectorizer:')
    else:
        print('For tf_idf')
    print('AUC: for ',classifier,' ', roc_auc_score(cls.predict(X_test),y_test))

In [57]:
auc_test(X_train_tf,X_test_tf,y_train,y_test,LogisticRegression,tf_idf_vect)

For tf_idf
AUC: for  <class 'sklearn.linear_model._logistic.LogisticRegression'>   0.8621105169340464


In [58]:
lr_final=LogisticRegression()
lr_final.fit(X_train_tf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
feature_names = np.array(tf_idf_vect.get_feature_names())
sorted_coef_index = lr_final.coef_[0].argsort()
print('Smallest Coef: \n{}\n'.format(feature_names[sorted_coef_index-1][:10]))
print('Largest Coef: \n{}\n'.format(feature_names[sorted_coef_index-1][:-11:-1]))

Smallest Coef: 
['disappoint' 'nose' 'not going' 'awesome' 'worse' 'hormel compleats'
 'aware' 'term' 'disappointed product' 'swiss']

Largest Coef: 
['greasy' 'lot le' 'besides' 'goo' 'delicate' 'next time' 'percent'
 'example' 'smoothy' 'yum yum']

