# Amazon Food Reviews
### data is downloaded from https://www.kaggle.com/snap/amazon-fine-food-reviews

In [1]:
%matplotlib inline
import sqlite3
import sklearn
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
import os
import string
from gensim.models import Word2Vec, KeyedVectors
import pickle



## Data Cleaning: deduplication and anomaly removal

In [3]:
conn = sqlite3.connect('C:/Users/HP/Desktop/ml/afr.sqlite')
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, conn)
filtered_data['Score'] = filtered_data.Score.apply(lambda x: 1 if x>3 else 0)
filtered_data.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [4]:
#sorting data on the basis of ProductId
sorted_data = filtered_data.sort_values('ProductId', axis = 0, ascending = True, inplace = False, 
                                        kind = 'quicksort', na_position = 'last')
sorted_data.keys()

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [5]:
final =  sorted_data.drop_duplicates(subset = {"UserId", "Time", "Text"}, keep = 'first', inplace = False)
final.shape

(364133, 10)

In [6]:
final = final.drop_duplicates(subset = {"UserId", "ProductId"}, keep = 'first', inplace = False)
final.shape

(362899, 10)

In [7]:
final = final.drop_duplicates(subset = {"ProductId", "Text", "Time"}, keep = 'first', inplace = False)
final.shape

(362893, 10)

In [8]:
final = final.drop_duplicates(subset = {"UserId", "ProductId"}, keep = 'first', inplace = False)
final.shape

(362893, 10)

In [9]:
def ams(filtered_data, regex_string):
    mask = filtered_data.Summary.str.lower().str.contains(regex_string)
    filtered_data.drop(filtered_data[mask].index, inplace = True)
def amt(filtered_data, regex_string):
    mask = filtered_data.Text.str.lower().str.contains(regex_string)
    filtered_data.drop(filtered_data[mask].index, inplace = True)
ams(final, re.compile(r"\story|book|bbook|learn|read&/((?!review).)*/s"))

  


In [10]:
final.shape

(362764, 10)

In [11]:
amt(final, re.compile(r"\story|book|bbook|learn|read&/((?!review).*/s)"))
final.shape

  """


(357386, 10)

In [12]:
ams(final,re.compile(r"\breads"))
amt(final, re.compile(r"\breads"))
ams(final, re.compile(r"\breading"))
amt(final, re.compile(r"\breading"))
ams(final, re.compile(r"\  movie"))
amt(final, re.compile(r"\ movie"))
ams(final, re.compile(r"\dress"))
amt(final, re.compile(r"\dress"))
ams(final, re.compile(r"\story"))
final.shape

(352572, 10)

In [13]:
fianl = final.drop_duplicates(subset = {"UserId", "Time"}, inplace = False, keep = 'first')
final.shape

(352572, 10)

In [14]:
final.keys()

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [15]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
final.shape

(352570, 10)

In [16]:
print(final['Text'].values[:5])

["A very entertaining rhyming story--cleaver and catchy.The illustrations are imaginative and fit right in. However, the paperback is somewhat small and flimsy. I'd opt for a bigger edition."
 'This product by Archer Farms is the best drink mix ever. Just mix a flavored packet with your 16 oz. water bottle. Contains the all natural sweetner Stevia, real fruit flavoring and no food coloring. Just colored with fruit or vegetable colors. Pure and natural and tastes great. There are eight packets in a box and only contains 10 calories per packet. Thank you Archer Farms!'
 'Our dogs just love them.  I saw them in a pet store and a tag was attached regarding them being made in China and it satisfied me that they were safe.'
 'My dogs loves this chicken but its a product from China, so we wont be buying it anymore.  Its very hard to find any chicken products made in the USA but they are out there, but this one isnt.  Its too bad too because its a good product but I wont take any chances till 

In [17]:
final = final[1:]

## Text Preprocessing

In [18]:
stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')
i =0
last = final.shape[0]
print(last)
final_string = []
pos_words = []
neg_words = []
while i <last:
    sent = final['Text'].values[i]
    sent = re.sub(r'[.|,|)|(|\|/|!]', r' ', sent)
    sent = re.sub(re.compile('<.*?>'),' ', sent)
    sent = re.sub('[^A-Za-z0-9]+', r' ', sent)
    sent = re.sub("\S*\d\S*", r' ', sent)
    new_sent = []
    for w in sent.split():
        if(len(w)>2 and w.isalpha()):
            if(w.lower() not in stop):
                w = (sno.stem(w.lower())).encode('utf8')
                new_sent.append(w)
                if final['Score'].values[i]==1:
                    pos_words.append(w)
                else:
                    neg_words.append(w)
    sent = b" ".join(new_sent) 
    final_string.append(sent)
    i=i+1
final['CleanedText'] = final_string
final['CleanedText'] = final['CleanedText'].str.decode("utf8")

352569


In [19]:
final.keys()

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'CleanedText'],
      dtype='object')

In [20]:
final['CleanedText'].head()

476617    product archer farm best drink mix ever mix fl...
22621     dog love saw pet store tag attach regard made ...
22620     dog love chicken product china wont buy anymor...
157850    feed greyhound treat year hound littl finicki ...
157849    one product welsh terrier eat sophi food alerg...
Name: CleanedText, dtype: object

In [21]:
conn = sqlite3.connect('C:/Users/HP/Desktop/ml/final.sqlite')
c = conn.cursor()
conn.text_factory  = str
final.to_sql('Reviews', conn, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)
conn.close()

## Bag of Words
##      Using CountVectorizer()

In [22]:
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(final['CleanedText'].values)
final_counts

<352569x68616 sparse matrix of type '<class 'numpy.int64'>'
	with 10716885 stored elements in Compressed Sparse Row format>

In [23]:
print("the shape of out text BOW vectorizer ",final_counts.get_shape())
print("the number of unique words ", final_counts.get_shape()[1])

the shape of out text BOW vectorizer  (352569, 68616)
the number of unique words  68616


## Using TfidfTransformer()

In [24]:
tfidf_transform = TfidfTransformer(smooth_idf = False)
final_tfidf_transform = tfidf_transform.fit_transform(final_counts)
final_tfidf_transform

<352569x68616 sparse matrix of type '<class 'numpy.float64'>'
	with 10716885 stored elements in Compressed Sparse Row format>

## Using TfidfVectorizer()

In [25]:
tfidf_vect = TfidfVectorizer()
final_tfidf_vect= tfidf_vect.fit_transform(final['CleanedText'].values)
final_tfidf_vect

<352569x68616 sparse matrix of type '<class 'numpy.float64'>'
	with 10716885 stored elements in Compressed Sparse Row format>

In [26]:
print(final_tfidf_vect)

  (0, 47436)	0.06050537545914882
  (0, 3080)	0.49155353467559537
  (0, 21287)	0.3144026918747362
  (0, 5678)	0.07572620583226496
  (0, 17960)	0.08465896677381184
  (0, 38444)	0.17262630288126524
  (0, 20317)	0.09433020770849425
  (0, 22290)	0.11910170060947274
  (0, 43523)	0.37144731028038597
  (0, 65785)	0.09060375134405176
  (0, 7036)	0.1051422337103849
  (0, 13159)	0.20134392569659074
  (0, 40139)	0.2004016445471627
  (0, 58841)	0.17334011588740228
  (0, 57271)	0.14771650641216308
  (0, 49174)	0.1058673881982854
  (0, 23537)	0.21854637171002342
  (0, 22696)	0.07787433001599138
  (0, 12386)	0.35553734130836867
  (0, 64542)	0.1315135280839111
  (0, 48133)	0.1313641950428206
  (0, 59492)	0.05285735212605119
  (0, 25921)	0.05873018615907351
  (0, 18909)	0.1664268528135696
  (0, 7163)	0.08628351096256691
  :	:
  (352568, 60224)	0.13278784098647836
  (352568, 35238)	0.07461602825099076
  (352568, 2652)	0.17758913865028697
  (352568, 27103)	0.12585703865280465
  (352568, 25374)	0.073225748

## Considering Bigrams in CountVectorizer() and TfidfVectorizer()

In [27]:
count_vect = CountVectorizer(ngram_range = (1, 2)) #in scikit-learn
final_counts = count_vect.fit_transform(final['CleanedText'].values)
final_counts

<352569x2779476 sparse matrix of type '<class 'numpy.int64'>'
	with 23117833 stored elements in Compressed Sparse Row format>

In [28]:
bigram_tfidf_vect = TfidfVectorizer(ngram_range = (1, 2))
bigram_final_tfidf_vect= tfidf_vect.fit_transform(final['CleanedText'].values)
bigram_final_tfidf_vect

<352569x68616 sparse matrix of type '<class 'numpy.float64'>'
	with 10716885 stored elements in Compressed Sparse Row format>

## Word2Vec

In [42]:
list_of_sent = []
for sent in final['CleanedText'].values:
    list_of_sent.append(sent.split())

In [43]:
list_of_sent

[['product',
  'archer',
  'farm',
  'best',
  'drink',
  'mix',
  'ever',
  'mix',
  'flavor',
  'packet',
  'water',
  'bottl',
  'contain',
  'natur',
  'sweetner',
  'stevia',
  'real',
  'fruit',
  'flavor',
  'food',
  'color',
  'color',
  'fruit',
  'veget',
  'color',
  'pure',
  'natur',
  'tast',
  'great',
  'eight',
  'packet',
  'box',
  'contain',
  'calori',
  'per',
  'packet',
  'thank',
  'archer',
  'farm'],
 ['dog',
  'love',
  'saw',
  'pet',
  'store',
  'tag',
  'attach',
  'regard',
  'made',
  'china',
  'satisfi',
  'safe'],
 ['dog',
  'love',
  'chicken',
  'product',
  'china',
  'wont',
  'buy',
  'anymor',
  'hard',
  'find',
  'chicken',
  'product',
  'made',
  'usa',
  'one',
  'isnt',
  'bad',
  'good',
  'product',
  'wont',
  'take',
  'chanc',
  'till',
  'know',
  'go',
  'china',
  'import'],
 ['feed',
  'greyhound',
  'treat',
  'year',
  'hound',
  'littl',
  'finicki',
  'love',
  'treat',
  'expens',
  'relat',
  'biscuit',
  'find',
  'good'

In [44]:
w2v_model = Word2Vec(list_of_sent, min_count = 5, size = 50, workers = 4)

In [45]:
w2v_words = list(w2v_model.wv.vocab)

In [51]:
w2v_model.wv.most_similar('tasti')

[('delici', 0.8180373311042786),
 ('yummi', 0.799370288848877),
 ('good', 0.6957471370697021),
 ('tastey', 0.6858482956886292),
 ('nice', 0.6728922724723816),
 ('hearti', 0.6589317321777344),
 ('satisfi', 0.6502076387405396),
 ('great', 0.6346753239631653),
 ('delish', 0.6196246147155762),
 ('nutriti', 0.6106041669845581)]

In [53]:
len(w2v_words)

21336

In [52]:
w2v_words[0:50]

['product',
 'archer',
 'farm',
 'best',
 'drink',
 'mix',
 'ever',
 'flavor',
 'packet',
 'water',
 'bottl',
 'contain',
 'natur',
 'sweetner',
 'stevia',
 'real',
 'fruit',
 'food',
 'color',
 'veget',
 'pure',
 'tast',
 'great',
 'eight',
 'box',
 'calori',
 'per',
 'thank',
 'dog',
 'love',
 'saw',
 'pet',
 'store',
 'tag',
 'attach',
 'regard',
 'made',
 'china',
 'satisfi',
 'safe',
 'chicken',
 'wont',
 'buy',
 'anymor',
 'hard',
 'find',
 'usa',
 'one',
 'isnt',
 'bad']

## Avg W2V, TFIDF-W2V

In [54]:
sent_vectors = []; # the avg-w2v for each review is stored in this list
for sent in tqdm(list_of_sent):
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the review
    for word in sent: # for each word in a review
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)

100%|█████████████████████████████████████████████████████████████████████████| 352569/352569 [14:37<00:00, 494.53it/s]


In [60]:
len(sent_vectors)

352569

In [57]:
dictionary = dict(zip(tfidf_vect.get_feature_names(), list(tfidf_vect.idf_)))

In [58]:
tfidf_feat = tfidf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sent): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|█████████████████████████████████████████████████████████████████████████| 352569/352569 [17:27<00:00, 336.64it/s]


In [61]:
len(tfidf_sent_vectors)

352569

In [63]:
data_1000 = sent_vectors[0:1000][:]

In [66]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
# configuring the parameteres
# the number of components = 2
# default perplexity = 30
# default learning rate = 200
# default Maximum number of iterations for the optimization = 1000

tsne_data = model.fit_transform(data_1000)

