### Amazon Food Reviews 

In [9]:
# packages

In [193]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, precision_score,recall_score,f1_score,accuracy_score,auc
from sklearn import metrics
from sklearn.metrics import auc,roc_curve

# NLP
import re, string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle, os, sys, json, warnings

%matplotlib inline

In [26]:
# load the data 

import gzip

def parse(path):
    g = gzip.open(path,'rb')
    for file in g:
        yield json.loads(file)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient = 'index')

In [27]:
df_a = getDF('Dataset/Appliances_5.json.gz')
df_b = getDF('Dataset/All_Beauty_5.json.gz')
df_c = getDF('Dataset/AMAZON_FASHION_5.json.gz')

df = pd.concat([df_a,df_b,df_c])
df.shape

(10722, 12)

In [28]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"08 22, 2013",A34A1UP40713F8,B00009W3I4,{'Style:': ' Dryer Vent'},James. Backus,I like this as a vent as well as something tha...,Great product,1377129600,,
1,5.0,True,"02 8, 2016",A1AHW6I678O6F2,B00009W3PA,{'Size:': ' 6-Foot'},kevin.,good item,Five Stars,1454889600,,
2,5.0,True,"08 5, 2015",A8R48NKTGCJDQ,B00009W3PA,{'Size:': ' 6-Foot'},CDBrannom,Fit my new LG dryer perfectly.,Five Stars,1438732800,,
3,5.0,True,"04 24, 2015",AR3OHHHW01A8E,B00009W3PA,{'Size:': ' 6-Foot'},Calvin E Reames,Good value for electric dryers,Perfect size,1429833600,,
4,5.0,True,"03 21, 2015",A2CIEGHZ7L1WWR,B00009W3PA,{'Size:': ' 6-Foot'},albert j. kong,Price and delivery was excellent.,Five Stars,1426896000,,


#### Drop Reviews with score 3

In [32]:
print('before')
print(df.shape)

before
(10722, 12)


In [35]:
df = df.loc[df.overall != 3].shape

(9855, 12)

In [36]:
print('after')
df.shape

after


(10722, 12)

In [37]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"08 22, 2013",A34A1UP40713F8,B00009W3I4,{'Style:': ' Dryer Vent'},James. Backus,I like this as a vent as well as something tha...,Great product,1377129600,,
1,5.0,True,"02 8, 2016",A1AHW6I678O6F2,B00009W3PA,{'Size:': ' 6-Foot'},kevin.,good item,Five Stars,1454889600,,
2,5.0,True,"08 5, 2015",A8R48NKTGCJDQ,B00009W3PA,{'Size:': ' 6-Foot'},CDBrannom,Fit my new LG dryer perfectly.,Five Stars,1438732800,,
3,5.0,True,"04 24, 2015",AR3OHHHW01A8E,B00009W3PA,{'Size:': ' 6-Foot'},Calvin E Reames,Good value for electric dryers,Perfect size,1429833600,,
4,5.0,True,"03 21, 2015",A2CIEGHZ7L1WWR,B00009W3PA,{'Size:': ' 6-Foot'},albert j. kong,Price and delivery was excellent.,Five Stars,1426896000,,


In [39]:
df["score"] = df['overall'].map(lambda x:0 if x < 3 else 1 )

In [40]:
# check the data types

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10722 entries, 0 to 3175
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         10722 non-null  float64
 1   verified        10722 non-null  bool   
 2   reviewTime      10722 non-null  object 
 3   reviewerID      10722 non-null  object 
 4   asin            10722 non-null  object 
 5   style           7769 non-null   object 
 6   reviewerName    10722 non-null  object 
 7   reviewText      10701 non-null  object 
 8   summary         10717 non-null  object 
 9   unixReviewTime  10722 non-null  int64  
 10  vote            2923 non-null   object 
 11  image           1032 non-null   object 
 12  score           10722 non-null  int64  
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 1.1+ MB


In [42]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score
0,5.0,True,"08 22, 2013",A34A1UP40713F8,B00009W3I4,{'Style:': ' Dryer Vent'},James. Backus,I like this as a vent as well as something tha...,Great product,1377129600,,,1
1,5.0,True,"02 8, 2016",A1AHW6I678O6F2,B00009W3PA,{'Size:': ' 6-Foot'},kevin.,good item,Five Stars,1454889600,,,1
2,5.0,True,"08 5, 2015",A8R48NKTGCJDQ,B00009W3PA,{'Size:': ' 6-Foot'},CDBrannom,Fit my new LG dryer perfectly.,Five Stars,1438732800,,,1
3,5.0,True,"04 24, 2015",AR3OHHHW01A8E,B00009W3PA,{'Size:': ' 6-Foot'},Calvin E Reames,Good value for electric dryers,Perfect size,1429833600,,,1
4,5.0,True,"03 21, 2015",A2CIEGHZ7L1WWR,B00009W3PA,{'Size:': ' 6-Foot'},albert j. kong,Price and delivery was excellent.,Five Stars,1426896000,,,1


In [44]:
df['Unix_Rev_Time'] = pd.to_datetime(df['unixReviewTime'],unit='s')
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score,Unix_Rev_Time
0,5.0,True,"08 22, 2013",A34A1UP40713F8,B00009W3I4,{'Style:': ' Dryer Vent'},James. Backus,I like this as a vent as well as something tha...,Great product,1377129600,,,1,2013-08-22
1,5.0,True,"02 8, 2016",A1AHW6I678O6F2,B00009W3PA,{'Size:': ' 6-Foot'},kevin.,good item,Five Stars,1454889600,,,1,2016-02-08
2,5.0,True,"08 5, 2015",A8R48NKTGCJDQ,B00009W3PA,{'Size:': ' 6-Foot'},CDBrannom,Fit my new LG dryer perfectly.,Five Stars,1438732800,,,1,2015-08-05
3,5.0,True,"04 24, 2015",AR3OHHHW01A8E,B00009W3PA,{'Size:': ' 6-Foot'},Calvin E Reames,Good value for electric dryers,Perfect size,1429833600,,,1,2015-04-24
4,5.0,True,"03 21, 2015",A2CIEGHZ7L1WWR,B00009W3PA,{'Size:': ' 6-Foot'},albert j. kong,Price and delivery was excellent.,Five Stars,1426896000,,,1,2015-03-21


### Exploratory Data Analysis

In [46]:
# initial shape
df.shape

(10722, 14)

In [50]:
# sort the values first

df_sorted = df.sort_values(['reviewerName','reviewText','summary','Unix_Rev_Time'],
               axis = 0,
               ascending= True,
               kind='quicksort',
               inplace = False,
               na_position='last')
df_sorted.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score,Unix_Rev_Time
4856,5.0,False,"09 21, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Sea Salt'}",Lynne E.,"I love Pre de Provence soaps, but was intensel...",Luxurious French Soap Has Fresh Sea Breeze Fra...,1505952000,7.0,,1,2017-09-21
4187,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B001LNODUS,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,1,2017-09-04
5034,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B019FWRG3C,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,1,2017-09-04
4848,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Patchouli'}",Lynne E.,This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENR...,Luxurious French Soap With Musky Masculine Fra...,1506470400,,,1,2017-09-27
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...,"Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,1,2017-08-26


In [51]:
# drop duplicates 
df_uniq = df_sorted.drop_duplicates(subset={'reviewerName','reviewText','summary','Unix_Rev_Time'},
                         keep = 'first',
                         inplace = False)

df_uniq.shape

(1998, 14)

In [52]:
df_uniq.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score,Unix_Rev_Time
4856,5.0,False,"09 21, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Sea Salt'}",Lynne E.,"I love Pre de Provence soaps, but was intensel...",Luxurious French Soap Has Fresh Sea Breeze Fra...,1505952000,7.0,,1,2017-09-21
4187,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B001LNODUS,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it'...",Clear Gel Creates Nice Lather With Delicate La...,1504483200,,,1,2017-09-04
4848,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Patchouli'}",Lynne E.,This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENR...,Luxurious French Soap With Musky Masculine Fra...,1506470400,,,1,2017-09-27
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTI...,"Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,1,2017-08-26
4849,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Sandalwood'}",Lynne E.,This PRE DE PROVENCE Sandalwood SHEA BUTTER EN...,"Luxurious French Soap With Mild, Masculine Fra...",1506470400,,,1,2017-09-27


In [54]:
# percentage of uique data is 
np.round((df_uniq.shape[0]/df.shape[0])*100,2)

18.63

In [56]:
# display few reviews full text

pd.set_option('display.max_colwidth',None)
print(df_uniq.reviewText.sample(10))

2155    Let's think about this product for a minute.  You have a toothbrush which supposedly uses sonic waves to clean the teeth.  (I have to believe "truth in advertising" laws are at work here, because my Sonicare doesn't produce results that are any better than any battery-operated toothbrush I've used.)  Then you get this item which is designed to sanitize said toothbrush.\n\nHow can you really tell if this thing "sanitizes" your toothbrush head?  Even if you tried to fake it by putting, say, spaghetti sauce on the toothbrush and then sanitizing it, that still doesn't prove anything.  It hums and then you take your brush head out.  But what are the benefits of sanitizing the brush head?  Again, I notice no difference in my mouth from how things were before I had this, compared to after using this.  So, it does nothing obvious, but if you're paranoid, you might want to get it just to believe you're chasing down every tiny bacterium.
751                                               

### Preprocessing the text data

In [58]:
from bs4 import BeautifulSoup

In [None]:
tf_

In [93]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

# combine all the steps into one
def expand(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
    
def text_preprocess(text_series):
    output = []
    for i,review in enumerate(text_series.values):
        try:   
            # remove url tags
            review = re.sub(r'http\S+',"",review)
            # remove html tags
            review = BeautifulSoup(review,"lxml").get_text()
            review = expand(review)
            review = re.sub("\S*d\S","",review).strip()
            review = re.sub('[^A-Za-z]+'," ",review)
    
            review = ' '.join(e.lower() for e in review.split(" ") if e.lower() not in stopwords)
            output.append(review.strip())
        except:
            print(i,review)
            output.append(None)
    return output


# Remove html tags
# Remove punctuations or limited set of special characters like , . # etc.
# Make sure words are made of english letters and are not alpha-numeric
# See if the length of the word is greater than 2
# Convert to lowercase
# Remove Stopwords, that is, words that dont add much to the meaning.
# Finally Snowball Stemming the word

In [94]:
df_uniq['processed_Text'] = text_preprocess(df_uniq['reviewText'])

  review = BeautifulSoup(review,"lxml").get_text()


526 nan
613 nan
1987 nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uniq['processed_Text'] = text_preprocess(df_uniq['reviewText'])


In [96]:
df_uniq.loc[df_uniq.processed_Text.isnull(),:]

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score,Unix_Rev_Time,processed_Text
172,5.0,True,"04 7, 2018",A1CKPC88NHMYGR,B001IKJOLW,"{'Size:': ' 11 B(M) US', 'Color:': ' Wolf Grey/Black-pink Blast/White'}",Cynthia Foyer,,Five Stars,1523059200,,[https://images-na.ssl-images-amazon.com/images/I/61ifu-JvzQL._SY88.jpg],1,2018-04-07,
285,5.0,True,"07 18, 2017",AN5PL4KUZS35E,B001IKJOLW,"{'Size:': ' 9.5 B(M) US', 'Color:': ' Black/White/Anthracite/Stealth'}",Dilly Anderson,,Five Stars,1500336000,,,1,2017-07-18,
359,5.0,True,"02 28, 2015",A3DA9MP7OGLPC0,B00006L9LC,{'Size:': ' 38'},verenice,,Five Stars,1425081600,,,1,2015-02-28,


In [97]:
# keep only the not null data rows
df_uniq = df_uniq.loc[df_uniq.processed_Text.notnull(),:]

In [98]:
df_uniq['processed_Text'].head()

4856    love pre provence soaps intensely curious sea salt bar would smell like would want wash sea salt happily pre de provence sea salt french soap bar g ounce no fferent pre provence soaps smells rful ces rich thick lather fragrance hard scribe kind floral maybe best scribed fresh sea breeze makes one think fresh air sunshine fragrance es not linger overlong soap es not irritate sensitive skin may little ying pre provence soaps unless imagination working overtime soap finitely contains salt ents um palmate um palm kernelate water fragrance palm titanium glycerin palm kernel sodium chloride shea butter um um um onate benzyl salicylate linalool geraniol citral
4187                                                                                                                                                                                                                                                                                                 r favorite soap fragrance no surprise

## Preprocessing the review summary

In [101]:
text_preprocess(df_uniq['summary'][:5])

['luxurious french soap fresh sea breeze fragrance',
 'clear gel creates nice lather delicate r scent',
 'luxurious french soap musky masculine fragrance',
 'feels luxurious not dry sensitive skin',
 'luxurious french soap masculine fragrance']

In [102]:
df_uniq['processed_Summary'] = text_preprocess(df_uniq['summary'])

  review = BeautifulSoup(review,"lxml").get_text()


768 nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uniq['processed_Summary'] = text_preprocess(df_uniq['summary'])


In [103]:
df_uniq = df_uniq[df_uniq['processed_Summary'].notnull()]
df_uniq.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,score,Unix_Rev_Time,processed_Text,processed_Summary
4856,5.0,False,"09 21, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 150 Gram', 'Color:': ' Sea Salt'}",Lynne E.,"I love Pre de Provence soaps, but was intensely curious about the Sea Salt bar. What would it smell like? Why would I want to wash my hands with sea salt? Happily, this PRE DE PROVENCE SEA SALT FRENCH SOAP BAR (150g, 5.2 Ounce) is no different from the other Pre de Provence soaps--it smells wonderful, and produces a rich, thick lather.\n\nThe fragrance is hard to describe. Its kind of floral, but maybe is best described as fresh sea breeze. It makes one think of fresh air and sunshine. The fragrance doesn't linger overlong. The soap doesn't irritate my sensitive skin, but may be a little more drying than other Pre de Provence soaps (unless it's my imagination working overtime).\n\nThe soap definitely contains salt. The ingredients are: sodium palmate, sodium palm kernelate, water, fragrance, palm acid, titanium dioxide, glycerin, palm kernel acid, SODIUM CHLORIDE, shea butter, sodium hydroxide, tetrasodium edta, tetrasodium etidronate, benzyl salicylate, linalool, geraniol, citral.",Luxurious French Soap Has Fresh Sea Breeze Fragrance,1505952000,7.0,,1,2017-09-21,love pre provence soaps intensely curious sea salt bar would smell like would want wash sea salt happily pre de provence sea salt french soap bar g ounce no fferent pre provence soaps smells rful ces rich thick lather fragrance hard scribe kind floral maybe best scribed fresh sea breeze makes one think fresh air sunshine fragrance es not linger overlong soap es not irritate sensitive skin may little ying pre provence soaps unless imagination working overtime soap finitely contains salt ents um palmate um palm kernelate water fragrance palm titanium glycerin palm kernel sodium chloride shea butter um um um onate benzyl salicylate linalool geraniol citral,luxurious french soap fresh sea breeze fragrance
4187,5.0,False,"09 4, 2017",A2MJ8OL2FYN7CW,B001LNODUS,{'Color:': ' Shower Gel'},Lynne E.,"Lavender is my favorite soap fragrance, so it's no surprise that I love this PRE DE PROVENCE FRENCH LAVENDER BATH & SHOWER GEL. For the shower, a pump or two is enough for a nice lather, and the lather washes off easily.\n\nThe lavender fragrance is delicate, and it lingers for only a short time. It doesn't overwhelm perfume, aftershave lotion, shampoo, or other scented products you may like to use. The clear gel arrives in an attractive square pump dispenser.\n\nThe shower gel leaves my skin silky smooth, and it doesn't irritate my sensitive skin.",Clear Gel Creates Nice Lather With Delicate Lavender Scent,1504483200,,,1,2017-09-04,r favorite soap fragrance no surprise love pre de provence french lavender bath shower gel shower pump two enough nice lather lather washes easily r fragrance licate lingers short time es not overwhelm perfume aftershave lotion shampoo scented cts may like use clear gel arrives attractive square pump spenser shower gel leaves skin silky smooth es not irritate sensitive skin,clear gel creates nice lather delicate r scent
4848,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Patchouli'}",Lynne E.,"This PRE DE PROVENCE PATCHOULI SHEA BUTTER ENRICHED SOAP (250 Gram) has the mild, musky fragrance that many women find sexy, but the fragrance is risky. To some people (including me) it smells like fungus or damp basement. Still, this soap may appeal to male household members, because it's so different from most other Pre de Provence soaps (with their fairly strong, flowery fragrances).\n\nThis is a large bar of luxurious French soap. It lathers beautifully, and the fragrance never lingers too long. The soap is very gentle, and doesn't irritate sensitive skin.\n\nFor men, a much safer Pre de Provence soap is the Sandalwood bar (see&nbsp;<a data-hook=""product-link-linked"" class=""a-link-normal"" href=""/Pre-de-Provence-Sandalwood/dp/B01LWWOLW2/ref=cm_cr_arp_d_rvw_txt?ie=UTF8"">Pre de Provence Sandalwood</a>).",Luxurious French Soap With Musky Masculine Fragrance,1506470400,,,1,2017-09-27,pre de provence patchouli shea butter enriched soap gram musky fragrance many women find sexy fragrance risky people ng smells like fungus mp basement still soap may appeal male household members fferent pre provence soaps fairly strong flowery fragrances large bar luxurious french soap lathers beautifully fragrance never lingers long soap gentle es not irritate sensitive skin men much safer pre provence soap lwood bar see pre provence,luxurious french soap musky masculine fragrance
4924,5.0,False,"08 26, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Olive Oil'}",Lynne E.,"This PRE DE PROVENCE SHEA BUTTER ENRICHED ARTISANAL FRENCH SOAP BAR (Olive Oil (250 g)) lathers beautifully, and feels luxurious. It doesn't dry out or irritate my sensitive skin, so I can use it every day. This is a large 250 gram olive oil bar that gives good value for the money (about $7 on Amazon).","Feels Luxurious, Doesn't Dry Out Sensitive Skin",1503705600,,,1,2017-08-26,pre de provence shea butter enriched artisanal french soap bar olive oil g lathers beautifully feels luxurious es not irritate sensitive skin use every large gram olive oil bar gives good value money amazon,feels luxurious not dry sensitive skin
4849,5.0,False,"09 27, 2017",A2MJ8OL2FYN7CW,B00W259T7G,"{'Size:': ' 250 Gram', 'Color:': ' Sandalwood'}",Lynne E.,"This PRE DE PROVENCE Sandalwood SHEA BUTTER ENRICHED SOAP (250 Gram) should appeal to the male members of the household, because of the mild, woody fragrance. (Most Pre de Provence soaps have fairly strong, flowery fragrances.)\n\nIt's a large bar of luxurious French soap. It lathers beautifully, and the fragrance never lingers too long. The soap is very gentle, and doesn't irritate sensitive skin.","Luxurious French Soap With Mild, Masculine Fragrance",1506470400,,,1,2017-09-27,pre de provence lwood shea butter enriched soap gram appeal male members fragrance pre provence soaps fairly strong flowery fragrances large bar luxurious french soap lathers beautifully fragrance never lingers long soap gentle es not irritate sensitive skin,luxurious french soap masculine fragrance


## Feature Engineering for the Text Data

## - Bag of Words

In [None]:
# train count Vectorizer

count_vectorizer = CountVectorizer()
count_vectorizer.fit(df_uniq['processed_Text'])

In [110]:
print("Sample features :", count_vectorizer.get_feature_names_out()[:10])
print('Total no of features :',count_vectorizer.get_feature_names_out().shape[0])

Sample features : ['aaa' 'abating' 'abbey' 'abd' 'abetes' 'ability' 'abit' 'able' 'abrasing'
 'abrasive']
Total no of features : 5768


#### Transform the entire dataset to a document term matrix form

In [116]:
final_counts = count_vectorizer.transform(df_uniq['processed_Text'])
print('count vectorizer type ',type(final_counts))
print('The shape of our BOW vectorizer ',final_counts.shape)
print('The no of unique elements ',final_counts.shape[1])

count vectorizer type  <class 'scipy.sparse._csr.csr_matrix'>
The shape of our BOW vectorizer  (1994, 5768)
The no of unique elements  5768


#### view the sparse matrix

In [121]:
dtm = final_counts.toarray()
print("The shape of the Document Term Matrix ",dtm.shape)
dtm[:10,:20]

The shape of the Document Term Matrix  (1994, 5768)


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

#### Total count of words in entire dataset

In [122]:
dtm.sum()

48607

## Lets try creating Bigrams and Trigrams

##### create bigrams

In [125]:
count_vect = CountVectorizer(ngram_range=(1,2),min_df=10,max_features=5000)

final_bigram_counts = count_vect.fit_transform(df_uniq['processed_Text'])
print("bigrams count shape :",final_bigram_counts.shape)
print('bigrams bigram word count is :',final_bigram_counts.shape[1])

bigrams count shape : (1994, 944)
bigrams bigram word count is : 944


In [127]:
print("sample features : ", count_vect.get_feature_names_out()[:200])

sample features :  ['able' 'absolutely' 'absolutely love' 'active' 'actually' 'ad' 'age'
 'ago' 'air' 'al' 'alcohol' 'alcohol free' 'allow' 'almost' 'along' 'also'
 'although' 'always' 'amazing' 'amazon' 'amount' 'another' 'anymore'
 'anyone' 'anything' 'anyway' 'anywhere' 'apply' 'applying' 'appreciate'
 'arch' 'arch support' 'area' 'areas' 'aroma' 'around' 'arrived' 'ask'
 'available' 'average' 'away' 'awesome' 'baby' 'back' 'bad' 'bag' 'bar'
 'bar soap' 'bars' 'base' 'based' 'basic' 'bath' 'bathroom' 'beautiful'
 'beauty' 'believe' 'benefits' 'best' 'better' 'big' 'bigger' 'bit' 'bits'
 'black' 'ble' 'blossoms' 'blow' 'blue' 'bottle' 'bottles' 'bottom'
 'bought' 'box' 'braces' 'brand' 'break' 'breaking' 'breath' 'bright'
 'bring' 'brush' 'brushes' 'brushing' 'bubble' 'burning' 'butter'
 'butter enriched' 'buy' 'buying' 'ca' 'ca not' 'came' 'cannot' 'cap'
 'care' 'careful' 'carrying' 'case' 'cause' 'ce' 'cent' 'certainly'
 'change' 'changed' 'cheap' 'cheaper' 'check' 'chemical' 'choi

##### create trigrams

In [128]:
count_vect = CountVectorizer(ngram_range=(1,3),min_df=10,max_features=5000)

final_trigram_counts = count_vect.fit_transform(df_uniq['processed_Text'])
print("trigram count shape :", final_trigram_counts.shape)
print("trigram count unique features :",final_trigram_counts.shape[1])

trigram count shape : (1994, 951)
trigram count unique features : 951


In [129]:
print("sample features of trigrams are : ",count_vect.get_feature_names_out()[:200])

sample features of trigrams are :  ['able' 'absolutely' 'absolutely love' 'active' 'actually' 'ad' 'age'
 'ago' 'air' 'al' 'alcohol' 'alcohol free' 'allow' 'almost' 'along' 'also'
 'although' 'always' 'amazing' 'amazon' 'amount' 'another' 'anymore'
 'anyone' 'anything' 'anyway' 'anywhere' 'apply' 'applying' 'appreciate'
 'arch' 'arch support' 'area' 'areas' 'aroma' 'around' 'arrived' 'ask'
 'available' 'average' 'away' 'awesome' 'baby' 'back' 'bad' 'bag' 'bar'
 'bar soap' 'bars' 'base' 'based' 'basic' 'bath' 'bathroom' 'beautiful'
 'beauty' 'believe' 'benefits' 'best' 'better' 'big' 'bigger' 'bit' 'bits'
 'black' 'ble' 'blossoms' 'blow' 'blue' 'bottle' 'bottles' 'bottom'
 'bought' 'box' 'braces' 'brand' 'break' 'breaking' 'breath' 'bright'
 'bring' 'brush' 'brushes' 'brushing' 'bubble' 'burning' 'butter'
 'butter enriched' 'buy' 'buying' 'ca' 'ca not' 'came' 'cannot' 'cap'
 'care' 'careful' 'carrying' 'case' 'cause' 'ce' 'cent' 'certainly'
 'change' 'changed' 'cheap' 'cheaper' 'check' 

## Creating TF-IDF vector

In [242]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2),min_df=10,max_features=300)
tf_idf_vect.fit(df_uniq['processed_Text'])

In [243]:
# some sample features 
tf_idf_vect.get_feature_names_out()[:100]

array(['able', 'absolutely', 'actually', 'alcohol', 'almost', 'also',
       'always', 'amazing', 'amazon', 'amount', 'another', 'anything',
       'area', 'around', 'available', 'away', 'awesome', 'back', 'bad',
       'bar', 'bar soap', 'bars', 'bath', 'beautiful', 'best', 'better',
       'big', 'bit', 'bottle', 'bought', 'brush', 'butter', 'buy', 'care',
       'citrus', 'clean', 'cleaning', 'coat', 'color', 'come', 'comes',
       'comfortable', 'could', 'cream', 'crest', 'ct', 'cts', 'easily',
       'easy', 'enough', 'es', 'es not', 'especially', 'essie', 'even',
       'ever', 'every', 'excellent', 'exfoliating', 'expensive', 'eye',
       'face', 'far', 'favorite', 'feel', 'feeling', 'feels', 'feet',
       'fference', 'fferent', 'find', 'fine', 'finitely', 'first', 'fit',
       'flavor', 'floral', 'foot', 'found', 'fragrance', 'free', 'french',
       'fresh', 'gel', 'gentle', 'get', 'getting', 'gift', 'give', 'go',
       'goes', 'going', 'good', 'got', 'great', 'great ct',

In [245]:
final_tf_idf_count = tf_idf_vect.transform(df_uniq['processed_Text'])
print("tfidf vect shape :",final_tf_idf_count.shape)
print("The unique terms count in the tfidf vect :",final_tf_idf_count.shape[1])

tfidf vect shape : (1994, 300)
The unique terms count in the tfidf vect : 300


In [246]:
final_tf_idf_count.toarray()[:5,:10]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.20081873, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

## Training Word2Vec

In [137]:
list_of_reviews = []
for rev in df_uniq['processed_Text']:
    list_of_reviews.append(rev.split())

##### Train word2vec or use pretrained model

In [157]:
use_google_w2v = True
train_w2v = False

if train_w2v:
    # only words which occured 5 or more times.
    w2c_model = Word2Vec(list_of_reviews, min_count = 5, vector_size=50, workers = 4)
    print(w2c_model.wv.most_similar('great'))
    print("\n",'-'*50,"\n")
    print(w2c_model.wv.most_similar('bad'))

elif use_google_w2v:

    if os.path.isfile(r'D:\ML Block\NLP\resources\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin'):
        w2v_model = KeyedVectors.load_word2vec_format(r'D:\ML Block\NLP\resources\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin',binary=True)
        print(w2v_model.most_similar('great'))
        print(w2v_model.most_similar('bad'))
    else:
        print("word2vec file not found, set train_w2v = True to train your own model")

[('terrific', 0.7989331483840942), ('fantastic', 0.7935212254524231), ('tremendous', 0.7748855352401733), ('wonderful', 0.7647868990898132), ('good', 0.7291510105133057), ('incredible', 0.7032873630523682), ('marvelous', 0.6971103549003601), ('phenomenal', 0.6841564178466797), ('amazing', 0.663412868976593), ('awesome', 0.6510507464408875)]
[('good', 0.7190051674842834), ('terrible', 0.6828612089157104), ('horrible', 0.6702597737312317), ('Bad', 0.669891893863678), ('lousy', 0.6647640466690063), ('crummy', 0.567781925201416), ('horrid', 0.5651682615280151), ('awful', 0.5527253150939941), ('dreadful', 0.5526429414749146), ('horrendous', 0.5445998311042786)]


In [162]:
w2v_words = list(w2v_model.index_to_key)
print("number of words", len(w2v_words))
print("Sample of words - occured at least 5 times:",w2v_words[0:50])

number of words 3000000
Sample of words - occured at least 5 times: ['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said', 'was', 'the', 'at', 'not', 'as', 'it', 'be', 'from', 'by', 'are', 'I', 'have', 'he', 'will', 'has', '####', 'his', 'an', 'this', 'or', 'their', 'who', 'they', 'but', '$', 'had', 'year', 'were', 'we', 'more', '###', 'up', 'been', 'you', 'its', 'one', 'about', 'would', 'which', 'out']


## Convert entire texts into Vectors

### - Average W2v

In [163]:
from tqdm import tqdm

In [167]:
# compute average w2v for each review

reviews_vector = []

for review in tqdm(list_of_reviews):
    rev_vector = np.zeros(300) # 
    word_count = 0
    for word in review:
        if word in w2v_words:
            vec = w2v_model[word] # get the vector
            rev_vector += vec
            word_count += 1
    if word_count != 0:
        rev_vector /= word_count
    reviews_vector.append(rev_vector)
print(len(reviews_vector))
print(len(reviews_vector[0]))

100%|██████████████████████████████████████████████████████████████████████████████| 1994/1994 [01:05<00:00, 30.55it/s]

1994
300





In [169]:
# print output

for i,(review,vec) in enumerate(zip(list_of_reviews,reviews_vector)):
    if i <= 3:
        print("-"*50, "\n",review,"\n","-"*50, "\n", vec)

-------------------------------------------------- 
 ['love', 'pre', 'provence', 'soaps', 'intensely', 'curious', 'sea', 'salt', 'bar', 'would', 'smell', 'like', 'would', 'want', 'wash', 'sea', 'salt', 'happily', 'pre', 'de', 'provence', 'sea', 'salt', 'french', 'soap', 'bar', 'g', 'ounce', 'no', 'fferent', 'pre', 'provence', 'soaps', 'smells', 'rful', 'ces', 'rich', 'thick', 'lather', 'fragrance', 'hard', 'scribe', 'kind', 'floral', 'maybe', 'best', 'scribed', 'fresh', 'sea', 'breeze', 'makes', 'one', 'think', 'fresh', 'air', 'sunshine', 'fragrance', 'es', 'not', 'linger', 'overlong', 'soap', 'es', 'not', 'irritate', 'sensitive', 'skin', 'may', 'little', 'ying', 'pre', 'provence', 'soaps', 'unless', 'imagination', 'working', 'overtime', 'soap', 'finitely', 'contains', 'salt', 'ents', 'um', 'palmate', 'um', 'palm', 'kernelate', 'water', 'fragrance', 'palm', 'titanium', 'glycerin', 'palm', 'kernel', 'sodium', 'chloride', 'shea', 'butter', 'um', 'um', 'um', 'onate', 'benzyl', 'salicylate

## - TF-IDF weighted W2V

In [171]:
model_wt_w2v = TfidfVectorizer()
model_wt_w2v.fit(df_uniq['processed_Text'])
# we are generating a dictionary with word as a key and the idf as a value
dictionary = dict(zip(model_wt_w2v.get_feature_names_out(), list(model_wt_w2v.idf_)))

In [172]:
# view dict containing idf values 
dictionary

{'aaa': 7.905252148764019,
 'abating': 7.905252148764019,
 'abbey': 7.905252148764019,
 'abd': 7.905252148764019,
 'abetes': 7.905252148764019,
 'ability': 7.905252148764019,
 'abit': 7.905252148764019,
 'able': 4.884827262619655,
 'abrasing': 7.905252148764019,
 'abrasive': 7.905252148764019,
 'absence': 7.499787040655854,
 'absinthe': 7.905252148764019,
 'absolute': 6.518957787644128,
 'absolutely': 4.9095198752100275,
 'absolutly': 7.905252148764019,
 'absorb': 7.905252148764019,
 'absorbent': 7.905252148764019,
 'absorbs': 6.518957787644128,
 'absorption': 7.905252148764019,
 'abstain': 7.905252148764019,
 'abut': 7.905252148764019,
 'acceptable': 6.988961416889864,
 'accepted': 7.905252148764019,
 'accessories': 7.905252148764019,
 'accessory': 7.499787040655854,
 'accompanies': 7.905252148764019,
 'account': 6.988961416889864,
 'accounting': 7.905252148764019,
 'accurate': 7.212104968204073,
 'accurately': 7.905252148764019,
 'accustomed': 7.499787040655854,
 'acetate': 7.2121049

### feature names(words)

In [173]:
tfidf_features = model_wt_w2v.get_feature_names_out()
tfidf_features[:5]

array(['aaa', 'abating', 'abbey', 'abd', 'abetes'], dtype=object)

In [179]:
# review to vectors

tfidf_review_vectors = []

row = 0

for review in tqdm(list_of_reviews):
    rev_vec = np.zeros(300)
    weight_sum = 0
    for word in review:
        if word in w2v_words and word in tfidf_features:
            vec = w2v_model[word]
            tf_idf = dictionary[word] * (review.count(word)/len(review)) #IDF * TF
            rev_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        rev_vec /= weight_sum

    tfidf_review_vectors.append(rev_vec)
    row += 1

100%|██████████████████████████████████████████████████████████████████████████████| 1994/1994 [01:08<00:00, 28.93it/s]


In [180]:
# print output 

for i, (rev,vec) in enumerate(zip(list_of_reviews,tfidf_review_vectors)):
    if i <= 3:
        print("-"*50, "\n", rev, "\n", "-"*50, "\n", vec)

-------------------------------------------------- 
 ['love', 'pre', 'provence', 'soaps', 'intensely', 'curious', 'sea', 'salt', 'bar', 'would', 'smell', 'like', 'would', 'want', 'wash', 'sea', 'salt', 'happily', 'pre', 'de', 'provence', 'sea', 'salt', 'french', 'soap', 'bar', 'g', 'ounce', 'no', 'fferent', 'pre', 'provence', 'soaps', 'smells', 'rful', 'ces', 'rich', 'thick', 'lather', 'fragrance', 'hard', 'scribe', 'kind', 'floral', 'maybe', 'best', 'scribed', 'fresh', 'sea', 'breeze', 'makes', 'one', 'think', 'fresh', 'air', 'sunshine', 'fragrance', 'es', 'not', 'linger', 'overlong', 'soap', 'es', 'not', 'irritate', 'sensitive', 'skin', 'may', 'little', 'ying', 'pre', 'provence', 'soaps', 'unless', 'imagination', 'working', 'overtime', 'soap', 'finitely', 'contains', 'salt', 'ents', 'um', 'palmate', 'um', 'palm', 'kernelate', 'water', 'fragrance', 'palm', 'titanium', 'glycerin', 'palm', 'kernel', 'sodium', 'chloride', 'shea', 'butter', 'um', 'um', 'um', 'onate', 'benzyl', 'salicylate

# Model Building 

In [192]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

##### Using TF-IDF weighted Average vector as input 

In [194]:
X = tfidf_review_vectors
y = df_uniq['score']

In [200]:
X_train, X_test, y_train, y_test = train_test_split(np.array(X),y,train_size=0.7)

In [201]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1395, 300) (599, 300) (1395,) (599,)


In [204]:
lg_model = LogisticRegression()
lg_model.fit(X_train,y_train)

In [205]:
y_pred = lg_model.predict(X_test)

In [239]:
def print_metrics(model_name, y,y_hat):
    print(f"Metrics for {model_name} model \n")
    print("Accuracy : ", accuracy_score(y,y_hat))
    print("--"*15)
    print("F1 Score : ", f1_score(y,y_hat))
    print("--"*15)
    print("Precision : ", precision_score(y,y_hat))
    print("--"*15)
    print("Recall : ", recall_score(y_hat,y))
    print("--"*15)
    # print("AUC score :", auc(y_hat,y))

In [217]:
print_metrics("Logistic Regression using weighted average tfidf", y_test, y_pred)

Metrics for Logistic Regression model 

Accuracy :  0.9432387312186978
------------------------------
F1 Score :  0.9707903780068728
------------------------------
Precision :  0.9432387312186978
------------------------------
Recall :  0.9432387312186978
------------------------------
AUC score : 0.0


##### Using TF-IDF BoW vector as input 

In [247]:
X = final_tf_idf_count.toarray()
y = df_uniq['score']
X_train, X_test, y_train, y_test = train_test_split(np.array(X),y,train_size=0.7)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1395, 300) (599, 300) (1395,) (599,)


In [248]:
lg_model_tf_idf_bow = LogisticRegression()
lg_model_tf_idf_bow.fit(X_train,y_train)
y_pred = lg_model_tf_idf_bow.predict(X_test)

In [249]:
print_metrics("Logistic Regression using bow tfidf", y_test, y_pred)

Metrics for Logistic Regression using bow tfidf model 

Accuracy :  0.9549248747913188
------------------------------
F1 Score :  0.9769427839453458
------------------------------
Precision :  0.9549248747913188
------------------------------
Recall :  0.9549248747913188
------------------------------


#### Using TF-IDF W2V Avg vector as input

In [232]:
X = reviews_vector
y = df_uniq['score']
X_train, X_test, y_train, y_test = train_test_split(np.array(X),y,train_size=0.7)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1395, 300) (599, 300) (1395,) (599,)


In [235]:
lg_model_w2v_avg = LogisticRegression()
lg_model_w2v_avg.fit(X_train,y_train)
y_pred = lg_model_w2v_avg.predict(X_test)

In [240]:
print_metrics("Logistic Regression using w2v avg vector", y_test, y_pred)

Metrics for Logistic Regression using w2v avg vector model 

Accuracy :  0.9482470784641068
------------------------------
F1 Score :  0.9733905579399141
------------------------------
Precision :  0.9481605351170569
------------------------------
Recall :  0.9481605351170569
------------------------------


In [251]:
# let do hybrid approach by combining all the vector inputs 

X = tfidf_review_vectors + final_tf_idf_count.toarray() + reviews_vector
y = df_uniq['score']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1395, 300) (599, 300) (1395,) (599,)


In [253]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

In [255]:
y_pred = rf_model.predict(X_test)
print_metrics("Random Forest using hybrid approach of vector summation", y_test, y_pred)

Metrics for Random Forest using hybrid approach of vector summation model 

Accuracy :  0.9415692821368948
------------------------------
F1 Score :  0.9698015530629853
------------------------------
Precision :  0.9413735343383585
------------------------------
Recall :  0.9413735343383585
------------------------------
