<a href="https://colab.research.google.com/github/hisics/-datawhisperers_final-project_dsml2021/blob/main/datawhisperers_sentiment_analysis_amazon_apparel_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Option 8 - Sentiment Analysis for Ecommerce

See this document for reference:<br/>
https://colab.research.google.com/drive/1g6ZtYOOhGUj1WsiPpeaVPp5T7VFw234e?usp=sharing

In [1]:
# (1) Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

In [2]:
# (2) Download product reviews from Amazon Datasets
# You can use reviews from Amazon Datasets for a particular product.
# Note that the data are quite large so do not run this cell unless you are willing to wait!
# For reference, visit:
# https://www.tensorflow.org/datasets/catalog/amazon_us_reviews
# https://s3.amazonaws.com/amazon-reviews-pds/readme.html
# -f: fail on an HTTP error instead of saving the error page as the .gz file; -L: follow redirects.
!curl -fLO https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Apparel_v1_00.tsv.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  618M  100  618M    0     0  43.1M      0  0:00:14  0:00:14 --:--:-- 45.5M


In [3]:
# Decompress the download. -f overwrites an existing .tsv without prompting,
# so the cell can be re-run and no piped "y" answer is needed.
!gzip -df amazon_reviews_us_Apparel_v1_00.tsv.gz

In [7]:
# (3.a) List the contents of the working directory
!ls

amazon_reviews_us_Apparel_v1_00.tsv  sample_data


In [5]:
# (3.b) Explore data (see what is categorical and numerical)

In [41]:
# Load the first 100,000 rows of the tab-separated review file (read_table defaults to sep='\t').
# on_bad_lines='warn' skips malformed rows and reports them; it replaces error_bad_lines=False,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_table('amazon_reviews_us_Apparel_v1_00.tsv', on_bad_lines='warn', nrows=100000)
df.info()

b'Skipping line 49396: expected 15 fields, saw 22\nSkipping line 57008: expected 15 fields, saw 22\n'
b'Skipping line 82075: expected 15 fields, saw 22\n'


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        100000 non-null  object
 1   customer_id        100000 non-null  int64 
 2   review_id          100000 non-null  object
 3   product_id         100000 non-null  object
 4   product_parent     100000 non-null  int64 
 5   product_title      100000 non-null  object
 6   product_category   100000 non-null  object
 7   star_rating        100000 non-null  int64 
 8   helpful_votes      100000 non-null  int64 
 9   total_votes        100000 non-null  int64 
 10  vine               100000 non-null  object
 11  verified_purchase  100000 non-null  object
 12  review_headline    100000 non-null  object
 13  review_body        99947 non-null   object
 14  review_date        99997 non-null   object
dtypes: int64(5), object(10)
memory usage: 11.4+ MB


In [42]:
# See what the data looks like and whether any columns are boolean
df.tail()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
99995,US,43330569,R209JPXL8VMOVJ,B00VW37AMU,354148485,"Cybertela Women's Nice Fish You Caught, Can.. ...",Apparel,5,0,0,N,Y,Five Stars,My girlfriend loved it.,2015-05-05
99996,US,25578439,R2ISQWVCXLRNXQ,B00VW367FG,90102146,Cybertela Women's If You Don’t Fish I See.. Fu...,Apparel,4,0,0,N,Y,Four Stars,Great shirt,2015-05-18
99997,US,4282714,RB6XI6R464ZQL,B00VW366V6,789256548,Asgard Blacksmith's Alliance Thor Men's T-Shirt,Apparel,5,0,0,N,Y,Five Stars,Sweet shirt,2015-06-29
99998,US,41294636,R2RW7ZCM8J5YGV,B00VW2UKTG,233863271,Naturana Women's Mastectomy Bra,Apparel,5,0,1,N,Y,Five Stars,very well made,2015-08-06
99999,US,10624669,RN6BUHLG6XC8U,B00VW2TT0W,255023016,Ah Pardon Me My Good Sir I Believe I May Have ...,Apparel,4,0,0,N,Y,So Great!,This was a hit at the baby shower!,2015-07-07


### Cleaning data and converting data types


In [16]:
# Check whether every review comes from the US marketplace (a single unique value is expected).
# This must run before the cleaning cell below, which drops the `marketplace` column.
df.marketplace.unique()

array(['US'], dtype=object)

In [43]:
# Split the raw columns into a numerical frame (df_num) and a categorical frame (df_cat).
# Run this cell once per load: the drops below fail if the columns are already gone.
cat_list = ['marketplace','review_id','product_id','product_title','product_category','vine','verified_purchase','review_headline','review_body','review_date']
df_num = df.drop(columns=cat_list)
df_cat = df[cat_list]


# Cleaning
# (1) Drop: marketplace + product_category + review_id
df.drop(columns=['marketplace','product_category','review_id'], inplace=True)

# (2) Map the Y/N flags in vine + verified_purchase to 1/0
YN_to_bool = {'Y': 1,'N': 0}
df['vine'] = df['vine'].map(YN_to_bool)
df['verified_purchase'] = df['verified_purchase'].map(YN_to_bool)

# (3) Datetime: review_date
df['review_date'] = pd.to_datetime(df['review_date'])

# (4) Create a year column for EDA
# (displayed as a float, e.g. 2013.0, when any review_date is missing)
df['Year'] = df['review_date'].dt.year

# (5) Drop the rows where there is no review text
df.dropna(subset=['review_body'], inplace=True)

# (6) Combine the review headline and review body columns.
# A missing headline is treated as empty text; otherwise NaN + str would turn the
# whole review into NaN and break the string-cleaning steps that follow.
df['review_body'] = df['review_headline'].fillna('') + ' ' + df['review_body']
del df['review_headline']

df.head()

Unnamed: 0,customer_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_body,review_date,Year
0,32158956,B01KL6O72Y,24485154,Easy Tool Stainless Steel Fruit Pineapple Core...,4,0,0,0,1,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKIN...,2013-01-14,2013.0
1,2714559,B01ID3ZS5W,363128556,V28 Women Cowl Neck Knit Stretchable Elasticit...,5,1,2,0,1,Favorite for winter. Very warm! I love this dr...,2014-03-04,2014.0
2,12608825,B01I497BGY,811958549,James Fiallo Men's 12-Pairs Low Cut Athletic S...,5,0,0,0,1,"Great Socks for the money. Nice socks, great c...",2015-07-12,2015.0
3,25482800,B01HDXFZK6,692205728,Belfry Gangster 100% Wool Stain-Resistant Crus...,5,0,0,0,1,Slick hat! I bought this for my husband and WO...,2015-06-03,2015.0
4,9310286,B01G6MBEBY,431150422,JAEDEN Women's Beaded Spaghetti Straps Sexy Lo...,5,0,0,0,1,I would do it again! Perfect dress and the cus...,2015-06-12,2015.0


#### Pre-processing the text fields
##### Useful resource for cleaning text
https://medium.com/mlearning-ai/10-python-functions-you-need-to-apply-before-you-build-your-nlp-sentiment-analysis-model-874a37e0217e

In [44]:
# Removing @mentions and #hashtags
import re      # Import REGEX
def remove_mentions(text):
  """Return `text` with every @mention and #hashtag deleted.

  A mention/hashtag is the marker character followed by one or more ASCII
  letters, digits or underscores. The surrounding whitespace is left as is.
  """
  for marker_pattern in ("@[A-Za-z0-9_]+", "#[A-Za-z0-9_]+"):
    text = re.sub(marker_pattern, "", text)
  return text

# Only one column is involved, so apply the cleaner to that Series directly
# instead of building a row object for every review with DataFrame.apply(axis=1).
df['review_body'] = df['review_body'].apply(remove_mentions)

In [29]:
# Importing text cleaning libraries
# Tokenizer: splits the text into tokens. At a later stage we need to treat each review as a list of words so that we can iterate over the words and apply functions to them.

import nltk         #Natural language processing tool-kit
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

from nltk.tokenize import word_tokenize, sent_tokenize  # Import Tokenizer.



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [46]:
# Convert the review text to lower case (the column still holds strings, not lists).
df['review_body'] = df['review_body'].str.lower()
# Display the column to confirm the conversion.
df['review_body']

0        ★ these really do work great with some tweakin...
1        favorite for winter. very warm! i love this dr...
2        great socks for the money. nice socks, great c...
3        slick hat! i bought this for my husband and wo...
4        i would do it again! perfect dress and the cus...
                               ...                        
99995                   five stars my girlfriend loved it.
99996                               four stars great shirt
99997                               five stars sweet shirt
99998                            five stars very well made
99999         so great! this was a hit at the baby shower!
Name: review_body, Length: 99947, dtype: object

In [47]:
# Removal of numbers and digits
import re      # Import REGEX
def remove_numbers(text):
  """Return `text` (converted to str) with every run of digits deleted."""
  as_string = str(text)
  digit_runs = re.compile(r'\d+')
  return digit_runs.sub('', as_string)

# Only one column is involved, so apply the cleaner to that Series directly
# instead of building a row object for every review with DataFrame.apply(axis=1).
df['review_body'] = df['review_body'].apply(remove_numbers)


In [48]:
# Removal of special characters
import re, string, unicodedata # Import Regex, string and unicodedata.
def remove_non_ascii(words):
    """Return `words` reduced to ASCII characters only.

    The string is first NFKD-normalised, which splits accented letters into a
    base letter plus combining marks; every character that still cannot be
    encoded as ASCII is then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', words)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Only one column is involved, so apply the cleaner to that Series directly
# instead of building a row object for every review with DataFrame.apply(axis=1).
df['review_body'] = df['review_body'].apply(remove_non_ascii)

In [49]:
# Removal of punctuation
import re      # Import REGEX
def remove_punctuation(text):
  """Return `text` (converted to str) without punctuation.

  Every character that is neither a word character (letter, digit,
  underscore) nor whitespace is deleted.
  """
  non_word_or_space = re.compile(r'[^\w\s]')
  return non_word_or_space.sub('', str(text))

# Only one column is involved, so apply the cleaner to that Series directly
# instead of building a row object for every review with DataFrame.apply(axis=1).
df['review_body'] = df['review_body'].apply(remove_punctuation)


In [50]:
# Load NLTK's English stop-word list. Nothing is removed from the reviews in this cell.
# NOTE: the last line rebinds the name `stopwords` from the imported corpus module to a
# plain list of words, so the module is no longer reachable under that name afterwards.
import nltk
from nltk.corpus import stopwords            #Stopwords corpus
stopwords = stopwords.words('english')

In [52]:
# Customising the stop words - we need to review this.
# Negation words carry sentiment ("not good" vs "good"), so they are taken OUT of the
# stop-word list and therefore stay in the review text. Both the contracted form and
# the stem left behind when the contraction is split are listed for every negation.
customlist = ['no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
        'doesn', "doesn't", 'don', "don't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# NOTE(review): punctuation is stripped from review_body before tokenising, so a word such as
# "don't" appears in the tokens as "dont" and will match neither list - confirm how the
# stop words are applied downstream.

# sorted() instead of list() so the resulting order is the same on every run.
stopwords = sorted(set(stopwords) - set(customlist))

In [53]:
# After cleaning the text, split each review into a list of word tokens.
# Applied column-wise (Series.apply) rather than row-wise over the whole DataFrame.
# Run once: the column holds lists afterwards, so it cannot be tokenised a second time.
df['review_body'] = df['review_body'].apply(nltk.word_tokenize)
df['review_body']

0        [these, really, do, work, great, with, some, t...
1        [favorite, for, winter, very, warm, i, love, t...
2        [great, socks, for, the, money, nice, socks, g...
3        [slick, hat, i, bought, this, for, my, husband...
4        [i, would, do, it, again, perfect, dress, and,...
                               ...                        
99995             [five, stars, my, girlfriend, loved, it]
99996                          [four, stars, great, shirt]
99997                          [five, stars, sweet, shirt]
99998                      [five, stars, very, well, made]
99999    [so, great, this, was, a, hit, at, the, baby, ...
Name: review_body, Length: 99947, dtype: object

### EDA and Feature Engineering

In [None]:
# Quick check: descriptive statistics + measures of dispersion.
# The identifier columns are left out because their summary statistics are meaningless.
# Both labels must be passed together as `columns=[...]`; as two positional arguments
# the second label is taken for the `axis` argument and the call raises.
df.drop(columns=['customer_id', 'product_parent']).describe()

In [None]:
# Count the number of reviews at each star rating to see how the ratings are distributed.
df['star_rating'].value_counts()

In [None]:
# (4) Choose the label and features

In [None]:
# (5) Feature engineer for data that is
# (a) relevant 
# (b) unique
# (c) correct 
# (d) not missing

# Drop data that is not a,b,c, or d
# Use one-hot encoding for nominal
# Reduce dimensions of your features

In [None]:
# (6) Confirm data is ready with further exploratory analysis

In [None]:
# (7) Training, Testing (and/or Validation) data split 

# for example, 60/20/20

In [None]:
# (7.b) If using Deep Learning, building the model

# Add Input Layer
# Add Hidden Layers
# Add Output

In [None]:
# (8) Training the Machine Learning Model (i.e, Fitting the Model)

In [None]:
# (9) Evaluate the model metrics for Training (and/or Validation) data

In [None]:
# (10) Evaluate the model metrics for Testing data


# If metrics are poor, optimize either (a) the data or (b) the hyperparameters

In [None]:
# (11) Use the model for prediction

In [None]:
# (12) Write final predicted data  (e.g, to CSV or JSON, etc.)