## Text Mining of the Product Reviews

### Preparation

#### Read in packages

In [336]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud.wordcloud import WordCloud, STOPWORDS
from PIL import Image
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
import gender_guesser.detector as gender
import warnings
warnings.filterwarnings("ignore")
from dask import compute, delayed
import dask.multiprocessing
import multiprocessing
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import six
import statsmodels.api as sm
from sklearn.tree import export_graphviz
import graphviz
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
import yaml
import textstat

sns.set_style("white")
plt.style.use('seaborn-deep')

#pd.set_option('display.max_colwidth', 1)
# start=time.time()

#### Read in data

In [337]:
df = pd.read_csv("sample.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop(['asin'], axis=1, inplace=True)
df.head(3)

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,1113,True,"10 18, 2005",AILCWT1IIP7ZT,{'Style:': ' Retail'},Charles Chen,"As a software developer, I am literally attach...",Best Overall Keyboard to Date,1129593600,
1,5.0,16,True,"10 12, 2005",A300T403J8526F,{'Style:': ' Retail'},T. Becker,"I've had a Natural Keyboard Pro for years, and...",Best keyboard I've used,1129075200,
2,4.0,11,True,"10 5, 2005",AFPGV3IQ9K691,{'Style:': ' Retail'},GDC,"As with most Microsoft products, great hardwar...","Great keyboard, poor software.",1128470400,


In [338]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2567 entries, 0 to 2566
Data columns (total 11 columns):
overall           2567 non-null float64
vote              174 non-null object
verified          2567 non-null bool
reviewTime        2567 non-null object
reviewerID        2567 non-null object
style             2554 non-null object
reviewerName      2567 non-null object
reviewText        2567 non-null object
summary           2567 non-null object
unixReviewTime    2567 non-null int64
image             13 non-null object
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 203.2+ KB


#### Text cleaning & pre-processing

### Text mining (New features)

`length`, `reviewer identity`, `foreign brand`, `product type`, `extremity (low/high ratings)`,`writing style`, `timeliness`, `spelling errors`, `readability`, `subjectivity`, `rating scores`, ``

#### Convert vote variable to float type

In [339]:
def cleanvote(df):
    '''The function takes in the cell of the `vote` column and outputs the votes as numerical values.'''
    try:
        return float(df['vote'])
    except ValueError:
        return float(df['vote'].replace(',', ''))
    
df['vote'] = df.apply(cleanvote, axis=1)

#### Get the number of associated images

In [340]:
def imagenum(df):
    '''The function takes in the cell of the `image` column and outputs the number of images.'''
    if type(df['image']) == float:
        return 0
    else:
        return len(df['image'])

df['imagenum'] = df.apply(imagenum, axis=1)

#### Get the indicator of being verified or not

In [341]:
def verified(df):
    '''The function takes in the cell of the `verified` column and outputs the indicator (1/0).'''
    if df['verified'] == True:
        return 1
    else:
        return 0
    
df['verified'] = df.apply(verified, axis=1)

#### Get the indicator of using a real name

One tried approach: named entities extraction

Pbm: cannot tell the real or seemingly-real names

https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718

https://spacy.io/usage/linguistic-features#named-entities

Approach: return true if the `revierName` has verfied first names by gender guesser (not unknown in the name database), (middle name), and last name 

https://pypi.org/project/gender-guesser/

In [342]:
def nameverified(df):
    '''The function takes in the cell of the `reviewerName` column and outputs the indicator (1/0).'''
    try:
        parts = df['reviewerName'].split(' ')
        if len(parts) > 2:
            if gender.Detector().get_gender(parts[0]) != 'unknown': 
                return 1
        else:
            return 0
    except AttributeError:
        return 0

df['nameverified'] = df.apply(nameverified, axis=1)

#### Sentiment

In [343]:
def sentiment(df):
    '''The function outputs the sentiment score for each review.'''
    return analyser.polarity_scores(df['reviewText'])['compound']

df['Sentiment'] = df.apply(sentiment, axis=1)

In [344]:
# Sentiment scores are positively correlated with the overall rating, but the correlation is not strong
round(np.corrcoef(df['overall'],  df["Sentiment"])[0,1],3)

0.376

In [345]:
dfnotnull = df[df['vote'].isna()==0]
np.corrcoef(dfnotnull['vote'],  dfnotnull["Sentiment"])[0,1]

0.033122594194551894

#### Length

https://pypi.org/project/textstat/

In [346]:
def length(df):
    '''The function outputs the length for each review.'''
    return textstat.lexicon_count(df['reviewText'], removepunct=True)

df['Length'] = df.apply(length, axis=1)

#### Sentence Count

In [347]:
def sentence_count(df):
    '''The function outputs the number of sentences for each review.'''
    return textstat.sentence_count(df['reviewText'])

df['Sentences'] = df.apply(sentence_count, axis=1)

#### Flesch reading ease score

In [348]:
def flesch(df):
    '''The function outputs the Flesch reading ease score for each review.'''
    return textstat.flesch_reading_ease(df['reviewText'])

df['Flesch'] = df.apply(flesch, axis=1)

#### Days since the first review

In [351]:
df['unixReviewTime'] = pd.to_datetime(df['unixReviewTime'], unit='s')
df['Days'] = df['unixReviewTime'] - df['unixReviewTime'].min()
df['Days'] = [i.days for i in df['Days']]

#### Before/After verified purchase policy

In [352]:
def verified_option(df):
    '''The function outputs whether the review was written before the introduction of verified purchase label policy.'''
    if df['unixReviewTime'].date()> datetime(2016, 11, 1).date():
        return 1
    else:
        return 0

df['verified_option'] = df.apply(verified_option, axis=1)
round(100*sum(df['verified_option'])/len(df))

13

#### Styles (Different models)

In [353]:
df['style'].value_counts()

{'Style:': ' Retail'}      1930
{'Style:': ' Business'}     624
Name: style, dtype: int64

In [354]:
def style(df):
    '''The function outputs the style of the product.'''
    try:
        if 'Size:' in t:
            return yaml.load(df['style'])['Style:']
    except:
        return 0

df['style'] = df.apply(style, axis=1)
df['Retail'] = np.where(df['style']==' Retail', 1, 0)

#### topics 

### Exploratory Data Analysis 

In [355]:
df.drop(['style', 'reviewerID', 'reviewerName', 
         'image', 'reviewTime',], axis=1, inplace=True)

In [356]:
df.columns

Index(['overall', 'vote', 'verified', 'reviewText', 'summary',
       'unixReviewTime', 'imagenum', 'nameverified', 'Sentiment', 'Length',
       'Sentences', 'Flesch', 'Days', 'verified_option', 'Retail'],
      dtype='object')

#### Summary statistics

In [357]:
df.head()

Unnamed: 0,overall,vote,verified,reviewText,summary,unixReviewTime,imagenum,nameverified,Sentiment,Length,Sentences,Flesch,Days,verified_option,Retail
0,5.0,1113.0,1,"As a software developer, I am literally attach...",Best Overall Keyboard to Date,2005-10-18,0,0.0,0.9981,780,1,-703.31,13,0,1
1,5.0,16.0,1,"I've had a Natural Keyboard Pro for years, and...",Best keyboard I've used,2005-10-12,0,0.0,0.8232,267,1,-182.62,7,0,1
2,4.0,11.0,1,"As with most Microsoft products, great hardwar...","Great keyboard, poor software.",2005-10-05,0,0.0,0.9947,917,15,34.84,0,0,1
3,5.0,,1,Love it.,Five Stars,2018-05-15,0,0.0,0.6369,2,1,120.21,4605,1,0
4,5.0,,1,I've had various versions of this keyboard for...,Five Stars,2018-05-12,0,0.0,0.4753,16,1,80.62,4602,1,0


In [358]:
df['vote_fillna'] = df['vote'] .fillna(0)

In [359]:
df.describe().round(2)

Unnamed: 0,overall,vote,verified,imagenum,nameverified,Sentiment,Length,Sentences,Flesch,Days,verified_option,Retail,vote_fillna
count,2567.0,174.0,2567.0,2567.0,2484.0,2567.0,2567.0,2567.0,2567.0,2567.0,2567.0,2567.0,2567.0
mean,4.21,16.41,0.86,0.55,0.08,0.45,81.37,3.12,54.32,3131.83,0.13,0.75,1.11
std,1.21,90.98,0.34,9.56,0.27,0.54,103.03,3.74,65.76,965.37,0.33,0.43,23.98
min,1.0,2.0,0.0,0.0,0.0,-0.99,0.0,1.0,-1108.3,0.0,0.0,0.0,0.0
25%,4.0,2.0,1.0,0.0,0.0,0.14,20.0,1.0,46.44,2740.0,0.0,1.0,0.0
50%,5.0,3.0,1.0,0.0,0.0,0.64,49.0,2.0,71.48,3351.0,0.0,1.0,0.0
75%,5.0,5.0,1.0,0.0,0.0,0.88,102.0,4.0,83.66,3789.5,0.0,1.0,0.0
max,5.0,1113.0,1.0,370.0,1.0,1.0,1179.0,41.0,206.84,4609.0,1.0,1.0,1113.0


In [360]:
# % of reviews don't have any votes
df['vote'].isna().sum()/df.shape[0]

0.9322165952473704

In [362]:
# % of reviews don't have images
len(df[df['imagenum']==0])/df.shape[0]

0.9949357226334242

In [363]:
df['verified'].value_counts()

1    2215
0     352
Name: verified, dtype: int64

In [364]:
# Verified reviews have higher average scores
df.groupby(['verified']).mean()['overall']

verified
0    3.593750
1    4.303837
Name: overall, dtype: float64

In [365]:
# Verified reviews have higher votes
df.groupby(['verified']).mean()['vote']

verified
0     9.294118
1    19.357724
Name: vote, dtype: float64

In [366]:
df.to_csv('clean_data.csv')