# Sentiment Analysis of Amazon Reviews (Musical Instruments) by Group 3

In [1]:
import json
import re
import string
import time

import nltk # for porter stemmer
import numpy as np
import pandas as pd
import patsy #for matrices
import sklearn as skl
import sklearn.feature_extraction.text  # imported explicitly: the notebook uses skl.feature_extraction.text.*
import sklearn.linear_model  # imported explicitly: the notebook uses skl.linear_model.*


## Preprocessing data

In [2]:
# Load the training and test reviews (read with lines=True, i.e. one JSON record per line).
train, test = (
    pd.read_json(path, lines=True)
    for path in ("data/music_200.json", "data/music_test_200.json")
)
# Preview the first five training rows.
train.head()


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1384719342,"[0, 0]",5,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600
1,1384719342,"[13, 14]",5,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000
2,1384719342,"[1, 1]",5,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000
3,1384719342,"[0, 0]",5,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000
4,1384719342,"[0, 0]",5,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800


In [3]:
# Discard the columns that are not wanted; the same list applies to both frames.
unwanted_columns = ['asin', 'helpful', 'reviewTime', 'reviewerID',
                    'reviewerName', 'summary', 'unixReviewTime']
train = train.drop(unwanted_columns, axis=1)
test = test.drop(unwanted_columns, axis=1)
# Preview what remains of the test set.
test.head()

Unnamed: 0,overall,reviewText
0,5,Works great. Easy to use
1,5,My acoustic only has the end pin strap button ...
2,5,Leather type material and corrosive free butto...
3,4,... but... they're the cheesiest looking leath...
4,5,Yep. it holds the strap on the neck of my guit...


## Classify the data based on review rating

In [4]:
# Flag a review as "good" (boolean) when its overall rating is 5 or higher.
train2 = train.assign(good=train['overall'] >= 5)
# Encode the flag as a binary class label: 1 for good reviews, 0 for the rest.
trainFinal = train.assign(good=train2['good'].apply(int))

# Show the first 10 rows.
trainFinal.head(10)

Unnamed: 0,overall,reviewText,good
0,5,"Not much to write about here, but it does exac...",1
1,5,The product does exactly as it should and is q...,1
2,5,The primary job of this device is to block the...,1
3,5,Nice windscreen protects my MXL mic and preven...,1
4,5,This pop filter is great. It looks and perform...,1
5,5,So good that I bought another one. Love the h...,1
6,5,"I have used monster cables for years, and with...",1
7,3,I now use this cable to run from the output of...,0
8,5,Perfect for my Epiphone Sheraton II. Monster ...,1
9,5,Monster makes the best cables and a lifetime w...,1


In [5]:
# Stop-word lists already parsed from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path="stopwords.json"):
    """Return the stop-word list stored in the JSON file at *path*.

    The file is parsed on first use only; later calls reuse the cached list,
    so cleaning many reviews does not re-read the file once per review.
    """
    if path not in _STOPWORDS_CACHE:
        with open(path) as stopword_file:
            _STOPWORDS_CACHE[path] = json.load(stopword_file)
    return _STOPWORDS_CACHE[path]


# function to remove stop words and punctuation
def rm_stopwords_punctuation(text, stopwords=None):
    """Lower-case *text* and strip stop words, punctuation and single letters.

    Args:
        text: the raw text to clean.
        stopwords: optional iterable of stop words to remove. When omitted,
            the list is loaded from ``stopwords.json``.

    Returns:
        The cleaned text, with the remaining words separated by single spaces.
    """
    if stopwords is None:
        stopwords = _load_stopwords()
    text = text.lower()
    for word in stopwords:
        # cheap substring check before paying for a regex substitution
        if word in text:
            # replace only complete words ('\b' is a word boundary); the word
            # is escaped so regex metacharacters in it are matched literally
            text = re.sub(r"\b{}\b".format(re.escape(word)), "", text)
    # remove punctuation in a single pass
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove single-letter words
    text = re.sub(r"\b[a-z]\b", "", text)
    # collapse runs of whitespace and trim both ends
    return ' '.join(text.split())

In [6]:
# function to stem every word of a text with the Porter stemmer
def stem(text):
    """Return *text* with each whitespace-separated word Porter-stemmed.

    The stemmed words are joined back together with single spaces.
    """
    porter = nltk.stem.porter.PorterStemmer()
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

In [7]:
# clean a text with the two helpers: stop-word/punctuation removal, then stemming
def process_text(text):
    """Return *text* cleaned by rm_stopwords_punctuation and then stemmed by stem."""
    return stem(rm_stopwords_punctuation(text))

In [8]:
# Keep an untouched copy of the labelled training data, then clean the review text.
trainNoProcess = trainFinal.copy()
trainFinal['reviewText'] = trainFinal['reviewText'].apply(process_text)

# Show the first 10 processed rows.
trainFinal.head(10)

Unnamed: 0,overall,reviewText,good
0,5,much write exactli suppos filter pop sound now...,1
1,5,product exactli quit afford realiz doubl scree...,1
2,5,primari job devic block breath otherwis produc...,1
3,5,nice windscreen protect mxl mic prevent pop th...,1
4,5,pop filter great look perform like studio filt...,1
5,5,good bought anoth one love heavi cord gold con...,1
6,5,use monster cabl year good reason lifetim warr...,1
7,3,now use cabl run output pedal chain input fend...,0
8,5,perfect epiphon sheraton ii monster cabl well ...,1
9,5,monster make best cabl lifetim warranti doesnt...,1


### Load the test data (unprocessed vs. processed) and store it in matrices

In [9]:
# Experimental variant: label a test review as good when its rating is 4 or higher.
# NOTE(review): the training labels are built with a threshold of 5 -- confirm
# that scoring against differently defined test labels is intended.
test2 = test.assign(good=test['overall'] >= 4)
test2Final = test2.assign(good=test2['good'].apply(int))
# Keep an unprocessed copy before cleaning the review text.
test2NoProcess = test2Final.copy()
test2Final['reviewText'] = test2Final['reviewText'].apply(process_text)
#test2NoProcess.head(10)
#test2Final.head(10)

In [10]:
# Build patsy design matrices for the processed and unprocessed train/test frames:
# each y holds the binary "good" label, each X the terms derived from reviewText.
# NOTE(review): reviewText is free text, so patsy presumably dummy-codes each
# distinct review rather than its words -- verify this is the intended feature set.
label_formula = "good ~ reviewText"
y, X = patsy.dmatrices(label_formula, trainFinal, return_type="dataframe")
y_test, X_test = patsy.dmatrices(label_formula, test2Final, return_type="dataframe")
y_noP, X_noP = patsy.dmatrices(label_formula, trainNoProcess, return_type="dataframe")
y_test_noP, X_test_noP = patsy.dmatrices(label_formula, test2NoProcess, return_type="dataframe")

In [11]:
# display y, the label matrix (200 rows x 1 column)
print(y)
# display X (200 rows x 200 columns); disabled because the output is large
#print(X)
#print(len(y), len(X))

     good
0     1.0
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     0.0
8     1.0
9     1.0
10    1.0
11    0.0
12    0.0
13    1.0
14    1.0
15    0.0
16    0.0
17    1.0
18    1.0
19    1.0
20    0.0
21    1.0
22    0.0
23    1.0
24    1.0
25    0.0
26    1.0
27    1.0
28    0.0
29    1.0
..    ...
170   0.0
171   1.0
172   0.0
173   0.0
174   0.0
175   0.0
176   1.0
177   0.0
178   0.0
179   0.0
180   0.0
181   0.0
182   1.0
183   1.0
184   0.0
185   0.0
186   0.0
187   1.0
188   1.0
189   0.0
190   0.0
191   1.0
192   1.0
193   1.0
194   1.0
195   1.0
196   1.0
197   1.0
198   1.0
199   0.0

[200 rows x 1 columns]


## Testing with a different dataset

In [13]:
start_time = time.time()
#music_test = pd.read_json("data/reviews_Musical_Instruments_5.json", lines=True)

# Load a different category (video-game reviews) to use as test data and
# discard the columns that are not used.
vg_unused_columns = ['asin', 'helpful', 'reviewTime', 'reviewerID',
                     'reviewerName', 'summary', 'unixReviewTime']
vg_test = pd.read_json("data/reviews_Video_Games_5.json", lines=True).drop(
    vg_unused_columns, axis=1
)

# Report how long loading and trimming took, in seconds.
elapsed_time = time.time() - start_time
print(elapsed_time, "seconds")

# Summary statistics, including how many reviews were loaded.
vg_test.describe()

15.52587604522705 seconds


Unnamed: 0,overall
count,231780.0
mean,4.086397
std,1.20233
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


### Repeat Classification (with no processing)

In [14]:
#start_time = time.time()

# Repeat the classification on the video-game reviews: good = 1 when the
# overall rating is 5 or higher, otherwise 0.
vg_test2 = vg_test.assign(good = lambda g: g.overall >= 5)
vg_testFinal = vg_test.assign(good = vg_test2['good'].apply(lambda g: 1 if g else 0))
vg_test_noP = vg_testFinal.copy()

# vg_testFinal['reviewText'] = vg_testFinal['reviewText'].apply(lambda t: process_text(t))

# Put the unprocessed data into matrices. The cells further down refer to them
# as y_vg_test_noP / X_vg_test_noP, so those names are bound here; the original
# y_vg_test / X_vg_test names are kept as aliases of the same objects.
y_vg_test_noP, X_vg_test_noP = patsy.dmatrices("good ~ reviewText", vg_test_noP, return_type="dataframe")
y_vg_test, X_vg_test = y_vg_test_noP, X_vg_test_noP
#elapsed_time = time.time() - start_time
#print(elapsed_time)
# show the first 20 rows of the unprocessed test data (columns: overall, reviewText, good)
vg_test_noP.head(20)

NameError: name 'testNoProcess' is not defined

In [None]:
# testing time efficiency and checking length of the data
start_time = time.time()
print(len(vg_test_noP))
#y_vg_test_noP, X_vg_test_noP = patsy.dmatrices("good ~ reviewText", vg_test_noP, return_type="dataframe")
elapsed_time = time.time() - start_time
# The only assignment of the *_noP matrices in this cell is the commented-out
# line above, so report the sizes of the matrices built in the previous cell.
print(len(y_vg_test), len(X_vg_test))
print(elapsed_time)

# Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the processed training matrices.
logRegrModel = skl.linear_model.LogisticRegression().fit(X, y['good'])

In [None]:
logRegrModel.score(X, y['good'])

In [None]:
logRegrModel.score(X_test, y_test['good'])

In [None]:
logRegrModel_noP = skl.linear_model.LogisticRegression()
logRegrModel_noP = logRegrModel.fit(X_noP, y_noP['good'])

In [None]:
logRegrModel_noP.score(X_noP, y_noP['good'])

In [None]:
logRegrModel.score(X_test_noP, y_test_noP['good'])

In [None]:
y.mean()

In [None]:
# logRegrModel.score(X_vg_test, y_vg_test['good'])
# LogisticRegression is a classifier, not a transformer, so it is trained with
# fit(); np.ravel flattens the one-column label frame into the 1-D target.
logRegrModel_noP.fit(X_vg_test_noP, np.ravel(y_vg_test_noP))
# logRegrModel_noP.score(X_vg_test_noP, y_vg_test_noP['good'])
# logRegrModel_noP.score(X_noP, y_noP['good'])
# Mean of the label column, i.e. the share of reviews labelled good.
y_vg_test_noP.mean()

In [None]:
# pair each design-matrix column name with the matching row of the transposed
# coefficient array of the fitted model, and show the pairs as a DataFrame
pd.DataFrame(list(zip(X_vg_test_noP.columns, np.transpose(logRegrModel_noP.coef_))))

In [None]:
# Train a fresh estimator on the unprocessed video-game matrices. Fitting
# big_noP itself (rather than logRegrModel) keeps the earlier model intact and
# actually uses the estimator created on the line above.
big_noP = skl.linear_model.LogisticRegression()
big_noP = big_noP.fit(X_vg_test_noP, y_vg_test_noP['good'])

In [None]:
big_noP.score(X_vg_test_noP, y_vg_test_noP['good'])

# test stuff

In [None]:
# cv = skl.feature_extraction.text.CountVectorizer(list(trainFinal['reviewText']))
cv = skl.feature_extraction.text.CountVectorizer()

In [None]:
cv

In [None]:
trainFinal['reviewText'][199]

In [None]:
# counts = cv.fit_transform(list(trainFinal['reviewText']))
counts = cv.fit_transform(list(vg_test_noP['reviewText']))

In [None]:
counts.shape

In [None]:
tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False).fit(counts)

In [None]:
train_tf = tf_transformer.transform(counts)
train_tf

In [None]:
tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(counts)
train_tfidf

In [None]:
# Fit a logistic-regression classifier on the TF-IDF features of the review text.
sklmodel = skl.linear_model.LogisticRegression().fit(train_tfidf, y_vg_test_noP['good'])

In [None]:
sklmodel.score(train_tfidf, y_vg_test_noP['good'])

In [None]:
y.mean()