# Sentiment Analysis of Amazon Reviews (Musical Instruments) by Group 3

In [1]:
import json
import nltk # for porter stemmer
import numpy as np
import pandas as pd
# import patsy #for matrices
import re
import sklearn as skl
import string
import time

## Preprocessing data

In [2]:
# Read the train and test samples (JSON Lines: one review object per line).
train = pd.read_json("data/music_200.json", lines=True)
test = pd.read_json("data/music_test_200.json", lines=True)

# Read the Musical Instruments review file that the following cells work on.
music_test = pd.read_json("data/reviews_Musical_Instruments_5.json", lines=True)

# Preview the first 5 training rows. A notebook cell only renders its last
# expression, so the preview must come after the final assignment.
train.head()

In [3]:
# Remove the columns the analysis does not use from the music reviews.
# The matching drops for train/test are left disabled:
# train = train.drop(columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'])
# test = test.drop(columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'])
# test.head()

music_test = music_test.drop(
    columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
)

## Classify the data based on review rating

In [4]:
# Flag each review as good (True) when its overall rating is 5 or higher.
test2 = music_test.assign(good=music_test["overall"] >= 5)
# Store the flag as a binary class: 1 for good reviews, 0 otherwise.
# A vectorized cast does the bool -> int conversion instead of a per-row lambda.
music_test = music_test.assign(good=test2["good"].astype("int64"))


# display the first 10 rows
music_test.head(10)

Unnamed: 0,overall,reviewText,good
0,5,"Not much to write about here, but it does exac...",1
1,5,The product does exactly as it should and is q...,1
2,5,The primary job of this device is to block the...,1
3,5,Nice windscreen protects my MXL mic and preven...,1
4,5,This pop filter is great. It looks and perform...,1
5,5,So good that I bought another one. Love the h...,1
6,5,"I have used monster cables for years, and with...",1
7,3,I now use this cable to run from the output of...,0
8,5,Perfect for my Epiphone Sheraton II. Monster ...,1
9,5,Monster makes the best cables and a lifetime w...,1


In [5]:
# Compiled stop-word patterns, keyed by the path of the JSON file they came from.
_STOPWORD_PATTERN_CACHE = {}

# Matches a single lowercase letter standing on its own.
_SINGLE_LETTER = re.compile(r"\b[a-z]\b")

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _stopword_patterns(path="stopwords.json"):
    """Return ``[(word, compiled whole-word regex), ...]`` for the stop-word file.

    The file is read and the patterns are compiled only on the first call for
    a given path; later calls reuse the cached list. Re-run this cell (which
    resets the cache) after editing the stop-word file.
    """
    if path not in _STOPWORD_PATTERN_CACHE:
        with open(path) as stopword_file:
            stopwords = json.load(stopword_file)
        _STOPWORD_PATTERN_CACHE[path] = [
            # re.escape keeps characters such as '.' or '(' in a stop word
            # from being interpreted as regex syntax.
            (word, re.compile(r"\b{}\b".format(re.escape(word))))
            for word in stopwords
        ]
    return _STOPWORD_PATTERN_CACHE[path]


# function to remove stop words and punctuation
def rm_stopwords_punctuation(text):
    """Lowercase ``text`` and strip stop words, punctuation and lone letters.

    Steps, in order:
      1. lowercase the text;
      2. delete every stop word listed in ``stopwords.json`` (whole-word
         matches only, applied one stop word at a time in file order);
      3. delete all ``string.punctuation`` characters;
      4. delete single letters left standing alone;
      5. collapse runs of whitespace to single spaces and trim the ends.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.

    Raises:
        OSError / json.JSONDecodeError: on the first call, if
        ``stopwords.json`` cannot be opened or parsed.
    """
    text = text.lower()
    for word, pattern in _stopword_patterns():
        # cheap substring test first; the regex then removes only complete
        # words ('\b' is a word boundary)
        if word in text:
            text = pattern.sub("", text)
    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    text = _SINGLE_LETTER.sub("", text)
    # normalise whitespace
    return ' '.join(text.split())

In [6]:
# function to stem every word with the Porter stemmer
def stem(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    porter = nltk.stem.porter.PorterStemmer()
    # stem word by word, then glue the pieces back into one string
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

In [7]:
# full cleaning pipeline: stop-word/punctuation removal followed by stemming
def process_text(text):
    """Clean ``text`` with rm_stopwords_punctuation, then stem the result with stem."""
    return stem(rm_stopwords_punctuation(text))

In [8]:
# Keep an untouched copy of the music reviews, and build a second copy whose
# reviewText column has been run through process_text.
music_test_noP = music_test.copy()
music_test_P = music_test.assign(reviewText=music_test['reviewText'].apply(process_text))

# show the first 10 processed rows
music_test_P.head(10)

Unnamed: 0,overall,reviewText,good
0,5,much write exactli suppos filter pop sound now...,1
1,5,product exactli quit afford realiz doubl scree...,1
2,5,primari job devic block breath otherwis produc...,1
3,5,nice windscreen protect mxl mic prevent pop th...,1
4,5,pop filter great look perform like studio filt...,1
5,5,good bought anoth one love heavi cord gold con...,1
6,5,use monster cabl year good reason lifetim warr...,1
7,3,now use cabl run output pedal chain input fend...,0
8,5,perfect epiphon sheraton ii monster cabl well ...,1
9,5,monster make best cabl lifetim warranti doesnt...,1


## Testing with a different dataset

In [9]:
start_time = time.time()

# Load the Video Games review file (a different product category) and discard
# the columns the analysis does not use, in one chained expression.
vg_train = pd.read_json("data/reviews_Video_Games_5.json", lines=True).drop(
    columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
)

# report how long the import took, in seconds
elapsed_time = time.time() - start_time
print("vg import:", elapsed_time, "sec")

# summary statistics of what is left
vg_train.describe()

9.726276874542236 seconds


Unnamed: 0,overall
count,231780.0
mean,4.086397
std,1.20233
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [10]:
start_time = time.time()

# Same labelling as for the music reviews: good is True when overall >= 5,
# then stored as 1/0. A vectorized cast replaces the per-row lambda, which
# matters at this row count.
vg_train2 = vg_train.assign(good=vg_train["overall"] >= 5)
vg_train = vg_train.assign(good=vg_train2["good"].astype("int64"))
vg_train_noP = vg_train.copy()

# Processed copy: reviewText run through process_text (stop words, punctuation, stemming).
vg_train_P = vg_train.copy()
vg_train_P['reviewText'] = vg_train['reviewText'].apply(process_text)

elapsed_time = time.time() - start_time
print("vg process:", elapsed_time, "sec")

KeyboardInterrupt: 

In [None]:
# Target vector for the video-game reviews: the 0/1 `good` column of vg_train.
y_vg_train = vg_train["good"]

# test stuff

In [None]:
# cv = skl.feature_extraction.text.CountVectorizer(list(trainFinal['reviewText']))
# Count vectorizer created with its default settings.
# NOTE(review): the notebook only does `import sklearn as skl`; whether the
# `skl.feature_extraction` submodule resolves without an explicit import
# presumably depends on the installed scikit-learn version — TODO confirm.
cv = skl.feature_extraction.text.CountVectorizer()

In [None]:
# counts = cv.fit_transform(list(trainFinal['reviewText']))
# vg_counts = cv.fit_transform(list(vg_train_noP['reviewText']))
# Fit the vectorizer on the video-game review text and build the count matrix.
# This reads vg_train, i.e. the text that has NOT been run through process_text
# (the processed text lives in vg_train_P).
vg_counts = cv.fit_transform(list(vg_train['reviewText']))

In [None]:
# Dimensions of the count matrix produced by the vectorizer.
vg_counts.shape

In [None]:
# Transformer fitted on the counts with IDF weighting switched off (use_idf=False).
tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False).fit(vg_counts)

In [None]:
# Apply the fitted no-IDF transformer to the same count matrix.
vg_train_tf = tf_transformer.transform(vg_counts)

In [None]:
# TF-IDF features: a transformer with default settings, fitted and applied to the counts in one step.
tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
vg_train_tfidf = tfidf_transformer.fit_transform(vg_counts)

In [None]:
# Logistic-regression classifier for the 0/1 `good` label.
sklmodel = skl.linear_model.LogisticRegression()
# The estimator needs a numeric feature matrix, so fit on the TF-IDF features
# built from vg_counts, not on the raw review strings. Rows of vg_train_tfidf
# and y_vg_train both come from vg_train, so they line up.
sklmodel = sklmodel.fit(vg_train_tfidf, y_vg_train)

In [None]:
# Accuracy on the training features. The TF-IDF matrix built above is named
# vg_train_tfidf; `train_tfidf` is never defined in this notebook.
sklmodel.score(vg_train_tfidf, y_vg_train)

In [None]:
# Mean of the 0/1 labels, i.e. the share of video-game reviews labelled good.
y_vg_train.mean()

In [None]:
# vocab = np.array(cv.get_feature_names())
# len()

## Naive Bayes prediction

In [None]:
# Fresh count vectorizer (rebinds `cv`, replacing the one fitted on the video-game text).
cv = skl.feature_extraction.text.CountVectorizer()

# Fit it on the processed music review text and build the count matrix.
music_counts = cv.fit_transform(list(music_test_P['reviewText']))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier trained on the word counts.
NB_clf = MultinomialNB()
# The target here is the `overall` rating column, not the 0/1 `good` label.
# NOTE(review): confirm that predicting the rating itself is intended.
y_music_test = music_test_P["overall"]
NB_clf.fit(music_counts, y_music_test)

In [None]:
# Re-vectorize the same processed music reviews with the already fitted vectorizer.
music_counts = cv.transform(list(music_test_P['reviewText']))

# music_counts = cv.transform(music_test_P['reviewText'])

# Predictions are made on the very reviews the classifier was fitted on.
predictions = NB_clf.predict(music_counts)

In [None]:
# Display the predicted values.
predictions


In [None]:
# Accuracy of the predictions: 1 minus the share of mismatches.
# The comparison is done position by position on the underlying arrays, so it
# does not depend on the Series index labels, and the counter no longer
# rebinds the built-in `sum` for the rest of the kernel session.
mismatches = int((predictions != y_music_test.values).sum())

1 - mismatches / len(predictions)

In [None]:
# The Iterable ABC lives in collections.abc; the alias that used to be
# importable from `collections` was removed in Python 3.10.
from collections.abc import Iterable

# Check whether the processed reviewText column is iterable.
isinstance(music_test_P['reviewText'], Iterable)

In [None]:
# NOTE(review): `iterable` does not appear to be a pandas Series attribute
# (TODO confirm), so this lookup is expected to raise AttributeError; it looks
# like leftover scratch from exploring whether the column is iterable.
music_test_P['reviewText'].iterable