# Sentiment Analysis of Amazon Reviews (Musical Instruments) by Group 3

In [1]:
import json
import nltk # for porter stemmer
import numpy as np
import pandas as pd
import re
import sklearn as skl
import string
import time

## Preprocessing data

### Text processing methods

In [2]:
# function to remove stop words and punctuation

# Cache of (stopword, compiled whole-word pattern) pairs. It is filled on the
# first call so that "stopwords.json" is read and parsed once instead of once
# per review, which is what made processing large datasets so slow.
_STOPWORD_PATTERNS = None


def _load_stopword_patterns():
    """Return the cached list of (stopword, compiled regex), loading it on first use."""
    global _STOPWORD_PATTERNS
    if _STOPWORD_PATTERNS is None:
        with open("stopwords.json") as stopword_file:
            stopwords = json.load(stopword_file)
        # re.escape keeps characters such as "." or "+" inside a stopword from
        # being interpreted as regex syntax ('\b' is a word boundary).
        _STOPWORD_PATTERNS = [
            (word, re.compile(r"\b{}\b".format(re.escape(word))))
            for word in stopwords
        ]
    return _STOPWORD_PATTERNS


def rm_stopwords_punctuation(text):
    """Lower-case `text` and strip stopwords, punctuation and single letters.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces (no leading or trailing
        whitespace); an empty string if nothing remains.

    Notes
    -----
    Stopwords are read from "stopwords.json" in the current working directory
    on the first call and cached afterwards, so later edits to that file are
    not picked up until the kernel is restarted.
    """
    text = text.lower()
    for word, pattern in _load_stopword_patterns():
        # cheap substring test first; the regex then removes complete words only
        if word in text:
            text = pattern.sub("", text)
    # remove punctuation (ASCII punctuation listed in string.punctuation)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove single letters left over, e.g. the "t" of a split contraction
    text = re.sub(r"\b[a-z]\b", "", text)
    # collapse runs of whitespace and trim both ends
    return ' '.join(text.split())

In [3]:
# function to reduce every word to its stem using the Porter stemmer
def stem(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    porter = nltk.stem.porter.PorterStemmer()
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

In [4]:
# full text pipeline: stopword/punctuation removal followed by stemming
def process_text(text):
    """Apply rm_stopwords_punctuation and then stem to `text`; return the result."""
    return stem(rm_stopwords_punctuation(text))

## Import and process music dataset

In [5]:
# Load the Musical Instruments reviews (JSON Lines file: one review per line)
music_test = pd.read_json(path_or_buf="data/reviews_Musical_Instruments_5.json", lines=True)

In [6]:
# Drop the metadata columns that are not used in this analysis
music_test = music_test.drop(
    ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'],
    axis=1,
)

#### Classify the data based on review rating

In [7]:
# Add a binary "good" column: 1 when the overall rating is 5 (the maximum), else 0.
# The comparison is vectorized, so no intermediate frame or per-row lambda is needed.
music_test = music_test.assign(good=(music_test["overall"] >= 5).astype("int64"))

# display the first 10 rows
music_test.head(10)

Unnamed: 0,overall,reviewText,good
0,5,"Not much to write about here, but it does exac...",1
1,5,The product does exactly as it should and is q...,1
2,5,The primary job of this device is to block the...,1
3,5,Nice windscreen protects my MXL mic and preven...,1
4,5,This pop filter is great. It looks and perform...,1
5,5,So good that I bought another one. Love the h...,1
6,5,"I have used monster cables for years, and with...",1
7,3,I now use this cable to run from the output of...,0
8,5,Perfect for my Epiphone Sheraton II. Monster ...,1
9,5,Monster makes the best cables and a lifetime w...,1


In [8]:
# Keep one copy with the original review text, and build a second frame whose
# reviewText has been run through process_text (defined above)
music_test_noP = music_test.copy()
music_test_P = music_test.assign(reviewText=music_test['reviewText'].apply(process_text))

# display the first 10 rows
music_test_P.head(10)

Unnamed: 0,overall,reviewText,good
0,5,much write exactli suppos filter pop sound now...,1
1,5,product exactli quit afford realiz doubl scree...,1
2,5,primari job devic block breath otherwis produc...,1
3,5,nice windscreen protect mxl mic prevent pop th...,1
4,5,pop filter great look perform like studio filt...,1
5,5,good bought anoth one love heavi cord gold con...,1
6,5,use monster cabl year good reason lifetim warr...,1
7,3,now use cabl run output pedal chain input fend...,0
8,5,perfect epiphon sheraton ii monster cabl well ...,1
9,5,monster make best cabl lifetim warranti doesnt...,1


## Import and process video game dataset

In [9]:
start_time = time.time()

# Load the Video Games reviews (the training data) and drop the metadata
# columns that are not used in this analysis
vg_train = pd.read_json("data/reviews_Video_Games_5.json", lines=True).drop(
    ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'],
    axis=1,
)

# report how long the import took, in seconds
elapsed_time = time.time() - start_time
print("vg import:", elapsed_time, "sec")

vg_train.describe()

vg import: 11.644109010696411 sec


Unnamed: 0,overall
count,231780.0
mean,4.086397
std,1.20233
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [10]:
start_time = time.time()

# Same labelling as for the music data: good = 1 for a rating of 5, else 0
vg_train = vg_train.assign(good=(vg_train["overall"] >= 5).astype("int64"))
vg_train_noP = vg_train.copy()

# Text-processing the full dataset takes too long, so only a 10% sample is processed.
# random_state pins the sample so that re-running the notebook yields the same
# training set, and therefore reproducible scores further down.
vg_train_P_sample = vg_train.sample(frac=0.1, random_state=0)
vg_train_P_sample['reviewText'] = vg_train_P_sample['reviewText'].apply(process_text)

# report how long labelling and text processing took, in seconds
elapsed_time = time.time() - start_time
print("vg process:", elapsed_time, "sec")

vg_train_P_sample.describe()

vg process: 187.19876503944397 sec


Unnamed: 0,overall,good
count,23178.0,23178.0
mean,4.077487,0.512124
std,1.199424,0.499864
min,1.0,0.0
25%,4.0,0.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


## Export processed data

In [11]:
# Write each frame to JSON with DataFrame.to_json and report the file name
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
for frame, out_path in (
    (vg_train_P_sample, "vg_reviews_text_processed_10%.json"),
    (vg_train_noP, "vg_reviews_original_text.json"),
    (music_test_P, "music_reviews_text_processed.json"),
    (music_test_noP, "music_reviews_original_text.json"),
):
    frame.to_json(out_path)
    print("saved ", out_path)

saved  vg_reviews_text_processed_10%.json
saved  vg_reviews_original_text.json
saved  music_reviews_text_processed.json
saved  music_reviews_original_text.json


### Import preprocessed data

This lets us skip every cell above this point except the imports in the first cell, which the cells below still need.

In [12]:
# Reload the processed frames written by the export cell.
# From here on, vg_train is the processed 10% sample and music_test is the processed music data.
music_test = pd.read_json("music_reviews_text_processed.json")
vg_train = pd.read_json("vg_reviews_text_processed_10%.json")

vg_train.describe()

Unnamed: 0,good,overall
count,23178.0,23178.0
mean,0.512124,4.077487
std,0.499864,1.199424
min,0.0,1.0
25%,0.0,4.0
50%,1.0,5.0
75%,1.0,5.0
max,1.0,5.0


# Prediction Algorithms: Logistic Regression

## Train logistic regression on video games

In [13]:
# Bag-of-words count features built from the processed review text
cv_vg = skl.feature_extraction.text.CountVectorizer()
vg_counts = cv_vg.fit_transform(vg_train['reviewText'].values)

# Binary target: the "good" column
y_vg_train = vg_train["good"]

In [14]:
# (rows, columns) of the count matrix returned by CountVectorizer.fit_transform
vg_counts.shape

(23178, 78085)

In [15]:
# Alternative feature representations. Nothing below uses them, but they could
# replace vg_counts when fitting (see the commented-out fit/score lines).
tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False)
vg_train_tf = tf_transformer.fit_transform(vg_counts)
tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
vg_train_tfidf = tfidf_transformer.fit_transform(vg_counts)

In [16]:
# Fit logistic regression on the raw counts (fit returns the fitted estimator)
sklmodel = skl.linear_model.LogisticRegression().fit(vg_counts, y_vg_train)
# alternative: fit on TF-IDF features instead
# sklmodel = sklmodel.fit(vg_train_tfidf, y_vg_train)

### Test full video game dataset against itself

In [17]:
# Score on the same data the model was fitted on, so this is a training score
# and not an estimate of performance on unseen reviews
sklmodel.score(vg_counts, y_vg_train)
# sklmodel.score(vg_train_tfidf, y_vg_train)

0.92932953662956252

In [18]:
# Baseline for comparison: the fraction of reviews labelled good (1),
# i.e. the accuracy obtained by always predicting "good"
y_vg_train.mean()

0.51212356544999571

## Re-train and test logistic regression on only video games

We can split the video game dataset into two randomized subsets, 75% for training and 25% for testing.

In [19]:
# Rebuild the count features from the processed review text before splitting
vg_review_texts = vg_train['reviewText'].values
cv_vg = skl.feature_extraction.text.CountVectorizer()
vg_counts = cv_vg.fit_transform(vg_review_texts)

In [20]:
from sklearn.model_selection import train_test_split

# Binary target and count features for the split
vg_y = vg_train['good']
vg_X = vg_counts

# random_state=0 fixes the RNG seed, so the split is the same on every run
vg_X_train, vg_X_test, vg_y_train, vg_y_test = train_test_split(
    vg_X, vg_y, random_state=0
)

In [21]:
# Fit a fresh logistic regression on the training split only
sklmodel = skl.linear_model.LogisticRegression().fit(vg_X_train, vg_y_train)

In [22]:
# Score on the held-out test split (rows the model was not fitted on)
sklmodel.score(vg_X_test, vg_y_test)

0.71078515962036237

## Test logistic regression on music dataset

In [23]:
y_music_test = music_test["good"]

# Reuse the vectorizer fitted on the video game reviews so the music count
# matrix has exactly the columns the model was trained on. This replaces
# building a second CountVectorizer from cv_vg.get_feature_names(), which
# yields the same matrix but relies on a method that newer scikit-learn
# releases no longer provide.
music_counts = cv_vg.transform(music_test['reviewText'].values)

# Words that occur only in the music reviews are ignored here. That is fine:
# the model has no coefficients for words it was never trained on.

In [24]:
# Predict the good/not-good label of every music review from its count features
predictions = sklmodel.predict(music_counts)
predictions

array([0, 0, 1, ..., 0, 1, 0])

In [25]:
# Accuracy = 1 - (misclassified reviews / all reviews).
# The comparison is positional (`.values`), so it does not depend on the
# frame's index labels, and the builtin `sum` is no longer overwritten.
n_wrong = int(np.count_nonzero(predictions != y_music_test.values))

1 - n_wrong/len(predictions)

0.5284085371796121

In [26]:
# Term-frequency features (no IDF weighting) for the music reviews
music_tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False)
music_train_tf = music_tf_transformer.fit_transform(music_counts)
# TF-IDF variant, currently disabled:
# music_tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
# music_train_tfidf = music_tfidf_transformer.fit_transform(music_counts)

In [27]:
# NOTE(review): sklmodel was fitted on raw count features (vg_X_train comes from
# CountVectorizer), but here it is applied to TfidfTransformer(use_idf=False)
# output, which is on a different scale. The score computed from these
# predictions is therefore not comparable with the count-based one; confirm
# whether a model fitted on TF features was intended.
predictions = sklmodel.predict(music_train_tf)
predictions

array([1, 1, 1, ..., 1, 1, 0])

In [28]:
# Accuracy = 1 - (misclassified reviews / all reviews).
# The comparison is positional (`.values`), so it does not depend on the
# frame's index labels, and the builtin `sum` is no longer overwritten.
n_wrong = int(np.count_nonzero(predictions != y_music_test.values))

1 - n_wrong/len(predictions)

0.6229412337978755

# Prediction Algorithms: Naive Bayes

## Train naive bayes on full video game dataset

In [29]:
# Target for naive Bayes is the "overall" rating column itself (not the binary "good" flag)
y_vg_train = vg_train["overall"]

# Bag-of-words count features built from the processed review text
cv_vg = skl.feature_extraction.text.CountVectorizer()
vg_counts = cv_vg.fit_transform(vg_train['reviewText'].values)

print(vg_counts.shape)

(23178, 78085)


In [30]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, fitted on the full count matrix.
# The fit call is the last expression so the estimator repr is shown as output.
NB_clf = MultinomialNB()
NB_clf.fit(vg_counts, y_vg_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### test full dataset against itself

In [31]:
# Score on the same data the classifier was fitted on (training score)
NB_clf.score(vg_counts, y_vg_train)

0.69000776598498581

In [32]:
# This is what `score(vg_counts, y_vg_train)` basically is:
# 1 - (misclassified reviews / all reviews).
# The comparison is positional (`.values`) because vg_train is a sample whose
# index labels are not 0..n-1; the builtin `sum` is no longer overwritten.
predictions = NB_clf.predict(vg_counts)
n_wrong = int(np.count_nonzero(predictions != y_vg_train.values))

1 - n_wrong/len(predictions)

0.6900077659849857

## Re-train and test Naive Bayes on only video games

We can split the video game dataset into two randomized subsets, 75% for training and 25% for testing.

In [33]:
# Rebuild the count features from the processed review text before splitting
vg_review_texts = vg_train['reviewText'].values
cv_vg = skl.feature_extraction.text.CountVectorizer()
vg_counts = cv_vg.fit_transform(vg_review_texts)

In [34]:
from sklearn.model_selection import train_test_split

# "overall" rating target and count features for the split
vg_y = vg_train['overall']
vg_X = vg_counts

# random_state=0 fixes the RNG seed, so the split is the same on every run
vg_X_train, vg_X_test, vg_y_train, vg_y_test = train_test_split(
    vg_X, vg_y, random_state=0
)

Score with a "normal" `alpha=1`

In [35]:
# Fit on the training split with smoothing parameter alpha=1
NB_clf = MultinomialNB(alpha=1)
NB_clf.fit(vg_X_train, vg_y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [36]:
# Score on the training split (data the classifier was fitted on)
NB_clf.score(vg_X_train, vg_y_train)

0.67439452338491634

In [37]:
# Score on the held-out test split
NB_clf.score(vg_X_test, vg_y_test)

0.5358067299396031

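If we set `alpha=0.1`, the score of testing the dataset against *itself* improves dramatically (from about 0.67 to 0.84), but the score against the test dataset does not improve; it drops slightly (from about 0.536 to 0.524). The lower `alpha` therefore only makes the model fit the training data more closely.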

In [38]:
# Re-fit on the training split with a smaller smoothing parameter, alpha=0.1
NB_clf = MultinomialNB(alpha=0.1)
NB_clf.fit(vg_X_train, vg_y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [39]:
# Score on the training split (data the classifier was fitted on)
NB_clf.score(vg_X_train, vg_y_train)

0.84179945924178801

In [40]:
# Score on the held-out test split
NB_clf.score(vg_X_test, vg_y_test)

0.52372735116479718

In [41]:
# alpha=1 is what we're going to be working with:
# re-fit on the training split so NB_clf is the alpha=1 model from here on

NB_clf = MultinomialNB(alpha=1)
NB_clf.fit(vg_X_train, vg_y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

## Test naive Bayes on music dataset

In [42]:
# Use music_test, the processed frame reloaded in the "Import preprocessed data"
# cell, rather than music_test_P: music_test_P only exists if the slow
# preprocessing cells were run in this session, so the notebook would fail
# here with a NameError when started from the import cell.
y_music_test = music_test["overall"]

# Reuse the vectorizer fitted on the video game reviews so the music count
# matrix has exactly the columns the classifier was trained on. This replaces
# building a second CountVectorizer from cv_vg.get_feature_names(), which
# yields the same matrix but relies on a method that newer scikit-learn
# releases no longer provide.
music_counts = cv_vg.transform(music_test['reviewText'].values)

In [43]:
# Predict the "overall" rating of every music review from its count features
predictions = NB_clf.predict(music_counts)

In [44]:
# Accuracy = 1 - (misclassified reviews / all reviews).
# The comparison is positional (`.values`), so it does not depend on the
# frame's index labels, and the builtin `sum` is no longer overwritten.
n_wrong = int(np.count_nonzero(predictions != y_music_test.values))

1 - n_wrong/len(predictions)

0.6238183412922718

In [45]:
# The same accuracy, computed with the classifier's own score method
NB_clf.score(music_counts, y_music_test)

0.62381834129227176

## not used anymore

### (Not used) Train naive Bayes on video game dataset

preserved for possible future use, but should be deleted

In [46]:
# cv = skl.feature_extraction.text.CountVectorizer()
# music_counts = cv.fit_transform(music_test_P['reviewText'].values)

# print(vg_counts.shape)

In [47]:
# from sklearn.naive_bayes import MultinomialNB

# NB_clf = MultinomialNB()
# y_music_test = music_test_P["overall"]
# NB_clf.fit(music_counts, y_music_test)

In [48]:
# music_counts = cv.transform(list(music_test_P['reviewText']))
# 
# music_counts = cv.transform(music_test_P['reviewText'].values)
# 
# predictions = NB_clf.predict(music_counts)

In [49]:
# predictions

In [50]:
# sum = 0
# for i in range(0, len(predictions)):
#     if predictions[i] != y_music_test[i]:
#         sum += 1

# 1 - sum/len(predictions)