# Sentiment Analysis of Amazon Reviews (Musical Instruments) by Group 3

In [None]:
import json
import nltk # for porter stemmer
import numpy as np
import pandas as pd
import patsy #for matrices
import re
import sklearn as skl
import string
import time


## Preprocessing data

In [None]:
# Load the training and test review sets; lines=True reads one JSON record per line.
train, test = (
    pd.read_json(path, lines=True)
    for path in ("data/music_200.json", "data/music_test_200.json")
)
# Preview the first five training rows.
train.head()


In [None]:
# Columns that the rest of the notebook does not use.
_unused_columns = ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
train = train.drop(columns=_unused_columns)
test = test.drop(columns=_unused_columns)
# Preview what remains of the test set.
test.head()

## Classify the data based on review rating

In [None]:
# Add a boolean "good" column: True when the overall rating is at least 5.
train2 = train.assign(good=train['overall'] >= 5)
# Re-encode that flag as a binary label: 1 for good reviews, 0 for the rest.
trainFinal = train.assign(good=train2['good'].apply(int))

# Preview the first ten labelled rows.
trainFinal.head(10)

In [None]:
# function to remove stop word and punctuations 
# Stopword collections keyed by file path, so each JSON file is parsed only
# once instead of once per processed review.
_STOPWORDS_CACHE = {}

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _load_stopwords(path="stopwords.json"):
    """Return the stopwords stored in the JSON file at *path*, reading it at most once."""
    if path not in _STOPWORDS_CACHE:
        with open(path, encoding="utf-8") as stopword_file:
            _STOPWORDS_CACHE[path] = json.load(stopword_file)
    return _STOPWORDS_CACHE[path]


def rm_stopwords_punctuation(text, stopwords=None):
    """Lower-case *text* and strip stopwords, punctuation and single letters.

    Args:
        text: The raw text to clean.
        stopwords: Optional iterable of stopword strings. When omitted, the
            words are loaded from ``stopwords.json`` in the working directory
            (the file is read once and then cached).

    Returns:
        The cleaned text with the remaining words separated by single spaces.

    Raises:
        OSError: If *stopwords* is omitted and ``stopwords.json`` cannot be opened.
    """
    text = text.lower()
    if stopwords is None:
        stopwords = _load_stopwords()
    for word in stopwords:
        # Cheap substring test first; the regex only runs for candidates.
        if word in text:
            # Replace only complete words ('\b' is a word boundary). The word
            # is escaped so characters such as '.' or '(' are matched
            # literally instead of being interpreted as regex syntax.
            text = re.sub(r"\b{}\b".format(re.escape(word)), "", text)
    # Remove punctuation.
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove stray single letters (e.g. the "s" left over from "it's").
    text = re.sub(r"\b[a-z]\b", "", text)
    # Collapse runs of whitespace and trim both ends.
    return ' '.join(text.split())

In [None]:
# function to stem every word of the text using NLTK's Porter stemmer
def stem(text):
    """Return *text* with every whitespace-separated word Porter-stemmed.

    The stemmed words are joined back together with single spaces.
    """
    porter = nltk.stem.porter.PorterStemmer()
    # Stem each word individually, then rebuild the string.
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

In [None]:
# a method to apply 2 functions described above stopwords,punctuation and stemmer
def process_text(text):
    """Clean *text* with rm_stopwords_punctuation, then stem the result."""
    return stem(rm_stopwords_punctuation(text))

In [None]:
# Snapshot the labelled training data before its review text is processed.
trainNoProcess = trainFinal.copy()
# Run every review through process_text (stopword/punctuation removal + stemming).
trainFinal['reviewText'] = trainFinal['reviewText'].apply(process_text)

# Preview the first ten processed rows.
trainFinal.head(10)

### Load the test data, unprocessed and processed, and store it in matrices

In [None]:
# # just a test block to compare the result if we pick the overall rating greater or equal to 4
# test2 = test.assign(good = lambda g: g.overall >= 4)
# test2Final = test2.assign(good = test2['good'].apply(lambda g: 1 if g else 0))
# test2NoProcess = test2Final.copy()
# test2Final['reviewText'] = test2Final['reviewText'].apply(lambda t: process_text(t))
# #test2NoProcess.head(10)
# #test2Final.head(10)

In [None]:
# # store trained data as matrices Y contains binary classification and X contain text to process as list later
# y, X = patsy.dmatrices("good ~ reviewText", trainFinal, return_type="dataframe")
# y_test, X_test = patsy.dmatrices("good ~ reviewText", test2Final, return_type="dataframe")
# y_noP, X_noP = patsy.dmatrices("good ~ reviewText", trainNoProcess, return_type="dataframe")
# y_test_noP, X_test_noP = patsy.dmatrices("good ~ reviewText", test2NoProcess, return_type="dataframe")

## Testing with different dataset

In [None]:
# Time how long reading and trimming the review file takes.
start_time = time.time()
#music_test = pd.read_json("data/reviews_Musical_Instruments_5.json", lines=True)

# Read a different category (video games) as test data and discard the
# columns that are not used.
vg_test = pd.read_json("data/reviews_Video_Games_5.json", lines=True).drop(
    columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
)

# Report the elapsed wall-clock time in seconds.
elapsed_time = time.time() - start_time
print(f"{elapsed_time} seconds")

# Summary statistics of the loaded data.
vg_test.describe()

### Repeat Classification (with no processing)

In [None]:
# #start_time = time.time()

# # repeat the same process, create classficiation 
# vg_test2 = vg_test.assign(good = lambda g: g.overall >= 5)
# vg_testFinal = vg_test.assign(good = vg_test2['good'].apply(lambda g: 1 if g else 0))
# vg_test_noP = vg_testFinal.copy()

# # vg_testFinal['reviewText'] = vg_testFinal['reviewText'].apply(lambda t: process_text(t))

# #put into matrices
# y_vg_test, X_vg_test = patsy.dmatrices("good ~ reviewText", vg_test_noP, return_type="dataframe")
# #elapsed_time = time.time() - start_time
# #print(elapsed_time)
# # show test data with no processing which has 10261 rows × 3 columns
# vg_test_noP.head(20)

In [None]:
# # testing time efficiency and checking length of the data
# start_time = time.time()
# print(len(vg_test_noP))
# #y_vg_test_noP, X_vg_test_noP = patsy.dmatrices("good ~ reviewText", vg_test_noP, return_type="dataframe")
# elapsed_time = time.time() - start_time
# print(len(y_vg_test_noP), len(X_vg_test_noP))
# print(elapsed_time)

# Logistic Regression

In [None]:
# logRegrModel = skl.linear_model.LogisticRegression()
# logRegrModel = logRegrModel.fit(X, y['good'])

In [None]:
# logRegrModel.score(X, y['good'])

In [None]:
# logRegrModel.score(X_test, y_test['good'])

In [None]:
# logRegrModel_noP = skl.linear_model.LogisticRegression()
# logRegrModel_noP = logRegrModel.fit(X_noP, y_noP['good'])

In [None]:
# logRegrModel_noP.score(X_noP, y_noP['good'])

In [None]:
# logRegrModel.score(X_test_noP, y_test_noP['good'])

In [None]:
# y.mean()

In [None]:
# logRegrModel.score(X_vg_test, y_vg_test['good'])
# logRegrModel_noP.fit_transform(X_vg_test_noP, np.ravel(y_vg_test_noP))
# logRegrModel_noP.score(X_vg_test_noP, y_vg_test_noP['good'])
# logRegrModel_noP.score(X_noP, y_noP['good'])
# y_vg_test_noP.mean()

In [None]:
# pair each feature column name with its logistic-regression coefficient and show the pairs as a DataFrame
# pd.DataFrame(list(zip(X_vg_test_noP.columns, np.transpose(logRegrModel_noP.coef_))))

In [None]:
# big_noP = skl.linear_model.LogisticRegression()
# big_noP = logRegrModel.fit(X_vg_test_noP, y_vg_test_noP['good'])

In [None]:
# big_noP.score(X_vg_test_noP, y_vg_test_noP['good'])

# test stuff

In [None]:
# cv = skl.feature_extraction.text.CountVectorizer(list(trainFinal['reviewText']))
cv = skl.feature_extraction.text.CountVectorizer()

In [None]:
cv

In [None]:
trainFinal['reviewText'][199]

In [None]:
# counts = cv.fit_transform(list(trainFinal['reviewText']))
counts = cv.fit_transform(list(vg_test_noP['reviewText']))

In [None]:
counts.shape

In [None]:
tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False).fit(counts)

In [None]:
train_tf = tf_transformer.transform(counts)
train_tf

In [None]:
tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(counts)
train_tfidf

In [None]:
sklmodel = skl.linear_model.LogisticRegression()
sklmodel = sklmodel.fit(train_tfidf, y_vg_test_noP['good'])

In [None]:
sklmodel.score(train_tfidf, y_vg_test_noP['good'])

In [None]:
y.mean()