# Sentiment Analysis of Amazon Reviews (Musical Instruments) by Group 3

In [None]:
import json
import nltk # for porter stemmer
import numpy as np
import pandas as pd
# import patsy #for matrices
import re
import sklearn as skl
import string
import time

## Preprocessing data

In [None]:
# Read the small train and test samples (JSON-lines files: one record per line)
train = pd.read_json("data/music_200.json", lines=True)
test = pd.read_json("data/music_test_200.json", lines=True)

# Full musical-instrument review set; this is the frame the later cells use.
music_test = pd.read_json("data/reviews_Musical_Instruments_5.json", lines=True)

# display the top 5 preview -- kept as the last expression of the cell so the
# notebook actually renders it
train.head()

In [None]:
# Drop the review-metadata columns from the music reviews.
# (The equivalent drop for the small `train` / `test` samples is disabled.)
metadata_columns = [
    'asin', 'helpful', 'reviewTime', 'reviewerID',
    'reviewerName', 'summary', 'unixReviewTime',
]
music_test = music_test.drop(metadata_columns, axis=1)

## Classify the data based on review rating

In [None]:
# Label every music review: first a boolean "good" flag (overall rating of 5
# or above), then the same flag encoded as 1 (good) / 0 (not good).
test2 = music_test.assign(good=lambda frame: frame.overall >= 5)
good_as_int = test2['good'].apply(lambda flag: 1 if flag else 0)
music_test = music_test.assign(good=good_as_int)

# display the first 10 data
music_test.head(10)

In [None]:
# function to remove stop words and punctuation
# Compiled stop-word patterns, keyed by stop-word file path, so the JSON file
# is parsed and the regexes compiled once instead of once per review.
_STOPWORD_PATTERN_CACHE = {}

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _stopword_patterns(path="stopwords.json"):
    """Return ``[(word, compiled_pattern), ...]`` for the stop words in *path*.

    Each pattern matches the stop word only as a complete word. The word is
    passed through ``re.escape`` so regex metacharacters in a stop word (for
    example ``c++``) are matched literally instead of raising ``re.error``.
    The result is cached per path; re-running this cell clears the cache.
    """
    if path not in _STOPWORD_PATTERN_CACHE:
        with open(path, encoding="utf-8") as stopword_file:
            stopwords = json.load(stopword_file)
        _STOPWORD_PATTERN_CACHE[path] = [
            (word, re.compile(r"\b{}\b".format(re.escape(word))))
            for word in stopwords
        ]
    return _STOPWORD_PATTERN_CACHE[path]


def rm_stopwords_punctuation(text):
    """Lower-case *text* and strip stop words, punctuation and 1-letter words.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text with the remaining words separated by single spaces.

    Raises:
        OSError: If ``stopwords.json`` cannot be opened on the first call.
    """
    text = text.lower()
    # Stop words go first, while apostrophes are still present, so entries
    # such as "don't" can match before punctuation is stripped.
    for word, pattern in _stopword_patterns():
        # Cheap substring test to skip the regex for words that cannot match.
        if word in text:
            text = pattern.sub("", text)
    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # remove stray single letters (e.g. the "t" left over from "can't")
    text = re.sub(r"\b[a-z]\b", "", text)
    # collapse runs of whitespace and trim both ends
    return ' '.join(text.split())

In [None]:
# function to stem each word using the Porter stemmer
def stem(text):
    """Return *text* with every whitespace-separated word Porter-stemmed.

    The stemmed words are joined back together with single spaces.
    """
    porter = nltk.stem.porter.PorterStemmer()
    stemmed_words = map(porter.stem, text.split())
    return ' '.join(stemmed_words)

In [None]:
# apply the two functions defined above: stop-word/punctuation removal, then stemming
def process_text(text):
    """Clean *text* with rm_stopwords_punctuation, then stem the result."""
    return stem(rm_stopwords_punctuation(text))

In [None]:
# Keep two copies of the labelled music reviews: one with the original text
# (noP) and one whose review text has been run through process_text (P).
music_test_noP = music_test.copy()
music_test_P = music_test.copy()
processed_reviews = music_test['reviewText'].apply(process_text)
music_test_P['reviewText'] = processed_reviews

# display the first 10 data
music_test_P.head(10)

## Testing with different dataset

In [None]:
# Time the import of the video-game review set, which is read from a
# different file than the music reviews.
start_time = time.time()

raw_vg_reviews = pd.read_json("data/reviews_Video_Games_5.json", lines=True)

# drop the metadata columns that are not useful
vg_train = raw_vg_reviews.drop(
    ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime'],
    axis=1,
)

# report how long the import took, in seconds
elapsed_time = time.time() - start_time
print("vg import:", elapsed_time, "sec")

# summary statistics for the imported data
vg_train.describe()

In [None]:
start_time = time.time()

# repeat the same process: binary label, 1 when the rating is 5 (or above)
vg_train2 = vg_train.assign(good = lambda g: g.overall >= 5)
vg_train = vg_train.assign(good = vg_train2['good'].apply(lambda g: 1 if g else 0))
vg_train_noP = vg_train.copy()

# NOTE: despite its name this copy is NOT text-processed; processing the full
# set is disabled and only the 10% sample below is processed.
vg_train_P = vg_train.copy()
# vg_train_P['reviewText'] = vg_train['reviewText'].apply(lambda t: process_text(t))

# Process a random 10% sample only. The explicit .copy() makes the sample an
# independent frame, so the column assignment below cannot trigger pandas'
# SettingWithCopyWarning. The sample is not seeded, so every run picks
# different rows.
vg_train_P_sample = vg_train.sample(frac=0.1).copy()
vg_train_P_sample['reviewText'] = vg_train_P_sample['reviewText'].apply(process_text)

elapsed_time = time.time() - start_time
print("vg process:", elapsed_time, "sec")

# vg_train_P_sample.describe()

In [None]:
# Binary "good" labels for the video-game reviews currently held in vg_train.
# NOTE: y_vg_train is reassigned further down, after the processed sample is
# reloaded from disk.
y_vg_train = vg_train["good"]

In [None]:
# Summary statistics of the processed 10% video-game sample
vg_train_P_sample.describe()
# music_test_P.describe()

# Export processed data

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
# Write each frame to its JSON file, in order, and report every file saved.
exports = [
    (vg_train_P_sample, "vg_reviews_text_processed_10%.json"),
    (vg_train_noP, "vg_reviews_original_text.json"),
    (music_test_P, "music_reviews_text_processed.json"),
    (music_test_noP, "music_reviews_original_text.json"),
]
for frame, filename in exports:
    frame.to_json(filename)
    print("saved ", filename)

In [None]:
# Reload the processed 10% sample from disk; from here on vg_train refers to
# that sample rather than to the full video-game set.
vg_train = pd.read_json("vg_reviews_text_processed_10%.json")
# Labels taken from the reloaded sample
y_vg_train = vg_train["good"]
vg_train.describe()

# test stuff

In [None]:
# cv = skl.feature_extraction.text.CountVectorizer(list(trainFinal['reviewText']))
# Learn the vocabulary of the video-game review text and build the matching
# document-term count matrix.
cv_vg = skl.feature_extraction.text.CountVectorizer()
vg_counts = cv_vg.fit_transform(vg_train['reviewText'].values)

In [None]:
# counts = cv.fit_transform(list(trainFinal['reviewText']))
# vg_counts = cv.fit_transform(list(vg_train_noP['reviewText']))
# vg_counts = cv.fit_transform(list(vg_train['reviewText']))

In [None]:
# Shape of the count matrix: (number of reviews, number of vocabulary terms)
vg_counts.shape

In [None]:
# Fit a transformer with use_idf=False, i.e. without the inverse-document-
# frequency reweighting (term-frequency scaling only).
tf_transformer = skl.feature_extraction.text.TfidfTransformer(use_idf=False).fit(vg_counts)

In [None]:
# Apply the fitted term-frequency transformer to the video-game counts
vg_train_tf = tf_transformer.transform(vg_counts)

In [None]:
# TF-IDF weighting with default settings, fitted on and applied to the
# video-game count matrix in one step.
tfidf_transformer = skl.feature_extraction.text.TfidfTransformer()
vg_train_tfidf = tfidf_transformer.fit_transform(vg_counts)

In [None]:
# Train a logistic-regression classifier on the raw token counts of the
# video-game reviews (fit() returns the fitted estimator itself).
sklmodel = skl.linear_model.LogisticRegression().fit(vg_counts, y_vg_train)

In [None]:
# Training accuracy of the logistic regression. Score on the same raw-count
# features the model was fitted on: the TF-IDF matrix is scaled differently,
# so scoring on it does not measure the fitted model.
sklmodel.score(vg_counts, y_vg_train)

In [None]:
# Mean of the "good" labels, i.e. the share of reviews labelled good; this is
# what a classifier that always predicts "good" would score.
y_vg_train.mean()

# Logistic regression on music test data

In [None]:
y_music_test = music_test_P["good"]

# Re-use the video-game vocabulary so the music count matrix has exactly the
# columns the logistic regression was trained on. The ordered term list is
# derived from ``vocabulary_`` (term -> column index) rather than from
# ``get_feature_names()``, which newer scikit-learn releases no longer provide.
vg_vocab = sorted(cv_vg.vocabulary_, key=cv_vg.vocabulary_.get)
cv_music = skl.feature_extraction.text.CountVectorizer(vocabulary=vg_vocab)
music_counts = cv_music.fit_transform(music_test_P['reviewText'].values)

# music_vocab = cv_music.get_feature_names()
# print("music vocab:", len(music_vocab))
# print("vidya vocab:", len(vg_vocab))
# len(cv_music.get_feature_names())
# len(music_vocab + list(set(vg_vocab) - set(music_vocab)))

In [None]:
# Predict the "good" label of every music review with the model that was
# trained on the video-game reviews.
predictions = sklmodel.predict(music_counts)
predictions

In [None]:
# Accuracy on the music reviews: the fraction of predictions that equal the
# true label. The comparison is positional (``.values``), so it does not
# depend on the Series index, and the builtin ``sum`` is no longer rebound.
(predictions == y_music_test.values).mean()

## Naive Bayes prediction

In [None]:
# Build a vocabulary from the music reviews themselves and count the tokens.
# fit_transform already fits the vectorizer, so the separate fit() call on the
# same text was redundant and has been removed.
cv = skl.feature_extraction.text.CountVectorizer()
music_counts = cv.fit_transform(music_test_P['reviewText'].values)

# NOTE(review): this prints the shape of the video-game matrix, not
# music_counts.shape -- confirm which one was intended.
print(vg_counts.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the music token counts.
NB_clf = MultinomialNB()
# NOTE(review): the target here is the `overall` rating column, not the binary
# `good` label used for the logistic regression, and it rebinds y_music_test.
y_music_test = music_test_P["overall"]
NB_clf.fit(music_counts, y_music_test)

In [None]:
# Vectorize the music review text again with the fitted vectorizer.
# NOTE: this is the same text the classifier was fitted on, so the predictions
# (and the accuracy computed from them) describe the training data.
music_counts = cv.transform(list(music_test_P['reviewText']))

# music_counts = cv.transform(music_test_P['reviewText'])

predictions = NB_clf.predict(music_counts)

In [None]:
# Show the Naive Bayes predictions
predictions


In [None]:
# Accuracy of the Naive Bayes predictions: the fraction that equal the target
# values. The comparison is positional (``.values``), so it does not depend on
# the Series index, and the builtin ``sum`` is no longer rebound.
(predictions == y_music_test.values).mean()

In [None]:
# The abstract base classes live in collections.abc; the old alias in
# ``collections`` was removed in Python 3.10.
from collections.abc import Iterable

# Check whether the review-text column is iterable
isinstance(music_test_P['reviewText'], Iterable)

In [None]:
# NOTE(review): a pandas Series has no documented `iterable` attribute, so this
# line is expected to raise AttributeError -- looks like leftover debugging;
# confirm and remove.
music_test_P['reviewText'].iterable