# Amazon Review Data (2018)

In [None]:
import pandas as pd
import json
import gzip
import numpy as np
import random
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise import SVD,SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

from surprise.model_selection import GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt
from surprise.model_selection.validation import cross_validate

In [None]:
def parse(path):
  """Yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file that stores one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed; previously the handle was left open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

In [6]:
def getDF(path):
  """Load every record yielded by ``parse(path)`` into a pandas DataFrame.

  Each record is keyed by its 0-based position in the stream; those
  positions become the row index of the returned frame.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read the gzipped Electronics review file into a DataFrame.
df = getDF(
  r"C:\Users\pruth\Downloads\final project\baseline\datasets\Electronics_5.json.gz"
)

In [7]:
# Preview the loaded reviews as the cell output.
df

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,67,True,"09 18, 1999",AAP7PPBU72QFM,0151004714,{'Format:': ' Hardcover'},D. C. Carrad,This is the best novel I have read in 2 or 3 y...,A star is born,937612800,
1,3.0,5,True,"10 23, 2013",A2E168DTVGE6SV,0151004714,{'Format:': ' Kindle Edition'},Evy,"Pages and pages of introspection, in the style...",A stream of consciousness novel,1382486400,
2,5.0,4,False,"09 2, 2008",A1ER5AYS3FQ9O3,0151004714,{'Format:': ' Paperback'},Kcorn,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...,1220313600,
3,5.0,13,False,"09 4, 2000",A1T17LMQABMBN5,0151004714,{'Format:': ' Hardcover'},Caf Girl Writes,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!,968025600,
4,3.0,8,True,"02 4, 2000",A3QHJ0FXK33OBE,0151004714,{'Format:': ' Hardcover'},W. Shane Schmidt,I was taken in by reviews that compared this b...,A dissenting view--In part.,949622400,
...,...,...,...,...,...,...,...,...,...,...,...,...
6739585,4.0,,True,"03 21, 2017",A33MAQA919J2V8,B01HJH40WU,,Kurt Wurm,"These seem like quality USB cables, time will ...",Four Stars,1490054400,
6739586,4.0,,True,"01 9, 2017",A1AKHSCPD1BHM4,B01HJH40WU,,C.L Momof3,"Works great, love the longer cord. As with any...",Nice long cord,1483920000,
6739587,5.0,2,True,"12 1, 2016",A2HUZO7MQAY5I2,B01HJH40WU,,michael clontz,"Ok here is an odd thing that happened to me, I...",Not the correct product as linked in the sale.,1480550400,
6739588,5.0,2,True,"11 29, 2016",AJJ7VX2L91X2W,B01HJH40WU,,Faith,Works well.,Five Stars,1480377600,


In [9]:
# Save the loaded reviews as CSV; header=False omits the column-name row.
df.to_csv(
    path_or_buf=r"C:\Users\pruth\Downloads\final project\baseline\datasets\amazon_review.csv",
    header=False,
)

In [12]:
# Wrap the (user, item, rating) columns in a Surprise dataset on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["reviewerID","asin","overall"]], reader)
raw_ratings = data.raw_ratings

# Shuffle the ratings in place before splitting. A dedicated generator with a
# fixed seed keeps the split (and therefore the reported test metrics)
# reproducible across runs without touching the global `random` state.
random.Random(42).shuffle(raw_ratings)

# Split the shuffled ratings 80:20 into training and held-out test portions.
train_test_split_index = int(0.8 * len(raw_ratings))
raw_ratings_train = raw_ratings[:train_test_split_index]
raw_ratings_test = raw_ratings[train_test_split_index:]

# From here on `data` only contains the training portion.
data.raw_ratings = raw_ratings_train  # assign the raw data back

# Build the test set from the held-out ratings.
testset = data.construct_testset(raw_ratings_test)

In [13]:
"""
SVDpp parameters
N_factors: The number of factors.
N_epochs: The number of iterations of the SGD procedure.
lr_all – The learning rate for all parameters.
reg_all – The regularization term for all parameters.
"""

# constructing different parameter with different range of values
### note: these values are changed in the whole process of training to arrive at this specific values based on rmse scores.
svdpp_params = {
        "n_factors": [10, 50],
    "n_epochs": [10, 50],
    "lr_all": [0.001, 0.01],
    "reg_all": [0.02, 0.1]
    }

# using grid search cv with above parameters with cross validation of 3 and refit = True
grid_search = GridSearchCV(
    SVDpp,
    param_grid = svdpp_params,
    measures=["rmse"],
    cv=3,
    refit=True,
    n_jobs=-1,
    joblib_verbose=1
)

# fitting the data
grid_search.fit(data)

# getting best model out of the grid search and best parameters
best_model = grid_search.best_estimator["rmse"]
print(grid_search.best_params)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 22.3min finished


{'rmse': {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}}


In [15]:
# Persist the tuned model to disk so it survives an interrupted session.
import pickle
with open(r"C:\Users\pruth\Downloads\final project\models\svdpp.pickle", 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [16]:

# Predict the held-out test set with the tuned model, then report the MAE.
testset_predictions = best_model.test(testset)

accuracy.mae(testset_predictions)

MAE:  0.7578


0.7578238082278719

In [17]:
# Report the RMSE for the same test-set predictions.
accuracy.rmse(testset_predictions)


RMSE: 1.0803


1.080331676097945

In [None]:
# Reload the raw reviews. This file is gzip-compressed JSON lines (it is read
# that way earlier in the notebook), not comma-separated text, so it is loaded
# with getDF rather than pd.read_csv.
df = getDF(r"C:\Users\pruth\Downloads\final project\baseline\datasets\Electronics_5.json.gz")