In [3]:
import pandas as pd
import json
import gzip
import numpy as np
import random
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise import SVD,SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

from surprise.model_selection import GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt
from surprise.model_selection.validation import cross_validate

In [4]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Parameters
  ----------
  path : str or path-like
      Location of a gzip file containing one JSON document per line.

  Yields
  ------
  object
      The result of ``json.loads`` for each non-blank line, in file order.

  Raises
  ------
  OSError
      If the file cannot be opened or is not valid gzip data.
  json.JSONDecodeError
      If a non-blank line is not valid JSON.
  """
  # The `with` block closes the gzip handle once the generator is exhausted,
  # explicitly closed, or aborted by an exception (the original leaked it).
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank/whitespace-only lines (e.g. a trailing newline at EOF)
      # instead of crashing in json.loads.
      if not l.strip():
        continue
      yield json.loads(l)

In [6]:
def getDF(path):
  """Read every record produced by ``parse(path)`` into a pandas DataFrame.

  Records are numbered 0, 1, 2, ... in the order they are yielded; those
  numbers become the row index and the record keys become the columns.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Location of the gzip-compressed reviews file (machine-specific absolute path).
reviews_gz_path = r"C:\Users\pruth\Downloads\final project\baseline\datasets\review-Wyoming_10.json.gz"
df = getDF(reviews_gz_path)

In [7]:
# Display the loaded reviews frame as this cell's output.
df

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,111435346941066248277,Jenn Thomas,1602893531994,5,"When knowledge is key and kindness matters, Ni...",,,0x8758dd1ca83449d9:0xb6156dcfc5e04c9b
1,103613723635264236030,Michelle Barta,1575991509006,5,The entire team is outstanding! They are profe...,,"{'time': 1580484156696, 'text': 'We are lucky ...",0x8758dd1ca83449d9:0xb6156dcfc5e04c9b
2,113556685435236755030,Bethany Baldes,1626981484302,5,,,,0x8758dd1ca83449d9:0xb6156dcfc5e04c9b
3,102235352482458236558,nichole vogt,1583381337082,5,,,,0x8758dd1ca83449d9:0xb6156dcfc5e04c9b
4,114021734869233847378,Kristi Friday,1571112557173,4,,,"{'time': 1571246498430, 'text': 'Thanks for th...",0x8758dd1ca83449d9:0xb6156dcfc5e04c9b
...,...,...,...,...,...,...,...,...
427803,107899148903686037545,Carlene Calabaza,1614741323428,5,,,,0x53349429523d615b:0x2a66a9272032e8b4
427804,117387828996525813955,Gizmo,1561931911835,5,,,,0x53349429523d615b:0x2a66a9272032e8b4
427805,107887136514890657569,Jan Soucek,1501029366785,5,,,,0x53349429523d615b:0x2a66a9272032e8b4
427806,104765216668868688471,Kyle Leatherow,1565466860108,3,,,,0x53349429523d615b:0x2a66a9272032e8b4


In [13]:
# Write the full reviews frame to CSV, leaving out the row index.
reviews_csv_path = r"C:\Users\pruth\Downloads\final project\baseline\datasets\google_reviews.csv"
df.to_csv(reviews_csv_path, index=False)

In [8]:
# Seed the RNGs before anything random happens so the shuffle below, and
# therefore the train/test split and the metrics reported later, are the same
# on every run. The Surprise FAQ recommends seeding both `random` and numpy.
# NOTE(review): parallel grid-search workers may still use their own random
# state - confirm if bit-for-bit reproducibility of the tuning step is needed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Build a Surprise dataset from the (user_id, gmap_id, rating) columns;
# ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user_id","gmap_id","rating"]], reader)
raw_ratings = data.raw_ratings

# Shuffle in place so the split below is a random sample, not file order.
random.shuffle(raw_ratings)

# 80:20 split: the first 80% is used for tuning/training, the rest is held out.
train_test_split_index = int(0.8 * len(raw_ratings))
raw_ratings_train = raw_ratings[:train_test_split_index]
raw_ratings_test = raw_ratings[train_test_split_index:]

# `data` now only contains the training portion, so cross-validation and
# fitting done on `data` never see the held-out ratings.
data.raw_ratings = raw_ratings_train  # assign the raw data back

# Held-out ratings converted to the testset format expected by `algo.test()`.
testset = data.construct_testset(raw_ratings_test)

In [9]:
"""
SVDpp parameters
n_factors: The number of factors.
n_epochs: The number of iterations of the SGD procedure.
lr_all: The learning rate for all parameters.
reg_all: The regularization term for all parameters.
"""

# Candidate values for each SVDpp hyper-parameter.
# Note: these ranges were adjusted over several training runs, guided by the
# RMSE scores, before settling on the values below.
svdpp_params = dict(
    n_factors=[10, 50],
    n_epochs=[10, 50],
    lr_all=[0.001, 0.01],
    reg_all=[0.02, 0.1],
)

# Exhaustive search over the grid, scored by RMSE with 3-fold cross-validation.
# refit=True makes the best estimator available after fitting; n_jobs=-1 runs
# the fits in parallel and joblib_verbose=1 prints joblib's progress lines.
grid_search = GridSearchCV(
    SVDpp,
    svdpp_params,
    measures=["rmse"],
    cv=3,
    refit=True,
    n_jobs=-1,
    joblib_verbose=1,
)
grid_search.fit(data)

# Keep the estimator that scored best on RMSE and report its parameters.
best_model = grid_search.best_estimator["rmse"]
print(grid_search.best_params)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.4min finished


{'rmse': {'n_factors': 10, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.1}}


In [14]:
# Persist the tuned model to disk so it is not lost if the runtime stops midway.
import pickle

model_pickle_path = r"C:\Users\pruth\Downloads\final project\models\google_svdpp.pickle"
with open(model_pickle_path, mode='wb') as model_file:
    pickle.dump(best_model, model_file)

In [11]:

# Predict every held-out (user_id, gmap_id, rating) entry with the best model
testset_predictions = best_model.test(testset)

# Mean absolute error of those predictions (this cell reports MAE, not RMSE)
accuracy.mae(testset_predictions)

MAE:  0.6535


0.6534657686415346

In [12]:
# Root-mean-squared error of the same held-out test-set predictions
accuracy.rmse(testset_predictions)


RMSE: 0.9095


0.9095397751966761