In [1]:
import pandas as pd
import json
import gzip
import numpy as np
import random
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise import SVD,SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

from surprise.model_selection import GridSearchCV
from tqdm import tqdm
import matplotlib.pyplot as plt
from surprise.model_selection.validation import cross_validate

C:\Users\pruth\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
C:\Users\pruth\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
df = pd.read_csv(r"C:\Users\pruth\Downloads\final project\baseline\datasets\google_reviews.csv")

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user_id","gmap_id","rating"]], reader)
raw_ratings = data.raw_ratings

# re arranging the dataset
random.shuffle(raw_ratings)

# splitting the data set into 80:20 training and testing data set
train_test_split_index = int(0.8 * len(raw_ratings))
raw_ratings_train = raw_ratings[:train_test_split_index]
raw_ratings_test = raw_ratings[train_test_split_index:]

# constructing the train set
data.raw_ratings = raw_ratings_train  # assign the raw data back

# constructing the test set
testset = data.construct_testset(raw_ratings_test)

In [5]:
"""
SVDpp parameters
N_factors: The number of factors.
N_epochs: The number of iterations of the SGD procedure.
lr_all – The learning rate for all parameters.
reg_all – The regularization term for all parameters.
"""

# constructing different parameter with different range of values
### note: these values are changed in the whole process of training to arrive at this specific values based on rmse scores.
svdpp_params = {
        "n_factors": [10, 50],
    "n_epochs": [10, 50],
    "lr_all": [0.001, 0.01],
    "reg_all": [0.02, 0.1]
    }

# using grid search cv with above parameters with cross validation of 3 and refit = True
grid_search = GridSearchCV(
    SVD,
    param_grid = svdpp_params,
    measures=["rmse"],
    cv=3,
    refit=True,
    n_jobs=-1,
    joblib_verbose=1
)

# fitting the data
grid_search.fit(data)

# getting best model out of the grid search and best parameters
best_model = grid_search.best_estimator["rmse"]
print(grid_search.best_params)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   19.1s finished


{'rmse': {'n_factors': 10, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.1}}


In [7]:
# pickle the model in case the run time stop in between
import pickle
with open(r"C:\Users\pruth\Downloads\final project\models\google_svd.pickle", 'wb') as f:
    pickle.dump(best_model, f)

In [8]:

# calculating rmse on test set
testset_predictions = best_model.test(testset)

accuracy.mae(testset_predictions)

MAE:  0.6599


0.6598561127923851

In [9]:
accuracy.rmse(testset_predictions)


RMSE: 0.9172


0.91718727853865