# Using - joshk-rating-pred-2021-12-10.py

In [5]:
import pandas as pd
import tarfile
from tqdm import tqdm
import json
import numpy as np
import time
from copy import deepcopy
from src.data_mgmt.BaseDataClass import BaseDataClass as bdc

from surprise import SVD
from surprise import SVDpp
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from surprise import BaselineOnly

In [6]:
rawdf = bdc._loadUpDf('/Users/andrealevy/cmu_msba_2022_ml_applications_2/data/','train','.json.gz')
rawdf['date'] = rawdf['reviewTime']
df = rawdf.copy()

Loading JSON into Dataframe: 200000it [00:15, 12724.43it/s]


In [7]:
SAMPLING_RATE = 5/5
user_id_unique = df['reviewerID'].unique()
user_id_sample = pd.DataFrame(user_id_unique, columns=['unique_user_id']) \
                    .sample(frac= SAMPLING_RATE, replace=False, random_state=1)
ratings_sample = df.merge(user_id_sample, left_on='reviewerID', right_on='unique_user_id') \
                    .drop(['unique_user_id'], axis=1)

In [8]:
# most recent review is testset, second most recent is val, rest are train
ratings_sample['review_date_rank'] = ratings_sample.groupby('reviewerID')['date'].rank(ascending=False)
testset = ratings_sample[ratings_sample['review_date_rank'] == 1].copy()
valset = ratings_sample[ratings_sample['review_date_rank'] == 2].copy()
trainset = ratings_sample[ratings_sample['review_date_rank'] > 2].copy()

In [9]:
reader = Reader(rating_scale = (1.0, 5.0))
train_data = Dataset.load_from_df(trainset[['reviewerID','itemID','rating']], reader)
val_data = Dataset.load_from_df(valset[['reviewerID','itemID','rating']], reader)
test_data = Dataset.load_from_df(testset[['reviewerID','itemID','rating']], reader)
train_sr = train_data.build_full_trainset()
val_sr_before = val_data.build_full_trainset()
val_sr = val_sr_before.build_testset()
test_sr_before = test_data.build_full_trainset()
test_sr = test_sr_before.build_testset()
bsl_options = {'method': 'als', 'n_epochs':3}
bias_baseline = BaselineOnly(bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(val_sr)

Estimating biases using als...


In [12]:
RMSE_tune = {}
#min(list(RMSE_tune.values()))
n_epochs = [10, 15, 25, 50]  # the number of iteration of the SGD procedure
lr_all = [0.002, 0.003, 0.005, 0.008] # the learning rate for all parameters
reg_all =  [0.4, 0.5, 0.6, 0.7, 0.8] # the regularization term for all parameters
# n_epochs = [25]  # the number of iteration of the SGD procedure
# lr_all = [0.005] # the learning rate for all parameters
# reg_all =  [0.02] # the regularization term for all parameters
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            print(f'Starting n={n}, l={l}, r={r}')
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr)
            predictions = algo.test(val_sr)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)
algo_real = SVD(n_epochs = 25, lr_all = 0.008, reg_all = 0.8)
algo_real.fit(train_sr)
predictions = algo_real.test(test_sr)
accuracy.rmse(predictions)

Starting n=10, l=0.002, r=0.4
RMSE: 1.0929
Starting n=10, l=0.002, r=0.5
RMSE: 1.0920
Starting n=10, l=0.002, r=0.6
RMSE: 1.0929
Starting n=10, l=0.002, r=0.7
RMSE: 1.0928
Starting n=10, l=0.002, r=0.8
RMSE: 1.0939
Starting n=10, l=0.003, r=0.4
RMSE: 1.0882
Starting n=10, l=0.003, r=0.5
RMSE: 1.0877
Starting n=10, l=0.003, r=0.6
RMSE: 1.0886
Starting n=10, l=0.003, r=0.7
RMSE: 1.0877
Starting n=10, l=0.003, r=0.8
RMSE: 1.0888
Starting n=10, l=0.005, r=0.4
RMSE: 1.0822
Starting n=10, l=0.005, r=0.5
RMSE: 1.0813
Starting n=10, l=0.005, r=0.6
RMSE: 1.0821
Starting n=10, l=0.005, r=0.7
RMSE: 1.0819
Starting n=10, l=0.005, r=0.8
RMSE: 1.0822
Starting n=10, l=0.008, r=0.4
RMSE: 1.0763
Starting n=10, l=0.008, r=0.5
RMSE: 1.0755
Starting n=10, l=0.008, r=0.6
RMSE: 1.0763
Starting n=10, l=0.008, r=0.7
RMSE: 1.0761
Starting n=10, l=0.008, r=0.8
RMSE: 1.0764
Starting n=15, l=0.002, r=0.4
RMSE: 1.0884
Starting n=15, l=0.002, r=0.5
RMSE: 1.0889
Starting n=15, l=0.002, r=0.6
RMSE: 1.0890
Starting n=

1.0806867213189424

In [13]:
min(list(RMSE_tune.values()))

1.0714571741783212

In [17]:
#with minimum RMSE values
RMSE_tune = {}
#n_epochs = [10, 15, 25, 50]  # the number of iteration of the SGD procedure
#lr_all = [0.002, 0.003, 0.005, 0.008] # the learning rate for all parameters
#reg_all =  [0.4, 0.5, 0.6, 0.7, 0.8] # the regularization term for all parameters
n_epochs = [25]  # the number of iteration of the SGD procedure
lr_all = [0.008] # the learning rate for all parameters
reg_all =  [0.8] # the regularization term for all parameters
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            print(f'Starting n={n}, l={l}, r={r}')
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr)
            predictions = algo.test(val_sr)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)
algo_real = SVD(n_epochs = 25, lr_all = 0.008, reg_all = 0.8)
algo_real.fit(train_sr)
predictions = algo_real.test(test_sr)
accuracy.rmse(predictions)

Starting n=25, l=0.008, r=0.8
RMSE: 1.0719
RMSE: 1.0804


1.0803599537599349

In [19]:
RMSE_tune = {}
#n_epochs = [10, 15, 25, 50]  # the number of iteration of the SGD procedure
#lr_all = [0.002, 0.003, 0.005, 0.008] # the learning rate for all parameters
#reg_all =  [0.4, 0.5, 0.6, 0.7, 0.8] # the regularization term for all parameters
n_epochs = [25]  # the number of iteration of the SGD procedure
lr_all = [0.005] # the learning rate for all parameters
reg_all =  [0.02] # the regularization term for all parameters
for n in n_epochs:
    for l in lr_all:
        for r in reg_all:
            print(f'Starting n={n}, l={l}, r={r}')
            algo = SVD(n_epochs = n, lr_all = l, reg_all = r)
            algo.fit(train_sr)
            predictions = algo.test(val_sr)
            RMSE_tune[n,l,r] = accuracy.rmse(predictions)
algo_real = SVD(n_epochs = 25, lr_all = 0.008, reg_all = 0.8)
algo_real.fit(train_sr)
predictions = algo_real.test(test_sr)
accuracy.rmse(predictions)

Starting n=25, l=0.005, r=0.02
RMSE: 1.0785
RMSE: 1.0809


1.08092468532757

In [27]:
actualtestset = pd.read_csv('/Users/andrealevy/cmu_msba_2022_ml_applications_2/data/pairs_Purchase.txt')
actualtestset[['reviewerID','itemID']] = actualtestset['reviewerID-itemID'].str.split("-",expand=True)
actualtestset['rating'] = 0
actual_test_data = Dataset.load_from_df(actualtestset[['reviewerID','itemID','rating']], reader)
actual_test_sr_before = actual_test_data.build_full_trainset()
actual_test_sr = actual_test_sr_before.build_testset()
actual_predictions = algo_real.test(actual_test_sr)
accuracy.rmse(actual_predictions)

RMSE: 4.2541


4.2541170763070495

In [28]:
actual_pred_dict = {}
for i in actual_predictions:
    actual_pred_dict[i[0]] = i[3]

actual_test_output = actualtestset.copy()
actual_test_output['rating'] = actual_test_output['reviewerID'].map(actual_pred_dict)

In [29]:
actual_test_output

Unnamed: 0,reviewerID-itemID,prediction,reviewerID,itemID,rating
0,U938994110-I529819131,,U938994110,I529819131,4.170032
1,U181459539-I863471064,,U181459539,I863471064,4.449839
2,U941668816-I684585522,,U941668816,I684585522,3.649611
3,U768449391-I782253949,,U768449391,I782253949,4.370378
4,U640450168-I232683472,,U640450168,I232683472,4.137017
...,...,...,...,...,...
27995,U337041888-I763827121,,U337041888,I763827121,4.388414
27996,U457455307-I242828364,,U457455307,I242828364,3.862391
27997,U052546714-I111529174,,U052546714,I111529174,4.569858
27998,U566804667-I857242737,,U566804667,I857242737,3.588792


In [40]:
for i in range(len(actual_test_output)):
    if actual_test_output.iloc[i,4] >= 4.5:
        actual_test_output.iloc[i,1] = 1
    else:
        actual_test_output.iloc[i,1] = 0

In [41]:
actual_test_output

Unnamed: 0,reviewerID-itemID,prediction,reviewerID,itemID,rating
0,U938994110-I529819131,0.0,U938994110,I529819131,4.170032
1,U181459539-I863471064,0.0,U181459539,I863471064,4.449839
2,U941668816-I684585522,0.0,U941668816,I684585522,3.649611
3,U768449391-I782253949,0.0,U768449391,I782253949,4.370378
4,U640450168-I232683472,0.0,U640450168,I232683472,4.137017
...,...,...,...,...,...
27995,U337041888-I763827121,0.0,U337041888,I763827121,4.388414
27996,U457455307-I242828364,0.0,U457455307,I242828364,3.862391
27997,U052546714-I111529174,1.0,U052546714,I111529174,4.569858
27998,U566804667-I857242737,0.0,U566804667,I857242737,3.588792


In [42]:
actual_test_output[['reviewerID-itemID','prediction']].to_csv('Andrea-RatingsPurchase-2021-12-13v4.csv')

In [43]:
min(actual_test_output['rating'])

2.303946159698533