In [None]:
# Quick sanity summary of the loaded review data.
# NOTE(review): `ratings` is created by the data-loading cell (the one that
# reads review.json); that cell must be executed before this one.
print('we have', ratings.shape[0], 'ratings')
print('the number of unique users we have is:', len(ratings.user_id.unique()))
print('the number of unique business we have is:', len(ratings.business_id.unique()))
# The rated entities are businesses (see the `business_id` column), not books.
print("The median user rated %d businesses." % ratings.user_id.value_counts().median())
print('The max rating is: %d' % ratings.rating.max(), "the min rating is: %d" % ratings.rating.min())
ratings.head()

In [1]:
import pandas as pd
import json
from tqdm import tqdm
import os
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import NormalPredictor
from surprise import BaselineOnly, SVD, KNNBasic
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from surprise.model_selection import split


# Location of the line-delimited JSON review dump (one JSON object per line).
DATA_FOLDER = 'data'
review_datafile = os.path.join(DATA_FOLDER,"review.json")

# Count lines by streaming the file so the progress bar knows its total.
# Using a context manager closes the handle, and summing over the iterator
# avoids holding every line in memory just to count them.
with open(review_datafile, encoding="utf-8") as f:
    line_count = sum(1 for _ in f)

# Collect only the fields needed for collaborative filtering, column by column.
user_ids, business_ids, stars, dates = [], [], [], []
with open(review_datafile, encoding="utf-8") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids.append(blob["user_id"])
        business_ids.append(blob["business_id"])
        stars.append(blob["stars"])
        dates.append(blob["date"])

ratings = pd.DataFrame(
   {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates}
)

# Users with at least 5 reviews.
user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

ModuleNotFoundError: No module named 'surprise'

In [None]:
# Fixed seed so the hold-out split and the CV folds are reproducible. With an
# integer seed, every call to kSplit.split(data) yields the SAME folds, so the
# algorithms evaluated in later cells are compared on identical data.
RANDOM_STATE = 42

# Ratings are star values on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'business_id', 'rating']], reader)

# 80/20 hold-out split.
train_set, test_set = train_test_split(data, test_size=.2, random_state=RANDOM_STATE)

# 2-fold cross-validation iterator shared by the evaluation cells.
kSplit = split.KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Step 1: tune BaselineOnly with a 3-fold grid search scored by RMSE.
param_grid = dict(
    bsl_options=dict(
        method=['als', 'sgd'],
        n_epochs=[10, 25],
        reg_u=[3, 5],
        reg_i=[3, 5],
    )
)
grid_search = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
# Best RMSE on the first line, the parameter combination that achieved it on the second.
print(grid_search.best_score['rmse'], grid_search.best_params['rmse'], sep='\n')

In [None]:
# Step 2: tune SVD with a 3-fold grid search scored by RMSE.
param_grid = dict(
    n_epochs=[25, 40],
    lr_all=[0.01, 0.02],
    reg_all=[0.2],
)
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
# Best RMSE on the first line, the parameter combination that achieved it on the second.
print(grid_search.best_score['rmse'], grid_search.best_params['rmse'], sep='\n')

In [None]:
# Step 3: tune CoClustering with a 3-fold grid search scored by RMSE.
param_grid = dict(
    n_epochs=[3, 5],
    n_cltr_u=[3, 5],
    n_cltr_i=[3, 5],
)
grid_search = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
# Best RMSE on the first line, the parameter combination that achieved it on the second.
print(grid_search.best_score['rmse'], grid_search.best_params['rmse'], sep='\n')

In [None]:
# Per-fold RMSE accumulators, one independent list per algorithm.
rmseBaseline, rmseSVD, rmseCo, rmseSlope = [], [], [], []

In [None]:
# Baseline predictor using ALS with fixed hyper-parameters.
baselineOnly = BaselineOnly(
    bsl_options=dict(method='als', n_epochs=25, reg_u=5, reg_i=3)
)
# Fit and score once per fold, collecting the RMSE of every fold.
for trainset, testset in kSplit.split(data):
    baselineOnly.fit(trainset)
    predictionsBaselineOnly = baselineOnly.test(testset)
    fold_rmse = accuracy.rmse(predictionsBaselineOnly, verbose=True)
    rmseBaseline.append(fold_rmse)

In [None]:
# SVD with fixed hyper-parameters.
svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.2)
# Fit and score once per fold, collecting the root mean squared error of every fold.
for trainset, testset in kSplit.split(data):
    svd.fit(trainset)
    predictionsSVD = svd.test(testset)
    fold_rmse = accuracy.rmse(predictionsSVD, verbose=True)
    rmseSVD.append(fold_rmse)

In [None]:
# Co-clustering with 3 user clusters, 3 item clusters and 3 epochs.
coClus = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=3)
# Fit and score once per fold, collecting the root mean squared error of every fold.
for trainset, testset in kSplit.split(data):
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    fold_rmse = accuracy.rmse(predictionsCoClus, verbose=True)
    rmseCo.append(fold_rmse)

In [None]:
# from surprise.prediction_algorithms.slope_one import SlopeOne
# slopeOne = SlopeOne()
# for trainset, testset in kSplit.split(data): #iterate through the folds.
#     slopeOne.fit(trainset)
#     predictionsSlope = slopeOne.test(testset)
#     rmseSlope.append(accuracy.rmse(predictionsSlope,verbose=True))  # get root mean squared error

In [None]:
from surprise import AlgoBase, PredictionImpossible
import numpy as np
class HybridFacto(AlgoBase):
    """Weighted blend of several already-fitted Surprise algorithms.

    The prediction is ``sum_k alpha[k] * est_k`` where ``est_k`` is the
    estimate of the k-th base model. The weights ``alpha`` start uniform and
    are tuned in :meth:`fit` by gradient descent on the mean squared error of
    the blend; after every step they are clipped to be non-negative and
    rescaled to sum to one, so the blend stays a convex combination.

    Parameters
    ----------
    epochs : int
        Maximum number of gradient steps.
    learning_rate : float
        Step size of each gradient step.
    num_models : int
        Number of base models being blended.
    models : sequence of fitted algorithms, optional
        Base models to blend. When omitted, the notebook-level
        ``baselineOnly``, ``svd`` and ``coClus`` objects are looked up at
        ``fit`` time (the behaviour the original code relied on).
    tol : float, optional
        Stop early once no weight moves by more than this in one step.
    """
    def __init__(self,epochs, learning_rate,num_models, models=None, tol=0.001):
        # AlgoBase sets up options that the inherited predict()/test() rely on.
        AlgoBase.__init__(self)
        self.alpha = np.full(num_models, 1.0 / num_models)
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.models = models
        self.tol = tol
        self._fitted_models = None

    def _resolve_models(self):
        """Return the base models as a list, checking it matches ``alpha``."""
        models = self.models
        if models is None:
            # Fall back to the models trained in earlier notebook cells.
            models = (baselineOnly, svd, coClus)
        models = list(models)
        if len(models) != len(self.alpha):
            raise ValueError(
                'num_models is %d but %d base models were supplied'
                % (len(self.alpha), len(models))
            )
        return models

    def fit(self,holdout):
        """Learn the blending weights from ``holdout`` (a Surprise ``Dataset``).

        The base models are NOT retrained here; they must already be fitted.
        Returns ``self``.
        """
        trainset = holdout.build_full_trainset()
        # Registers self.trainset, which estimate() and predict() need.
        AlgoBase.fit(self, trainset)
        testset = trainset.build_testset()
        models = self._resolve_models()
        self._fitted_models = models

        # Base-model estimates do not change between epochs, so compute them
        # once: one row per rating, one column per model.
        estimates = np.array(
            [[pred.est for pred in model.test(testset)] for model in models],
            dtype=float,
        ).T
        truth = np.array([rating for (_, _, rating) in testset], dtype=float)
        if truth.size == 0:
            return self

        for epoch in range(self.epochs):
            residual = estimates @ self.alpha - truth
            # Gradient of mean((estimates @ alpha - truth) ** 2) w.r.t. alpha.
            gradient = 2.0 * (estimates.T @ residual) / truth.size
            newalpha = np.clip(self.alpha - self.learning_rate * gradient, 0.0, None)
            total = newalpha.sum()
            if total > 0:
                newalpha = newalpha / total
            else:
                # Every weight was clipped to zero: restart from uniform.
                newalpha = np.full(len(self.alpha), 1.0 / len(self.alpha))
            # Convergence check on the largest absolute weight change.
            converged = np.max(np.abs(newalpha - self.alpha)) < self.tol
            self.alpha = newalpha
            if converged:
                break
        return self

    def estimate(self,u,i):
        """Blend the base models' estimates for inner user ``u`` and item ``i``.

        Raises ``PredictionImpossible`` when the user or item is unknown to
        the trainset seen in :meth:`fit`.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        models = self._fitted_models
        if models is None:
            models = self._resolve_models()
        # The base models were fitted on their own trainsets, so their inner
        # ids differ from ours; address them through raw ids instead.
        raw_u = self.trainset.to_raw_uid(u)
        raw_i = self.trainset.to_raw_iid(i)
        algoResults = np.array([model.predict(raw_u, raw_i).est for model in models])
        return float(np.dot(self.alpha, algoResults))
        

In [None]:
# Learn blending weights for the three base models, then score the blend.
hybrid = HybridFacto(epochs = 10,learning_rate = 0.05,num_models = 3)
hybrid.fit(data)
rmseHyb = []
# NOTE(review): the weights were fitted on all of `data`, so these per-fold
# scores are measured on ratings the blend has already seen.
for trainset, testset in kSplit.split(data): #iterate through the folds.
    predhybrid = hybrid.test(testset)  # was `Hyhybrid`, an undefined name
    rmseHyb.append(accuracy.rmse(predhybrid))

In [None]:
# sim_options = {'name': 'cosine',
#                'user_based': False  # compute similarities between items
#                }
# collabKNN = KNNBasic(k=40,sim_options=sim_options) # try removing sim_options; you'll find memory errors.
# collabKNN = KNNBasic(k=40)
# Baseline predictor (ALS) with a second set of hyper-parameters.
baseline_als = BaselineOnly(
    bsl_options=dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
)
# Start from empty score lists.
# NOTE(review): this rebinds rmseSVD / rmseCo / rmseSlope, so scores collected
# in earlier cells are no longer reachable through these names; and rmseKNN /
# predictionsKNN hold baseline-ALS results despite their names.
rmseKNN, rmseSVD, rmseCo, rmseSlope = [], [], [], []
# Fit and score once per fold, collecting the root mean squared error of every fold.
for trainset, testset in kSplit.split(data):
    baseline_als.fit(trainset)
    predictionsKNN = baseline_als.test(testset)
    fold_rmse = accuracy.rmse(predictionsKNN, verbose=True)
    rmseKNN.append(fold_rmse)

In [None]:
from surprise.prediction_algorithms.co_clustering import CoClustering
# Co-clustering again, now with 4 user clusters, 4 item clusters and 25 epochs.
coClus = CoClustering(n_epochs=25, n_cltr_u=4, n_cltr_i=4)
# Fit and score once per fold, collecting the root mean squared error of every fold.
for trainset, testset in kSplit.split(data):
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    fold_rmse = accuracy.rmse(predictionsCoClus, verbose=True)
    rmseCo.append(fold_rmse)

In [None]:
# Algorithms to benchmark: name -> [algorithm]. The loop below stores each
# algorithm's RMSE in slot 1 of its list, giving [algorithm, rmse].
algos = {'Baseline_ALS': [BaselineOnly(bsl_options={'method': 'als',
                                   'n_epochs': 5,
                                   'reg_u': 12,
                                   'reg_i': 5
                                   }),]
        }


# algos = {'Baseline_ALS': BaselineOnly(bsl_options={'method': 'als',
#                                    'n_epochs': 5,
#                                    'reg_u': 12,
#                                    'reg_i': 5
#                                    }),
#          'Baseline_SGD': BaselineOnly(bsl_options={'method': 'sgd',
#                                    'learning_rate': .00005,
#                                    }),
#          'SVD': SVD(),
#          'NeighborhoodBased_User': KNNBasic(sim_options={'name': 'cosine',
#                                'user_based': True
#                                })
#         }
algo_list = {}
# NOTE(review): `trainset` / `testset` are whatever the last executed CV loop
# left behind (its final fold); they are not defined in this cell.
for algo_name in algos:
    print (f"Algo is: {algo_name}")
    entry = algos[algo_name]
    if not isinstance(entry, list):
        # Accept a bare algorithm too (the shape used by the disabled
        # registry above) by wrapping it in the [algorithm, rmse] list form.
        entry = algos[algo_name] = [entry]
    # Slot 0 is the algorithm itself; the original called .fit on the list.
    algo = entry[0]
    algo.fit(trainset)
    predictions = algo.test(testset)
    algo_accuracy = accuracy.rmse(predictions)
    # Store the score in slot 1, creating it on first run (assigning to
    # index 1 of a one-element list raises IndexError).
    if len(entry) > 1:
        entry[1] = algo_accuracy
    else:
        entry.append(algo_accuracy)
    print (algo_accuracy)