### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader 
from scipy.stats import zscore
from sklearn import metrics
import random
from surprise.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#accessing the model and memory based techniques
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore 
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import CoClustering

#accessing the similarity metrics
from math import sqrt
from surprise import accuracy
from surprise.model_selection.validation import cross_validate

### Accessing the data

In [2]:
#defining the file path 
file_path = './cleanedDatasets/cleaned_user_ratings.csv'

#reading the data using pandas library
user_ratings_df = pd.read_csv(file_path, encoding= 'unicode_escape')
user_ratings_df

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2
...,...,...,...
8472,2424,1016,5
8473,2424,1017,4
8474,2424,1018,1
8475,2424,1019,2


### Preparing the data in a suitable format

In [3]:
#creating a reader object
#line format defines the order to match with the file - since reading with pandas not needed
#sep is the comma in this case since it is a csv file - since reading with pandas not needed
#rating_scale it is a tuple defining the lowest and highest possible range, if a number is not included in the tuple it will be ignored. 
reader = Reader(rating_scale=(0, 5))

#importing the data in a fromat surprise to be able to work with it - by loading it from the pandas data frame
#data = Dataset.load_from_df(user_ratings_df[['course_id','user_id','rating']], reader=reader) #did not work as later on for the KNNBasic user-based 
data = Dataset.load_from_df(user_ratings_df[['user_id','course_id','rating']], reader=reader)

In [4]:
data

<surprise.dataset.DatasetAutoFolds at 0x22b4630a410>

##### Splitting the data 

In [5]:
raw_ratings_data = data.raw_ratings #lists the raw ratings in the following order: course_id, user_id, rating, and the timestamp

In [6]:
data.raw_ratings

[(1001, 2001, 5.0, None),
 (1002, 2001, 3.0, None),
 (1003, 2001, 1.0, None),
 (1004, 2001, 0.0, None),
 (1005, 2001, 2.0, None),
 (1006, 2001, 1.0, None),
 (1007, 2001, 0.0, None),
 (1008, 2001, 0.0, None),
 (1009, 2001, 0.0, None),
 (1010, 2001, 0.0, None),
 (1011, 2001, 3.0, None),
 (1012, 2001, 0.0, None),
 (1013, 2001, 0.0, None),
 (1014, 2001, 4.0, None),
 (1015, 2001, 1.0, None),
 (1016, 2001, 0.0, None),
 (1017, 2001, 2.0, None),
 (1018, 2001, 0.0, None),
 (1019, 2001, 1.0, None),
 (1020, 2001, 3.0, None),
 (1001, 2002, 3.0, None),
 (1002, 2002, 5.0, None),
 (1003, 2002, 0.0, None),
 (1004, 2002, 2.0, None),
 (1005, 2002, 1.0, None),
 (1006, 2002, 0.0, None),
 (1007, 2002, 4.0, None),
 (1008, 2002, 5.0, None),
 (1009, 2002, 0.0, None),
 (1010, 2002, 1.0, None),
 (1011, 2002, 0.0, None),
 (1012, 2002, 3.0, None),
 (1013, 2002, 2.0, None),
 (1014, 2002, 0.0, None),
 (1015, 2002, 5.0, None),
 (1016, 2002, 2.0, None),
 (1017, 2002, 0.0, None),
 (1018, 2002, 3.0, None),
 (1019, 2002

In [7]:
#before splitting the data it would be ideal to shuffle 
random.shuffle(raw_ratings_data) 

In [8]:
data.raw_ratings #to confirm that the data is shuffled properly, it was compared to the cleaned user rating csv file - note the order of columns is different of course than the orignal file

[(1016, 2343, 5.0, None),
 (1009, 2191, 0.0, None),
 (1018, 2274, 5.0, None),
 (1018, 2420, 1.0, None),
 (1014, 2346, 2.0, None),
 (1018, 2251, 0.0, None),
 (1005, 2224, 5.0, None),
 (1016, 2021, 3.0, None),
 (1018, 2308, 0.0, None),
 (1019, 2378, 0.0, None),
 (1020, 2038, 2.0, None),
 (1012, 2183, 0.0, None),
 (1017, 2415, 5.0, None),
 (1001, 2192, 5.0, None),
 (1013, 2370, 2.0, None),
 (1001, 2273, 4.0, None),
 (1005, 2221, 1.0, None),
 (1017, 2414, 4.0, None),
 (1007, 2395, 5.0, None),
 (1006, 2393, 5.0, None),
 (1018, 2264, 1.0, None),
 (1016, 2123, 0.0, None),
 (1004, 2010, 4.0, None),
 (1001, 2297, 0.0, None),
 (1016, 2040, 0.0, None),
 (1019, 2180, 4.0, None),
 (1016, 2226, 5.0, None),
 (1009, 2300, 2.0, None),
 (1020, 2408, 0.0, None),
 (1003, 2258, 4.0, None),
 (1006, 2322, 3.0, None),
 (1005, 2164, 2.0, None),
 (1010, 2082, 4.0, None),
 (1001, 2163, 2.0, None),
 (1002, 2421, 5.0, None),
 (1020, 2073, 0.0, None),
 (1009, 2200, 5.0, None),
 (1020, 2234, 5.0, None),
 (1012, 2257

In [9]:
#20% assigned to the testing set 
#train_data_raw, test_data_raw = train_test_split(raw_ratings_data, test_size=0.2) # did not work when trying to use the predic on the test set for the KNN Basic method since there were too many values to unpack
ratio = int(len(raw_ratings_data)*0.8)
train_data_raw = raw_ratings_data[:ratio] 
test_data_raw = raw_ratings_data[ratio:] 

#data is set to the training dataset
data.raw_ratings = train_data_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_data_raw)

In [10]:
# Calculate the lengths (i.e., the number of data points) of the training and testing sets
train_data_length = len(train_data_raw)
test_data_length = len(test_data_raw)

# Print out the sizes of the training and testing data sets
print(f"Number of data points in training set: {train_data_length}")
print(f"Number of data points in testing set: {test_data_length}")

# If you also want to show this as a percentage of the total data
total_data_length = train_data_length + test_data_length
train_data_percentage = (train_data_length / total_data_length) * 100
test_data_percentage = (test_data_length / total_data_length) * 100

print(f"Percentage of data in training set: {train_data_percentage:.2f}%")
print(f"Percentage of data in testing set: {test_data_percentage:.2f}%")

Number of data points in training set: 6781
Number of data points in testing set: 1696
Percentage of data in training set: 79.99%
Percentage of data in testing set: 20.01%


### Defining the different techniques for collaborative filtering to determine which one is best

In [11]:
#create dict for different memory-based  and model based recommendation algorithms
recommendation_algorithms =[KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(), SVD(), BaselineOnly(), CoClustering() ] 
results = {} #to store the scores

### Cross-validation performed on all recommendation algorithms

Hyper parameters of cross validation method:

- cv - defines the type of folding for the model we want to use
- n-jobs - determines the number of folds evaluated in parallel. 

In [12]:
for algorithm in recommendation_algorithms:
    #kfold set to 5
    crossval_scores = cross_validate(algorithm, data, measures=["MAE", "MSE", "RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_mse': 'MSE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(algorithm).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE


Unnamed: 0,MAE,MSE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.108614,1.869364,1.367153,0.045175,0.005567
co_clustering.CoClustering,1.236002,2.316279,1.521731,0.074348,0.00328
knns.KNNWithZScore,1.277944,2.404095,1.550133,0.004145,0.016753
knns.KNNWithMeans,1.278986,2.406977,1.551429,0.003925,0.018909
knns.KNNBaseline,1.287121,2.436153,1.56074,0.004081,0.018863
knns.KNNBasic,1.306588,2.483572,1.575728,0.002078,0.011285
baseline_only.BaselineOnly,1.362512,2.603363,1.613201,0.003183,0.004086


### Testing with different parameters

 Preparing the dataset to be used in a way that cross-validation can be applied directly on the entire dataset without the need for a separate test set initially defined 

In [13]:
#using cross-validation to help us test how well the model works accross different subsets of the data
trainset = data.build_full_trainset() #using all the ratings

#finding out how many users and courses we have in the dataset
print('Number of users: ', trainset.n_users, '\n') #20 users that rated
print('Number of items: ', trainset.n_items, '\n') #424 number of courses

Number of users:  20 

Number of items:  424 



#### Hyperparameters for the KNN models

- K parameter - indicates the the upper limit of similar items we want the algorithm to consider. For example in this case if the user rated 21 courses, but k is set to 5, when estimating a rating of a new course which has not yet been rated, only 5 out of the 21 courses that are the closest to the new course will be considered. 

- Similarity option, that defines the way to calculate it. All the similarity functions will return a number between 0 and 1 to a specific (i, j) item pair. 1 means the ratings are perfectly aligned, 0 means there is no connection between the two items. 

Three similarity metrics in the surprise similarity module: cosine, msd,pearson

#### KNNBasic

##### KNNBasic: User-Based Cosine

In [14]:
#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the cosine similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.6611
MSE: 2.7592
MAE:  1.3860

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.45   {'actual_k': 15, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.4534300660410833, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.4119905922113345, details={'actual_k': 15, 'was_impossible': False})]

##### KNNBasic: Item-Based Cosine

In [15]:
# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting
# #using user-based cosine similarity
# params = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between users
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset) #--------------> had to comment this entire section as ZeroDivisionError was displayed

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

##### KNNBasic: User - Based MSD 

In [16]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2340
MSE: 1.5227
MAE:  1.0157

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.36   {'actual_k': 15, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.364969078902907, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.840239138855415, details={'actual_k': 15, 'was_impossible': False})]

##### KNNBasic: Item-Based MSD

In [17]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7262
MSE: 0.5273
MAE:  0.4577

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.61   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.6111166254597973, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8720268159554507, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic - cosine, user, shrinkage

In [18]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,
    "shrinkage": 0 #no shrinkage
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2340
MSE: 1.5227
MAE:  1.0157

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.36   {'actual_k': 15, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.364969078902907, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.840239138855415, details={'actual_k': 15, 'was_impossible': False})]

##### KNNBasic - MSD, user, shrinkage

In [19]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,
    "shrinkage": 0 #shrinkage is 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7262
MSE: 0.5273
MAE:  0.4577

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.61   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.6111166254597973, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8720268159554507, details={'actual_k': 40, 'was_impossible': False})]

##### Applying GridSearch

In [20]:
params_grid = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

grid_search_KNNBasic = GridSearchCV(KNNBasic, params_grid, measures=['RMSE','MAE'], cv=5, n_jobs=-1)                               
grid_search_KNNBasic.fit(data)

print(f'\nRMSE Best Parameters: {grid_search_KNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score: {grid_search_KNNBasic.best_score["rmse"]}')
print(f'MAE Best Parameters: {grid_search_KNNBasic.best_params["mae"]}')
print(f'MAE Best Score: {grid_search_KNNBasic.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.104449885479608
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.794641481477073


Applying the hyperparameters found in grid_search_KNNBasic 

In [21]:
finalKBL = KNNBaseline(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8)
predictions = finalKBL.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2066
MSE: 1.4559
MAE:  0.9870


0.9869825995680053

#### KNN With K-Means

In [22]:
algo = KNNWithMeans()

#applying cross validation 
#kfold set to 5
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5643  1.5277  1.5223  1.5540  1.5958  1.5528  0.0266  
MAE (testset)     1.2851  1.2640  1.2539  1.2757  1.3227  1.2803  0.0237  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.02    0.01    0.01    0.01    0.02    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2064
MSE: 1.4555
MAE:  0.9864

Predict Tests: 
user: 1001       item: 2001       r_ui = No

[Prediction(uid=1001, iid=2001, r_ui=5, est=2.266743575072092, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.028221106071155, details={'actual_k': 15, 'was_impossible': False})]

##### KNNWithMeans - user-based cosine 

In [23]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7282
MSE: 0.5302
MAE:  0.4845

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.32   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.3223985390862563, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.707173851819456, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans - item-based cosine 

In [24]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7282
MSE: 0.5302
MAE:  0.4845

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.32   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.3223985390862563, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.707173851819456, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: user-based MSD 

In [25]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based msd similarity
parans = {
    "name": "msd",
    "user_based": True,  # Compute similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7282
MSE: 0.5302
MAE:  0.4845

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.32   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.3223985390862563, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.707173851819456, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans - item-based MSD 

In [26]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "msd",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7282
MSE: 0.5302
MAE:  0.4845

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.32   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.3223985390862563, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.707173851819456, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans - Applying the Grid Search

In [27]:
#doing this with name is msd or cosine causes float or zero division error
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

gsKWM = GridSearchCV(KNNWithMeans, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKWM.fit(data)

print(f'\nRMSE Best Parameters: {gsKWM.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKWM.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKWM.best_params["mae"]}')
print(f'MAE Best Score: {gsKWM.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.1309977018220738
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.8461392078649247


##### Applying the best hyperparameters for it 

In [28]:
finalKWM = KNNWithMeans(method = "als", n_epochs = 5, user_based = False, min_support = 3)
predictions = finalKWM.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2064
MSE: 1.4555
MAE:  0.9864


0.986379678516048

#### KNN With Z Score

##### Applying the Grid Search

In [29]:
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "reg_u": [10, 12, 15],
                            "reg_i": [8, 10, 12],
                            "learning_rate": [0.0005, 0.05]}}

gsKZS = GridSearchCV(KNNWithZScore, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKZS.fit(data)

print(f'\nRMSE Best Parameters: {gsKZS.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKZS.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKZS.best_params["mae"]}')
print(f'MAE Best Score: {gsKZS.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
RMSE Best Score: 1.1396880633377524
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
MAE Best Score: 0.8510379455921224


##### Applying the best parameters found

In [30]:
finalKZS = KNNWithZScore(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8, learning_rate = 0.0005)
predictions = finalKZS.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2070
MSE: 1.4568
MAE:  0.9857


0.9857359920400004

#### SVD 

In [31]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3768  1.3524  1.3623  1.3530  1.3885  1.3666  0.0141  
MAE (testset)     1.1124  1.0993  1.1035  1.0741  1.1309  1.1040  0.0185  
Fit time          0.04    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 0.7817
MSE: 0.6111
MAE:  0.5659

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.13   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.40   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.40   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.1280576363752655, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.0799165463858804, details={'was_impossible': False})]

##### Applying Grid Search

In [32]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.495891867844709
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2597245001363473


##### Applying the best parameters

In [33]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.3056
MSE: 1.7045
MAE:  1.0920


1.0919878626719564

#### BaselineOnly

In [34]:
# Initialize the BaselineOnly algorithm
baseline_algo = BaselineOnly()

# Cross validate with k-fold set to 5
cross_validate(baseline_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


trainset = data.build_full_trainset()
baseline_algo.fit(trainset)


#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(baseline_algo.test(testset))
accuracy.mse(baseline_algo.test(testset))
accuracy.mae(baseline_algo.test(testset))

print('\nPredict Tests: ')
print(baseline_algo.predict(1001, 2001))
print(baseline_algo.predict(2001, 1001))
print(baseline_algo.predict(1001, 88))
print(baseline_algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
baseline_algo.test(testset)[:2]


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5928  1.6215  1.6377  1.5983  1.6146  1.6130  0.0162  
MAE (testset)     1.3372  1.3743  1.3918  1.3414  1.3620  1.3613  0.0204  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Estimating biases using als...

Accuracy on the testset:
RMSE: 1.5631
MSE: 2.4434
MAE:  1.3153

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.67   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.40   {'was_impossible': False}
user: 1001       item: 5          

[Prediction(uid=1001, iid=2001, r_ui=5, est=1.6660657529504799, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.005488095643174, details={'was_impossible': False})]

##### Applying the Grid Search

In [35]:
# Define the parameter grid
params = {'bsl_options': {
                    'method': ['als', 'sgd'],  # Alternating Least Squares (ALS) and Stochastic Gradient Descent (SGD)
                    'n_epochs': [5, 10, 20],   # Only used for SGD
                    'reg_u': [12, 15, 17],     # Regularization parameter for users, only for ALS
                    'reg_i': [5, 10, 15]       # Regularization parameter for items, only for ALS
                }}

gs_baseline = GridSearchCV(BaselineOnly, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_baseline.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_baseline.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_baseline.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_baseline.best_params["mae"]}')
print(f'MAE Best Score: {gs_baseline.best_score["mae"]}')



RMSE Best Parameters: {'bsl_options': {'method': 'als', 'n_epochs': 20, 'reg_u': 17, 'reg_i': 10}}
RMSE Best Score: 1.6144445100808766
MAE Best Parameters: {'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}}
MAE Best Score: 1.353831700063701


##### Applying the best parameters

In [36]:
# Train and test with the best found parameters
final_baseline = BaselineOnly(bsl_options=gs_baseline.best_params['rmse']['bsl_options'])
final_baseline.fit(trainset)
predictions = final_baseline.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)

Estimating biases using als...

Updated Accuracy: 
RMSE: 1.5632
MSE: 2.4434
MAE:  1.3155


1.3154831147218704

#### CoClustering

In [37]:
# Initialize the CoClustering algorithm
co_clustering = CoClustering()

# Cross validate with k-fold set to 5
cross_validate(co_clustering, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()
co_clustering.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(co_clustering.test(testset))
accuracy.mse(co_clustering.test(testset))
accuracy.mae(co_clustering.test(testset))

print('\nPredict Tests: ')
print(co_clustering.predict(1001, 2001))
print(co_clustering.predict(2001, 1001))
print(co_clustering.predict(1001, 88))
print(co_clustering.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
co_clustering.test(testset)[:2]

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5636  1.5138  1.6450  1.6063  1.6289  1.5915  0.0475  
MAE (testset)     1.2484  1.2353  1.3345  1.2928  1.3277  1.2877  0.0402  
Fit time          0.06    0.06    0.06    0.06    0.07    0.06    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 1.3942
MSE: 1.9438
MAE:  1.1258

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.45   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.60   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.60   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.60   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.448666256699652, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.7830212204141587, details={'was_impossible': False})]

##### Applying Grid Search

In [38]:
# Define the parameter grid
params = {
    'n_cltr_u': [3, 5, 7],  # Number of user clusters
    'n_cltr_i': [3, 5, 7],  # Number of item clusters
    'n_epochs': [20, 30, 40]  # Number of epochs
}

# Perform grid search
gs_co_clustering = GridSearchCV(CoClustering, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_co_clustering.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_co_clustering.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_co_clustering.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_co_clustering.best_params["mae"]}')
print(f'MAE Best Score: {gs_co_clustering.best_score["mae"]}')


RMSE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 7, 'n_epochs': 30}
RMSE Best Score: 1.447981870938624
MAE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 7, 'n_epochs': 30}
MAE Best Score: 1.1220073209366421


##### Applying the best parameters

In [39]:
# Train and test with the best found parameters
final_co_clustering = CoClustering(n_cltr_u=gs_co_clustering.best_params['rmse']['n_cltr_u'],
                                   n_cltr_i=gs_co_clustering.best_params['rmse']['n_cltr_i'],
                                   n_epochs=gs_co_clustering.best_params['rmse']['n_epochs'])
final_co_clustering.fit(trainset)
predictions = final_co_clustering.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)


Updated Accuracy: 
RMSE: 1.2505
MSE: 1.5637
MAE:  0.9784


0.9783782835965246