### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader 
from scipy.stats import zscore
from sklearn import metrics
import random
from surprise.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#accessing the model and memory based techniques
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore 
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import CoClustering

#accessing the similarity metrics
from math import sqrt
from surprise import accuracy
from surprise.model_selection.validation import cross_validate

### Accessing the data

In [2]:
#defining the file path 
file_path = './cleanedDatasets/cleaned_user_ratings.csv'

#reading the data using pandas library
user_ratings_df = pd.read_csv(file_path, encoding= 'unicode_escape')
user_ratings_df

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2
...,...,...,...
8472,2424,1016,5
8473,2424,1017,4
8474,2424,1018,1
8475,2424,1019,2


### Preparing the data in a suitable format

In [3]:
#creating a reader object
#line format defines the order to match with the file - since reading with pandas not needed
#sep is the comma in this case since it is a csv file - since reading with pandas not needed
#rating_scale it is a tuple defining the lowest and highest possible range, if a number is not included in the tuple it will be ignored. 
reader = Reader(rating_scale=(0, 5))

#importing the data in a fromat surprise to be able to work with it - by loading it from the pandas data frame
#data = Dataset.load_from_df(user_ratings_df[['course_id','user_id','rating']], reader=reader) #------> did not work as the order impacted the functionality  for the KNNBasic user-based 
data = Dataset.load_from_df(user_ratings_df[['user_id','course_id','rating']], reader=reader)

In [4]:
data

<surprise.dataset.DatasetAutoFolds at 0x221310aa470>

##### Splitting the data 

In [5]:
raw_ratings_data = data.raw_ratings #lists the raw ratings in the following order: user_id, course_id, rating, and the timestamp

In [6]:
data.raw_ratings

[(1001, 2001, 5.0, None),
 (1002, 2001, 3.0, None),
 (1003, 2001, 1.0, None),
 (1004, 2001, 0.0, None),
 (1005, 2001, 2.0, None),
 (1006, 2001, 1.0, None),
 (1007, 2001, 0.0, None),
 (1008, 2001, 0.0, None),
 (1009, 2001, 0.0, None),
 (1010, 2001, 0.0, None),
 (1011, 2001, 3.0, None),
 (1012, 2001, 0.0, None),
 (1013, 2001, 0.0, None),
 (1014, 2001, 4.0, None),
 (1015, 2001, 1.0, None),
 (1016, 2001, 0.0, None),
 (1017, 2001, 2.0, None),
 (1018, 2001, 0.0, None),
 (1019, 2001, 1.0, None),
 (1020, 2001, 3.0, None),
 (1001, 2002, 3.0, None),
 (1002, 2002, 5.0, None),
 (1003, 2002, 0.0, None),
 (1004, 2002, 2.0, None),
 (1005, 2002, 1.0, None),
 (1006, 2002, 0.0, None),
 (1007, 2002, 4.0, None),
 (1008, 2002, 5.0, None),
 (1009, 2002, 0.0, None),
 (1010, 2002, 1.0, None),
 (1011, 2002, 0.0, None),
 (1012, 2002, 3.0, None),
 (1013, 2002, 2.0, None),
 (1014, 2002, 0.0, None),
 (1015, 2002, 5.0, None),
 (1016, 2002, 2.0, None),
 (1017, 2002, 0.0, None),
 (1018, 2002, 3.0, None),
 (1019, 2002

In [7]:
#before splitting the data it would be ideal to shuffle 
random.shuffle(raw_ratings_data) 

In [8]:
data.raw_ratings #to confirm that the data is shuffled properly, it was compared to the cleaned user rating csv file - note the order of columns is different of course than the orignal file

[(1009, 2400, 2.0, None),
 (1015, 2195, 5.0, None),
 (1019, 2227, 0.0, None),
 (1019, 2039, 0.0, None),
 (1016, 2414, 5.0, None),
 (1008, 2187, 0.0, None),
 (1007, 2250, 4.0, None),
 (1019, 2101, 0.0, None),
 (1016, 2213, 4.0, None),
 (1020, 2071, 2.0, None),
 (1016, 2053, 4.0, None),
 (1018, 2027, 4.0, None),
 (1019, 2020, 3.0, None),
 (1007, 2330, 5.0, None),
 (1012, 2163, 5.0, None),
 (1002, 2191, 5.0, None),
 (1020, 2168, 3.0, None),
 (1016, 2259, 5.0, None),
 (1007, 2090, 4.0, None),
 (1013, 2031, 1.0, None),
 (1011, 2017, 3.0, None),
 (1013, 2311, 1.0, None),
 (1002, 2063, 4.0, None),
 (1002, 2354, 1.0, None),
 (1008, 2301, 0.0, None),
 (1015, 2188, 5.0, None),
 (1015, 2138, 0.0, None),
 (1018, 2344, 0.0, None),
 (1019, 2367, 4.0, None),
 (1018, 2186, 5.0, None),
 (1012, 2089, 0.0, None),
 (1015, 2010, 0.0, None),
 (1002, 2046, 4.0, None),
 (1005, 2268, 5.0, None),
 (1003, 2230, 3.0, None),
 (1020, 2103, 4.0, None),
 (1011, 2400, 4.0, None),
 (1013, 2072, 2.0, None),
 (1013, 2411

In [9]:
#20% assigned to the testing set 
#train_data_raw, test_data_raw = train_test_split(raw_ratings_data, test_size=0.2) # -------> did not work when trying to use the predict on the test set for the KNN Basic method since there were too many values to unpack
ratio = int(len(raw_ratings_data)*0.8)

#assigning data to the training set
train_data_raw = raw_ratings_data[:ratio] 

#assigning data to the testing set
test_data_raw = raw_ratings_data[ratio:] 

# using the entire data to set the training dataset
data.raw_ratings = train_data_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_data_raw)

In [10]:
# Calculate the lengths (i.e., the number of data points) of the training and testing sets
train_data_length = len(train_data_raw)
test_data_length = len(test_data_raw)

# Print out the sizes of the training and testing data sets
print(f"Number of data points in training set: {train_data_length}")
print(f"Number of data points in testing set: {test_data_length}")

#showing this as a percentage of the total data
total_data_length = train_data_length + test_data_length
train_data_percentage = (train_data_length / total_data_length) * 100
test_data_percentage = (test_data_length / total_data_length) * 100

print(f"Percentage of data in training set: {train_data_percentage:.2f}%")
print(f"Percentage of data in testing set: {test_data_percentage:.2f}%")

Number of data points in training set: 6781
Number of data points in testing set: 1696
Percentage of data in training set: 79.99%
Percentage of data in testing set: 20.01%


### Defining the different techniques for collaborative filtering to determine which one is best

In [11]:
#create dict for different memory-based  and model based recommendation algorithms
recommendation_algorithms =[KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(), SVD(), BaselineOnly(), CoClustering() ] 
results = {} #to store the scores

### Cross-validation performed on all recommendation algorithms

Hyper parameters of cross validation method:

- cv - defines the type of folding for the model we want to use
- n-jobs - determines the number of folds evaluated in parallel. 

In [12]:
for algorithm in recommendation_algorithms:
    #kfold set to 5
    #using three different metrics for evaluation: Mean Absolute Error ,Mean Squared Error, Root Mean Squared Error
    crossval_scores = cross_validate(algorithm, data, measures=["MAE", "MSE", "RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_mse': 'MSE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(algorithm).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE


Unnamed: 0,MAE,MSE,RMSE,Fit Time,Test Time
matrix_factorization.SVD,1.115124,1.879281,1.370672,0.04121,0.004213
co_clustering.CoClustering,1.274005,2.447581,1.563705,0.085057,0.004377
knns.KNNWithMeans,1.299138,2.456036,1.567052,0.00322,0.019569
knns.KNNWithZScore,1.293743,2.457534,1.567517,0.004249,0.024584
knns.KNNBaseline,1.304519,2.470407,1.57164,0.00562,0.025855
knns.KNNBasic,1.331588,2.554887,1.598246,0.001929,0.011898
baseline_only.BaselineOnly,1.371905,2.640264,1.624826,0.002049,0.002273


### Testing with different parameters

 Preparing the dataset to be used in a way that cross-validation can be applied directly on the entire dataset without the need for a separate test set initially defined 

In [13]:
#using cross-validation to help us test how well the model works accross different subsets of the data
trainset = data.build_full_trainset() #using all the ratings

#finding out how many users and courses we have in the dataset
print('Number of users: ', trainset.n_users, '\n') #20 users that rated
print('Number of items: ', trainset.n_items, '\n') #424 number of courses

Number of users:  20 

Number of items:  424 



#### Hyperparameters for the KNN models

- K parameter - indicates the the upper limit of similar items we want the algorithm to consider. For example in this case if the user rated 21 courses, but k is set to 5, when estimating a rating of a new course which has not yet been rated, only 5 out of the 21 courses that are the closest to the new course will be considered. 

- Similarity option, that defines the way to calculate it. All the similarity functions will return a number between 0 and 1 to a specific (i, j) item pair. 1 means the ratings are perfectly aligned, 0 means there is no connection between the two items. 

Three similarity metrics in the surprise similarity module: cosine, msd,pearson.

Both User-Based an Item-Based are implemented.

User-based collaborative filtering: a technique that predicts what the user might like based on the basis of ratings provided to the items by other users who have similar tastes with the target user. 

Item-based collaborative filtering: is a way to suggest things like products by looking at how similar two items are. Instead of comparing what the items are like (their features or properties), it looks at how users feel about the items—meaning, if people who like one item also tend to like another, those items are considered similar



#### KNNBasic

##### KNNBasic: User-Based Cosine

In [14]:
#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the cosine similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.5895
MSE: 2.5266
MAE:  1.3216

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.33   {'actual_k': 18, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.3262624876791738, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.3154513342436438, details={'actual_k': 18, 'was_impossible': False})]

##### KNNBasic: Item-Based Cosine

In [15]:
# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting

# params = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between users - False since we want to compare items
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset) #--------------> had to comment this entire section as ZeroDivisionError was displayed

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

##### KNNBasic: User-Based MSD 

In [16]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based MSD similarity
params = {
    "name": "msd",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2212
MSE: 1.4913
MAE:  1.0102

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.13   {'actual_k': 18, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.1331723380138086, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.7382409723167738, details={'actual_k': 18, 'was_impossible': False})]

##### KNNBasic: Item-Based MSD

In [17]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#not using user-based MSD similarity
params = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7228
MSE: 0.5224
MAE:  0.4587

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.34   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.340087044653428, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.1024200734967815, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: User-Based Pearson

In [18]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based pearson similarity
params = {
    "name": "pearson",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.0584
MSE: 1.1202
MAE:  0.8499

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.65   {'actual_k': 11, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.6489998437865436, details={'actual_k': 11, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.0582580856955324, details={'actual_k': 13, 'was_impossible': False})]

##### KNNBasic: Item-Based Pearson

In [19]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#not using user-based pearson similarity
params = {
    "name": "pearson",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.9013
MSE: 0.8123
MAE:  0.6034

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.38   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.38047614304109, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.4009505926054024, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: Cosine, User-Based, Shrinkage

In [20]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,
    "shrinkage": 0 #no shrinkage - might not effect since using consine and only impacts pearson
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the cosine similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.5165
MSE: 2.2999
MAE:  1.2653

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.33   {'actual_k': 18, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.3262624876791738, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.3154513342436438, details={'actual_k': 18, 'was_impossible': False})]

In [21]:
#-------> Commented due to the zero division error

# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting
# #using user-based cosine similarity
# params = {
#     "name": "cosine",
#     "user_based": False,
#     "shrinkage": 0 #no shrinkage - might not effect since using consine and only impacts pearson
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset)

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

##### KNNBasic: MSD, User-Based, Shrinkage

In [22]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,
    "shrinkage": 0 #shrinkage is 0 -indicates that no shrinkage adjustment is being applied in this configuration.
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2212
MSE: 1.4913
MAE:  1.0102

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.13   {'actual_k': 18, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.1331723380138086, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.7382409723167738, details={'actual_k': 18, 'was_impossible': False})]

In [23]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,
    "shrinkage": 0 #shrinkage is 0 -indicates that no shrinkage adjustment is being applied in this configuration.
     # However, it's important to note that shrinkage is specifically mentioned as relevant for "pearson_baseline" similarity. 
     # the "shrinkage" setting here might not have any effect because it's not applicable to the "msd" similarity measure. 
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7228
MSE: 0.5224
MAE:  0.4587

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.34   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.340087044653428, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.1024200734967815, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: Pearson, User-Based, Shrinkage

In [24]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "pearson",
    "user_based": True,
    "shrinkage": 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.0584
MSE: 1.1202
MAE:  0.8499

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.65   {'actual_k': 11, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.6489998437865436, details={'actual_k': 11, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.0582580856955324, details={'actual_k': 13, 'was_impossible': False})]

In [25]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "pearson",
    "user_based": False,
    "shrinkage": 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.9013
MSE: 0.8123
MAE:  0.6034

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.38   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.38047614304109, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.4009505926054024, details={'actual_k': 40, 'was_impossible': False})]

##### Applying GridSearch

In [26]:
params_grid = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

grid_search_KNNBasic = GridSearchCV(KNNBasic, params_grid, measures=['RMSE','MAE'], cv=5, n_jobs=-1)                               
grid_search_KNNBasic.fit(data)

print(f'\nRMSE Best Parameters: {grid_search_KNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score: {grid_search_KNNBasic.best_score["rmse"]}')
print(f'MAE Best Parameters: {grid_search_KNNBasic.best_params["mae"]}')
print(f'MAE Best Score: {grid_search_KNNBasic.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.1042515100057009
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.7943279311007767


Applying the hyperparameters found in grid_search_KNNBasic 

In [27]:
finalKBL = KNNBaseline(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8)
predictions = finalKBL.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.1963
MSE: 1.4311
MAE:  0.9825


0.9825126349377201

#### KNN With K-Means

In [28]:
algo = KNNWithMeans()

#applying cross validation 
#kfold set to 5
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5597  1.5803  1.5515  1.5314  1.5744  1.5595  0.0174  
MAE (testset)     1.2856  1.3264  1.2789  1.2612  1.3040  1.2912  0.0223  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.02    0.02    0.02    0.01    0.02    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.1964
MSE: 1.4313
MAE:  0.9822

Predict Tests: 
user: 1001       item: 2001       r_ui = No

[Prediction(uid=1001, iid=2001, r_ui=5, est=2.038762865442939, details={'actual_k': 18, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.930920525104305, details={'actual_k': 18, 'was_impossible': False})]

##### KNNWithMeans: User-based, Cosine 

In [29]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based Cosine 

In [30]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: User-based MSD 

In [31]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based msd similarity
parans = {
    "name": "msd",
    "user_based": True,  # Compute similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based MSD 

In [32]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "msd",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: User-based Pearson 

In [33]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "pearson",
    "user_based": True,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based Pearson 

In [34]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "pearson",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7550
MSE: 0.5700
MAE:  0.5105

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.62   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.622961775862307, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.720817421914526, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans - Applying the Grid Search

In [35]:
#doing this with name is msd or cosine causes float or zero division error
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

gsKWM = GridSearchCV(KNNWithMeans, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKWM.fit(data)

print(f'\nRMSE Best Parameters: {gsKWM.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKWM.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKWM.best_params["mae"]}')
print(f'MAE Best Score: {gsKWM.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.1319991800492675
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.8565875919478907


##### Applying the best hyperparameters for it 

In [36]:
finalKWM = KNNWithMeans(method = "als", n_epochs = 5, user_based = False, min_support = 3)
predictions = finalKWM.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.1964
MSE: 1.4313
MAE:  0.9822


0.982241529530695

#### KNN With Z Score

##### Applying the Grid Search

In [37]:
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "reg_u": [10, 12, 15],
                            "reg_i": [8, 10, 12],
                            "learning_rate": [0.0005, 0.05]}}

gsKZS = GridSearchCV(KNNWithZScore, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKZS.fit(data)

print(f'\nRMSE Best Parameters: {gsKZS.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKZS.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKZS.best_params["mae"]}')
print(f'MAE Best Score: {gsKZS.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
RMSE Best Score: 1.1428660312984367
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
MAE Best Score: 0.8550321322940837


##### Applying the best parameters found

In [38]:
finalKZS = KNNWithZScore(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8, learning_rate = 0.0005)
predictions = finalKZS.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.1971
MSE: 1.4331
MAE:  0.9814


0.9813673717439013

#### SVD 

In [39]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3529  1.3677  1.3637  1.3710  1.3701  1.3651  0.0066  
MAE (testset)     1.0848  1.1083  1.1155  1.1096  1.1240  1.1084  0.0131  
Fit time          0.04    0.03    0.04    0.03    0.04    0.04    0.00    
Test time         0.00    0.01    0.00    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 0.7726
MSE: 0.5969
MAE:  0.5531

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.99   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.47   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.47   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.9916817379008442, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.9092394919902826, details={'was_impossible': False})]

##### Applying Grid Search

In [40]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4988255239722648
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2671211514779466


##### Applying the best parameters

In [41]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.2889
MSE: 1.6613
MAE:  1.0816


1.0815847156821352

#### BaselineOnly

In [42]:
# Initialize the BaselineOnly algorithm
baseline_algo = BaselineOnly()

# Cross validate with k-fold set to 5
cross_validate(baseline_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


trainset = data.build_full_trainset()
baseline_algo.fit(trainset)


#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(baseline_algo.test(testset))
accuracy.mse(baseline_algo.test(testset))
accuracy.mae(baseline_algo.test(testset))

print('\nPredict Tests: ')
print(baseline_algo.predict(1001, 2001))
print(baseline_algo.predict(2001, 1001))
print(baseline_algo.predict(1001, 88))
print(baseline_algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
baseline_algo.test(testset)[:2]


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6421  1.5913  1.6301  1.6481  1.6368  1.6297  0.0201  
MAE (testset)     1.3910  1.3452  1.3738  1.3903  1.3872  1.3775  0.0173  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Estimating biases using als...

Accuracy on the testset:
RMSE: 1.5643
MSE: 2.4471
MAE:  1.3169

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.57   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.44   {'was_impossible': False}
user: 1001       item: 5          

[Prediction(uid=1001, iid=2001, r_ui=5, est=1.565684162916999, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.900575372977007, details={'was_impossible': False})]

##### Applying the Grid Search

In [43]:
# Define the parameter grid
params = {'bsl_options': {
                    'method': ['als', 'sgd'],  # Alternating Least Squares (ALS) and Stochastic Gradient Descent (SGD)
                    'n_epochs': [5, 10, 20],   # Only used for SGD
                    'reg_u': [12, 15, 17],     # Regularization parameter for users, only for ALS
                    'reg_i': [5, 10, 15]       # Regularization parameter for items, only for ALS
                }}

gs_baseline = GridSearchCV(BaselineOnly, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_baseline.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_baseline.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_baseline.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_baseline.best_params["mae"]}')
print(f'MAE Best Score: {gs_baseline.best_score["mae"]}')



RMSE Best Parameters: {'bsl_options': {'method': 'als', 'n_epochs': 5, 'reg_u': 17, 'reg_i': 10}}
RMSE Best Score: 1.629638521063325
MAE Best Parameters: {'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}}
MAE Best Score: 1.3689283780766222


##### Applying the best parameters

In [44]:
# Train and test with the best found parameters
final_baseline = BaselineOnly(bsl_options=gs_baseline.best_params['rmse']['bsl_options'])
final_baseline.fit(trainset)
predictions = final_baseline.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)

Estimating biases using als...

Updated Accuracy: 
RMSE: 1.5643
MSE: 2.4471
MAE:  1.3171


1.3170882119130976

#### CoClustering

In [45]:
# Initialize the CoClustering algorithm
co_clustering = CoClustering()

# Cross validate with k-fold set to 5
cross_validate(co_clustering, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()
co_clustering.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(co_clustering.test(testset))
accuracy.mse(co_clustering.test(testset))
accuracy.mae(co_clustering.test(testset))

print('\nPredict Tests: ')
print(co_clustering.predict(1001, 2001))
print(co_clustering.predict(2001, 1001))
print(co_clustering.predict(1001, 88))
print(co_clustering.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
co_clustering.test(testset)[:2]

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6686  1.6183  1.6501  1.5581  1.5551  1.6100  0.0465  
MAE (testset)     1.3587  1.2986  1.3446  1.2741  1.2486  1.3049  0.0415  
Fit time          0.08    0.06    0.06    0.06    0.07    0.07    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 1.3604
MSE: 1.8506
MAE:  1.0955

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.16   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.59   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.59   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.1603614468137753, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.7788625428095304, details={'was_impossible': False})]

##### Applying Grid Search

In [46]:
# Define the parameter grid
params = {
    'n_cltr_u': [3, 5, 7],  # Number of user clusters
    'n_cltr_i': [3, 5, 7],  # Number of item clusters
    'n_epochs': [20, 30, 40]  # Number of epochs
}

# Perform grid search
gs_co_clustering = GridSearchCV(CoClustering, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_co_clustering.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_co_clustering.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_co_clustering.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_co_clustering.best_params["mae"]}')
print(f'MAE Best Score: {gs_co_clustering.best_score["mae"]}')


RMSE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 5, 'n_epochs': 30}
RMSE Best Score: 1.4528156354354005
MAE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 7, 'n_epochs': 40}
MAE Best Score: 1.1334773020846165


##### Applying the best parameters

In [47]:
# Train and test with the best found parameters
final_co_clustering = CoClustering(n_cltr_u=gs_co_clustering.best_params['rmse']['n_cltr_u'],
                                   n_cltr_i=gs_co_clustering.best_params['rmse']['n_cltr_i'],
                                   n_epochs=gs_co_clustering.best_params['rmse']['n_epochs'])
final_co_clustering.fit(trainset)
predictions = final_co_clustering.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)


Updated Accuracy: 
RMSE: 1.2679
MSE: 1.6075
MAE:  1.0056


1.0055905489546748