### Importing libraries

In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader 
from scipy.stats import zscore
from sklearn import metrics
import random
from surprise.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

#accessing the model and memory based techniques
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore 
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import CoClustering

#accessing the similarity metrics
from math import sqrt
from surprise import accuracy
from surprise.model_selection.validation import cross_validate

### Accessing the data

In [None]:
#defining the file path 
file_path = './cleanedDatasets/cleaned_user_ratings.csv'

#reading the data using pandas library
user_ratings_df = pd.read_csv(file_path, encoding= 'unicode_escape')
user_ratings_df

### Preparing the data in a suitable format

In [None]:
#creating a reader object
#line format defines the order to match with the file - since reading with pandas not needed
#sep is the comma in this case since it is a csv file - since reading with pandas not needed
#rating_scale it is a tuple defining the lowest and highest possible range, if a number is not included in the tuple it will be ignored. 
reader = Reader(rating_scale=(0, 5))

#importing the data in a fromat surprise to be able to work with it - by loading it from the pandas data frame
#data = Dataset.load_from_df(user_ratings_df[['course_id','user_id','rating']], reader=reader) #------> did not work as the order impacted the functionality  for the KNNBasic user-based 
data = Dataset.load_from_df(user_ratings_df[['user_id','course_id','rating']], reader=reader)

In [None]:
data

##### Splitting the data 

In [None]:
raw_ratings_data = data.raw_ratings #lists the raw ratings in the following order: user_id, course_id, rating, and the timestamp

In [None]:
data.raw_ratings

In [None]:
#before splitting the data it would be ideal to shuffle 
random.shuffle(raw_ratings_data) 

In [None]:
data.raw_ratings #to confirm that the data is shuffled properly, it was compared to the cleaned user rating csv file - note the order of columns is different of course than the orignal file

In [None]:
#20% assigned to the testing set 
#train_data_raw, test_data_raw = train_test_split(raw_ratings_data, test_size=0.2) # -------> did not work when trying to use the predict on the test set for the KNN Basic method since there were too many values to unpack
ratio = int(len(raw_ratings_data)*0.8)

#assigning data to the training set
train_data_raw = raw_ratings_data[:ratio] 

#assigning data to the testing set
test_data_raw = raw_ratings_data[ratio:] 

# using the entire data to set the training dataset
data.raw_ratings = train_data_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_data_raw)

In [None]:
# Calculate the lengths (i.e., the number of data points) of the training and testing sets
train_data_length = len(train_data_raw)
test_data_length = len(test_data_raw)

# Print out the sizes of the training and testing data sets
print(f"Number of data points in training set: {train_data_length}")
print(f"Number of data points in testing set: {test_data_length}")

#showing this as a percentage of the total data
total_data_length = train_data_length + test_data_length
train_data_percentage = (train_data_length / total_data_length) * 100
test_data_percentage = (test_data_length / total_data_length) * 100

print(f"Percentage of data in training set: {train_data_percentage:.2f}%")
print(f"Percentage of data in testing set: {test_data_percentage:.2f}%")

### Defining the different techniques for collaborative filtering to determine which one is best

In [None]:
#create dict for different memory-based  and model based recommendation algorithms
recommendation_algorithms =[KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(), SVD(), BaselineOnly(), CoClustering() ] 
results = {} #to store the scores

### Cross-validation performed on all recommendation algorithms

Hyper parameters of cross validation method:

- cv - defines the type of folding for the model we want to use
- n-jobs - determines the number of folds evaluated in parallel. 

In [None]:
for algorithm in recommendation_algorithms:
    #kfold set to 5
    #using three different metrics for evaluation: Mean Absolute Error ,Mean Squared Error, Root Mean Squared Error
    crossval_scores = cross_validate(algorithm, data, measures=["MAE", "MSE", "RMSE"], cv=5, n_jobs=-1)  
    
    #saving and renaming appropraitely
    result = pd.DataFrame.from_dict(crossval_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_mse': 'MSE', 'test_rmse': 'RMSE', 'fit_time': 'Fit Time', 'test_time': 'Test Time'})
    results[str(algorithm).split("algorithms.")[1].split("object ")[0]] = result
    
#printing all models results
all_models = pd.DataFrame.from_dict(results)
all_models.T.sort_values(by='RMSE') #models sorted by RMSE


### Testing with different parameters

 Preparing the dataset to be used in a way that cross-validation can be applied directly on the entire dataset without the need for a separate test set initially defined 

In [None]:
#using cross-validation to help us test how well the model works accross different subsets of the data
trainset = data.build_full_trainset() #using all the ratings

#finding out how many users and courses we have in the dataset
print('Number of users: ', trainset.n_users, '\n') #20 users that rated
print('Number of items: ', trainset.n_items, '\n') #424 number of courses

#### Hyperparameters for the KNN models

- K parameter - indicates the the upper limit of similar items we want the algorithm to consider. For example in this case if the user rated 21 courses, but k is set to 5, when estimating a rating of a new course which has not yet been rated, only 5 out of the 21 courses that are the closest to the new course will be considered. 

- Similarity option, that defines the way to calculate it. All the similarity functions will return a number between 0 and 1 to a specific (i, j) item pair. 1 means the ratings are perfectly aligned, 0 means there is no connection between the two items. 

Three similarity metrics in the surprise similarity module: cosine, msd,pearson.

Both User-Based an Item-Based are implemented.

User-based collaborative filtering: a technique that predicts what the user might like based on the basis of ratings provided to the items by other users who have similar tastes with the target user. 

Item-based collaborative filtering: is a way to suggest things like products by looking at how similar two items are. Instead of comparing what the items are like (their features or properties), it looks at how users feel about the items—meaning, if people who like one item also tend to like another, those items are considered similar



#### KNNBasic

##### KNNBasic: User-Based Cosine

In [None]:
#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

##### KNNBasic: Item-Based Cosine

In [None]:
# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting

# params = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between users - False since we want to compare items
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset) #--------------> had to comment this entire section as ZeroDivisionError was displayed

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

##### KNNBasic: User-Based MSD 

In [None]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based MSD similarity
params = {
    "name": "msd",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

##### KNNBasic: Item-Based MSD

In [98]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#not using user-based MSD similarity
params = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

MSE: 0.5346
MAE:  0.4658

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.61   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.6108100686103803, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.0176908000386153, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: User-Based Pearson

In [99]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based pearson similarity
params = {
    "name": "pearson",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.0728
MSE: 1.1509
MAE:  0.8611

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.71   {'actual_k': 10, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.7094150107006367, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.8433401826655023, details={'actual_k': 12, 'was_impossible': False})]

##### KNNBasic: Item-Based Pearson

In [100]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#not using user-based pearson similarity
params = {
    "name": "pearson",
    "user_based": False,  # Compute  similarities between users
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.9069
MSE: 0.8225
MAE:  0.6092

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.74   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.7447618121212556, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.690543418083259, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: Cosine, User-Based, Shrinkage

In [101]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "cosine",
    "user_based": True,
    "shrinkage": 0 #no shrinkage - might not effect since using consine and only impacts pearson
}
algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the cosine similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.5222
MSE: 2.3170
MAE:  1.2657

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.28   {'actual_k': 17, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=1.2817070940042048, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.2394895556575192, details={'actual_k': 17, 'was_impossible': False})]

In [103]:
#-------> Commented due to the zero division error

# #build train and train train set
# trainset = data.build_full_trainset()

# #fitting
# #using user-based cosine similarity
# params = {
#     "name": "cosine",
#     "user_based": False,
#     "shrinkage": 0 #no shrinkage - might not effect since using consine and only impacts pearson
# }
# algo = KNNBasic(sim_options = params)
# algo.fit(trainset)

# #test the test set using .test() 
# print('\nAccuracy on the testset:')
# accuracy.rmse(algo.test(testset))
# accuracy.mse(algo.test(testset))
# accuracy.mae(algo.test(testset))

# print('\nPredict Tests: ')
# print(algo.predict(1001, 2001))
# print(algo.predict(2001, 1001))
# print(algo.predict(1001, 88))
# print(algo.predict(1001, 5))

# print('\nPredict Using TestSet list: ')
# testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
# algo.test(testset)[:2]

##### KNNBasic: MSD, User-Based, Shrinkage

In [104]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": True,
    "shrinkage": 0 #shrinkage is 0 -indicates that no shrinkage adjustment is being applied in this configuration.
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2400
MSE: 1.5377
MAE:  1.0191

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.12   {'actual_k': 17, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.1188626723939823, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.6663210738856766, details={'actual_k': 17, 'was_impossible': False})]

In [105]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "msd",
    "user_based": False,
    "shrinkage": 0 #shrinkage is 0 -indicates that no shrinkage adjustment is being applied in this configuration.
     # However, it's important to note that shrinkage is specifically mentioned as relevant for "pearson_baseline" similarity. 
     # the "shrinkage" setting here might not have any effect because it's not applicable to the "msd" similarity measure. 
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7312
MSE: 0.5346
MAE:  0.4658

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.61   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.6108100686103803, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.0176908000386153, details={'actual_k': 40, 'was_impossible': False})]

##### KNNBasic: Pearson, User-Based, Shrinkage

In [106]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "pearson",
    "user_based": True,
    "shrinkage": 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.0728
MSE: 1.1509
MAE:  0.8611

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.71   {'actual_k': 10, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.7094150107006367, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.8433401826655023, details={'actual_k': 12, 'was_impossible': False})]

In [107]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
params = {
    "name": "pearson",
    "user_based": False,
    "shrinkage": 0
}

algo = KNNBasic(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.9069
MSE: 0.8225
MAE:  0.6092

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 3.74   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=3.7447618121212556, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=3.690543418083259, details={'actual_k': 40, 'was_impossible': False})]

##### Applying GridSearch

In [108]:
params_grid = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

grid_search_KNNBasic = GridSearchCV(KNNBasic, params_grid, measures=['RMSE','MAE'], cv=5, n_jobs=-1)                               
grid_search_KNNBasic.fit(data)

print(f'\nRMSE Best Parameters: {grid_search_KNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score: {grid_search_KNNBasic.best_score["rmse"]}')
print(f'MAE Best Parameters: {grid_search_KNNBasic.best_params["mae"]}')
print(f'MAE Best Score: {grid_search_KNNBasic.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.1043283671444724
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.7989003248565295


Applying the hyperparameters found in grid_search_KNNBasic 

In [109]:
finalKBL = KNNBaseline(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8)
predictions = finalKBL.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2115
MSE: 1.4678
MAE:  0.9924


0.9924226168619393

#### KNN With K-Means

In [110]:
algo = KNNWithMeans()

#applying cross validation 
#kfold set to 5
cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4889  1.5687  1.6056  1.5478  1.5464  1.5515  0.0379  
MAE (testset)     1.2213  1.2792  1.3363  1.2697  1.2825  1.2778  0.0366  
Fit time          0.00    0.01    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.01    0.02    0.01    0.02    0.02    0.01    
Computing the msd similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 1.2113
MSE: 1.4673
MAE:  0.9920

Predict Tests: 
user: 1001       item: 2001       r_ui = No

[Prediction(uid=1001, iid=2001, r_ui=5, est=2.0235904211969973, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.882092609303578, details={'actual_k': 17, 'was_impossible': False})]

##### KNNWithMeans: User-based, Cosine 

In [111]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": True,  # Compute  similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based Cosine 

In [112]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based cosine similarity
parans = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: User-based MSD 

In [113]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using user-based msd similarity
parans = {
    "name": "msd",
    "user_based": True,  # Compute similarities between users
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based MSD 

In [114]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "msd",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: User-based Pearson 

In [115]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "pearson",
    "user_based": True,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans: Item-based Pearson 

In [116]:
#build train and train train set
trainset = data.build_full_trainset()

#fitting
#using item-based msd similarity
parans = {
    "name": "pearson",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options = params)
algo.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(algo.test(testset))
accuracy.mse(algo.test(testset))
accuracy.mae(algo.test(testset))

print('\nPredict Tests: ')
print(algo.predict(1001, 2001))
print(algo.predict(2001, 1001))
print(algo.predict(1001, 88))
print(algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
algo.test(testset)[:2]

Computing the pearson similarity matrix...
Done computing similarity matrix.

Accuracy on the testset:
RMSE: 0.7682
MSE: 0.5902
MAE:  0.5206

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 2.94   {'actual_k': 40, 'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=2.9405691167903187, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.8070152851051704, details={'actual_k': 40, 'was_impossible': False})]

##### KNNWithMeans - Applying the Grid Search

In [117]:
#doing this with name is msd or cosine causes float or zero division error
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "min_support": [3, 5, 8],
                          "user_based": [True, False]}}

gsKWM = GridSearchCV(KNNWithMeans, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKWM.fit(data)

print(f'\nRMSE Best Parameters: {gsKWM.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKWM.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKWM.best_params["mae"]}')
print(f'MAE Best Score: {gsKWM.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
RMSE Best Score: 1.127189199412474
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'min_support': 3}}
MAE Best Score: 0.8544039212682488


##### Applying the best hyperparameters for it 

In [118]:
finalKWM = KNNWithMeans(method = "als", n_epochs = 5, user_based = False, min_support = 3)
predictions = finalKWM.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2113
MSE: 1.4673
MAE:  0.9920


0.9919570392285494

#### KNN With Z Score

##### Applying the Grid Search

In [119]:
params = {'sim_options' : {'method': ['als','sgd'],
                            "n_epochs" : [5, 10, 20],
                            "user_based": [True, False],
                            "reg_u": [10, 12, 15],
                            "reg_i": [8, 10, 12],
                            "learning_rate": [0.0005, 0.05]}}

gsKZS = GridSearchCV(KNNWithZScore, params, measures=['mae', 'rmse'], cv=5, n_jobs=-1)                               
gsKZS.fit(data)

print(f'\nRMSE Best Parameters: {gsKZS.best_params["rmse"]}')
print(f'RMSE Best Score: {gsKZS.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsKZS.best_params["mae"]}')
print(f'MAE Best Score: {gsKZS.best_score["mae"]}')


RMSE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
RMSE Best Score: 1.1424787636810838
MAE Best Parameters: {'sim_options': {'method': 'als', 'n_epochs': 5, 'user_based': False, 'reg_u': 10, 'reg_i': 8, 'learning_rate': 0.0005}}
MAE Best Score: 0.8623094987811225


##### Applying the best parameters found

In [120]:
finalKZS = KNNWithZScore(method = "als", n_epochs = 5, user_based = False, reg_u = 10, reg_i = 8, learning_rate = 0.0005)
predictions = finalKZS.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.

Updated Accuracy: 
RMSE: 1.2115
MSE: 1.4678
MAE:  0.9906


0.990601580259656

#### SVD 

In [121]:
#algo
svd = SVD()

#cross validate with kfold set to 5
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

#build train and train train set
trainset = data.build_full_trainset()
svd.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(svd.test(testset))
accuracy.mse(svd.test(testset))
accuracy.mae(svd.test(testset))

print('\nPredict Tests: ')
print(svd.predict(1001, 2001))
print(svd.predict(2001, 1001))
print(svd.predict(1001, 88))
print(svd.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
svd.test(testset)[:2]

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3915  1.3798  1.3502  1.3108  1.3602  1.3585  0.0279  
MAE (testset)     1.1370  1.1261  1.1048  1.0582  1.1021  1.1056  0.0271  
Fit time          0.04    0.05    0.04    0.04    0.03    0.04    0.00    
Test time         0.00    0.00    0.01    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 0.7829
MSE: 0.6129
MAE:  0.5602

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 4.16   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.46   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.46   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=4.163384803765208, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=2.6658467100957433, details={'was_impossible': False})]

##### Applying Grid Search

In [122]:
params = {"n_factors": range(10, 100, 20),
         "n_epochs": [5, 10, 20],
         "lr_all": [0.002, 0.005],
         "reg_all": [0.2, 0.5]}

gsSVD = GridSearchCV(SVD, params, measures = ["RMSE", "MAE"], cv = 5, n_jobs = -1)
gsSVD.fit(data)

print(f'\nRMSE Best Parameters: {gsSVD.best_params["rmse"]}')
print(f'RMSE Best Score: {gsSVD.best_score["rmse"]}')
print(f'MAE Best Parameters: {gsSVD.best_params["mae"]}')
print(f'MAE Best Score: {gsSVD.best_score["mae"]}')


RMSE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score: 1.4897250568684477
MAE Best Parameters: {'n_factors': 90, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score: 1.2522848567848455


##### Applying the best parameters

In [123]:
finalSVD = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
predictions = finalSVD.fit(trainset).test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions)
accuracy.mae(predictions)


Updated Accuracy: 
RMSE: 1.3116
MSE: 1.7203
MAE:  1.0971


1.0970660912780203

#### BaselineOnly

In [124]:
# Initialize the BaselineOnly algorithm
baseline_algo = BaselineOnly()

# Cross validate with k-fold set to 5
cross_validate(baseline_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


trainset = data.build_full_trainset()
baseline_algo.fit(trainset)


#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(baseline_algo.test(testset))
accuracy.mse(baseline_algo.test(testset))
accuracy.mae(baseline_algo.test(testset))

print('\nPredict Tests: ')
print(baseline_algo.predict(1001, 2001))
print(baseline_algo.predict(2001, 1001))
print(baseline_algo.predict(1001, 88))
print(baseline_algo.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
baseline_algo.test(testset)[:2]


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6191  1.5869  1.6064  1.6211  1.6205  1.6108  0.0131  
MAE (testset)     1.3697  1.3303  1.3563  1.3599  1.3668  1.3566  0.0140  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Estimating biases using als...

Accuracy on the testset:
RMSE: 1.5636
MSE: 2.4448
MAE:  1.3167

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 1.57   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.45   {'was_impossible': False}
user: 1001       item: 5          

[Prediction(uid=1001, iid=2001, r_ui=5, est=1.5680489988180235, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.899560692781614, details={'was_impossible': False})]

##### Applying the Grid Search

In [125]:
# Define the parameter grid
params = {'bsl_options': {
                    'method': ['als', 'sgd'],  # Alternating Least Squares (ALS) and Stochastic Gradient Descent (SGD)
                    'n_epochs': [5, 10, 20],   # Only used for SGD
                    'reg_u': [12, 15, 17],     # Regularization parameter for users, only for ALS
                    'reg_i': [5, 10, 15]       # Regularization parameter for items, only for ALS
                }}

gs_baseline = GridSearchCV(BaselineOnly, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_baseline.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_baseline.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_baseline.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_baseline.best_params["mae"]}')
print(f'MAE Best Score: {gs_baseline.best_score["mae"]}')



RMSE Best Parameters: {'bsl_options': {'method': 'als', 'n_epochs': 20, 'reg_u': 17, 'reg_i': 10}}
RMSE Best Score: 1.6099482365290523
MAE Best Parameters: {'bsl_options': {'method': 'sgd', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}}
MAE Best Score: 1.3489720840445423


##### Applying the best parameters

In [126]:
# Train and test with the best found parameters
final_baseline = BaselineOnly(bsl_options=gs_baseline.best_params['rmse']['bsl_options'])
final_baseline.fit(trainset)
predictions = final_baseline.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)

Estimating biases using als...

Updated Accuracy: 
RMSE: 1.5636
MSE: 2.4449
MAE:  1.3168


1.3167988648114093

#### CoClustering

In [127]:
# Initialize the CoClustering algorithm
co_clustering = CoClustering()

# Cross validate with k-fold set to 5
cross_validate(co_clustering, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()
co_clustering.fit(trainset)

#test the test set using .test() 
print('\nAccuracy on the testset:')
accuracy.rmse(co_clustering.test(testset))
accuracy.mse(co_clustering.test(testset))
accuracy.mae(co_clustering.test(testset))

print('\nPredict Tests: ')
print(co_clustering.predict(1001, 2001))
print(co_clustering.predict(2001, 1001))
print(co_clustering.predict(1001, 88))
print(co_clustering.predict(1001, 5))

print('\nPredict Using TestSet list: ')
testset = [data.df.loc[i].to_list() for i in range(len(data.df))]
co_clustering.test(testset)[:2]

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5693  1.5076  1.6249  1.5385  1.5607  1.5602  0.0387  
MAE (testset)     1.2587  1.2214  1.3169  1.2286  1.2595  1.2570  0.0337  
Fit time          0.08    0.07    0.07    0.07    0.08    0.07    0.00    
Test time         0.00    0.01    0.01    0.00    0.00    0.00    0.00    

Accuracy on the testset:
RMSE: 1.4870
MSE: 2.2111
MAE:  1.2269

Predict Tests: 
user: 1001       item: 2001       r_ui = None   est = 0.76   {'was_impossible': False}
user: 2001       item: 1001       r_ui = None   est = 2.58   {'was_impossible': False}
user: 1001       item: 88         r_ui = None   est = 2.58   {'was_impossible': False}
user: 1001       item: 5          r_ui = None   est = 2.58   {'was_impossible': False}

Predict Using TestSet list: 


[Prediction(uid=1001, iid=2001, r_ui=5, est=0.7607440158143985, details={'was_impossible': False}),
 Prediction(uid=1002, iid=2001, r_ui=3, est=1.9303108093404608, details={'was_impossible': False})]

##### Applying Grid Search

In [128]:
# Define the parameter grid
params = {
    'n_cltr_u': [3, 5, 7],  # Number of user clusters
    'n_cltr_i': [3, 5, 7],  # Number of item clusters
    'n_epochs': [20, 30, 40]  # Number of epochs
}

# Perform grid search
gs_co_clustering = GridSearchCV(CoClustering, params, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1)
gs_co_clustering.fit(data)

# Print best scores
print(f'\nRMSE Best Parameters: {gs_co_clustering.best_params["rmse"]}')
print(f'RMSE Best Score: {gs_co_clustering.best_score["rmse"]}')
print(f'MAE Best Parameters: {gs_co_clustering.best_params["mae"]}')
print(f'MAE Best Score: {gs_co_clustering.best_score["mae"]}')


RMSE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 5, 'n_epochs': 30}
RMSE Best Score: 1.4597126915028142
MAE Best Parameters: {'n_cltr_u': 7, 'n_cltr_i': 7, 'n_epochs': 30}
MAE Best Score: 1.1401804212143225


##### Applying the best parameters

In [129]:
# Train and test with the best found parameters
final_co_clustering = CoClustering(n_cltr_u=gs_co_clustering.best_params['rmse']['n_cltr_u'],
                                   n_cltr_i=gs_co_clustering.best_params['rmse']['n_cltr_i'],
                                   n_epochs=gs_co_clustering.best_params['rmse']['n_epochs'])
final_co_clustering.fit(trainset)
predictions = final_co_clustering.test(testset)

print('\nUpdated Accuracy: ')
accuracy.rmse(predictions)
accuracy.mse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)


Updated Accuracy: 
RMSE: 1.2587
MSE: 1.5843
MAE:  0.9985


0.9984732575077113