## Libraries

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, HTML
from surprise import Dataset,accuracy,Reader,KNNBasic,KNNWithMeans,NormalPredictor,KNNBaseline,SVD,NMF,CoClustering,BaselineOnly,KNNWithZScore,SlopeOne,SVDpp
from surprise.model_selection import cross_validate,train_test_split,GridSearchCV
from surprise.accuracy import rmse
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 1)
display(HTML("<style>.container { width:80% !important; }</style>"))

## Reading the Data

In [18]:
model_df=pd.read_csv("processed_data.csv")

reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(model_df[['student_id', 'course_id', 'course_rating']], reader)

In [19]:
course_df=pd.DataFrame(model_df['course_name'].unique(),columns=["course_name"])
index = pd.Index(range(0, len(course_df.index), 1))
course_df["course_id"] = course_df.index

## Getting User Data

In [20]:
student_id=1583
trainSet = data.build_full_trainset()

## Create AntiTest set

In [21]:
anti_testset_user = []
inner_uid = trainSet.to_inner_uid(student_id)
targetUser = inner_uid #inner_id of the target user
user_item_ratings = trainSet.ur[targetUser]
fillValue = trainSet.global_mean
user_item_ratings = trainSet.ur[inner_uid]
user_items = [item for (item,_) in (user_item_ratings)]
user_items
#filter
ratings = trainSet.all_ratings()

for iid in trainSet.all_items():
    if(iid not in user_items):
        anti_testset_user.append((trainSet.to_raw_uid(targetUser),trainSet.to_raw_iid(iid),fillValue))

## Course Enrolled 

In [22]:
course_df.loc[course_df['course_id'].isin(user_items)]

Unnamed: 0,course_name,course_id
3,Artificial Intelligence,3
26,VEX Robotics Competition,26


# Distance Based Algos

In [23]:
sim_options = {'name': 'pearson_baseline',
                'user_based': True}

## i) KNN Basic

In [24]:
np.random.seed(1)
knn_basic_algo = KNNBasic(min_k=2,k=10,sim_options=sim_options)

cross_validate(knn_basic_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = knn_basic_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9820  0.9951  0.9924  0.9797  0.9798  0.9858  0.0066  
MAE (testset)     0.7824  0.7979  0.7851  0.7824  0.7880  0.7872  0.0058  
Fit time          0.45    0.45    0.46    0.45    0.46    0.45    0.00    
Test time         0.24    0.24    0.22    0.24    0.22  

Unnamed: 0,uid,iid,r_ui,est,details
2,1583,2,3.71832,4.0,"{'actual_k': 2, 'was_impossible': False}"
51,1583,53,3.71832,4.0,"{'actual_k': 2, 'was_impossible': False}"
1,1583,1,3.71832,3.875176,"{'actual_k': 10, 'was_impossible': False}"
40,1583,42,3.71832,3.72076,"{'was_impossible': True, 'reason': 'Not enough neighbors.'}"
29,1583,31,3.71832,3.72076,"{'was_impossible': True, 'reason': 'Not enough neighbors.'}"


##  ii) KNN with means

In [25]:
np.random.seed(1)
knn_means_algo = KNNWithMeans(min_k=3,sim_options=sim_options)

cross_validate(knn_means_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = knn_means_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1149  1.1198  1.1177  1.1011  1.0991  1.1105  0.0087  
MAE (testset)     0.8568  0.8616  0.8622  0.8733  0.8701  0.8648  0.0060  
Fit time          0.46    0.48    0.48    0.47    0.49    0.48    0.01    
Test time         0.27    0.27    0.24    0.29    0.

Unnamed: 0,uid,iid,r_ui,est,details
0,1583,0,3.71832,4.5,"{'actual_k': 0, 'was_impossible': False}"
28,1583,30,3.71832,4.5,"{'actual_k': 0, 'was_impossible': False}"
30,1583,32,3.71832,4.5,"{'actual_k': 0, 'was_impossible': False}"
31,1583,33,3.71832,4.5,"{'actual_k': 1, 'was_impossible': False}"
32,1583,34,3.71832,4.5,"{'actual_k': 0, 'was_impossible': False}"


##  iii) KNNBaseline

# Matrix Factorization Algos

## i) SVD

In [26]:
np.random.seed(1)
svd_algo = SVD(n_factors=150,n_epochs=5,lr_all=0.005,reg_all=0.1)
cross_validate(svd_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
predictions = svd_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8956  0.9291  0.9204  0.9076  0.9142  0.9134  0.0113  
MAE (testset)     0.7422  0.7624  0.7476  0.7399  0.7559  0.7496  0.0084  
Fit time          0.03    0.03    0.03    0.03    0.02    0.02    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
10,1583,11,3.71832,3.937566,{'was_impossible': False}
35,1583,37,3.71832,3.930273,{'was_impossible': False}
51,1583,53,3.71832,3.924604,{'was_impossible': False}
49,1583,51,3.71832,3.910719,{'was_impossible': False}
38,1583,40,3.71832,3.869327,{'was_impossible': False}


In [27]:
pred['err'] = abs(pred.est - pred.r_ui)
best_predictions = pred.sort_values(by='err').head(5)
best_predictions

Unnamed: 0,uid,iid,r_ui,est,details,err
41,1583,43,3.71832,3.715604,{'was_impossible': False},0.002716
30,1583,32,3.71832,3.723981,{'was_impossible': False},0.005661
4,1583,5,3.71832,3.725925,{'was_impossible': False},0.007605
44,1583,46,3.71832,3.710534,{'was_impossible': False},0.007786
19,1583,20,3.71832,3.709024,{'was_impossible': False},0.009296


# ii) SVDpp

In [28]:
np.random.seed(1)
svdpp_algo = SVDpp()
cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
predictions = svdpp_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9272  0.9492  0.9354  0.9267  0.9344  0.9346  0.0081  
MAE (testset)     0.7555  0.7699  0.7550  0.7483  0.7658  0.7589  0.0078  
Fit time          0.05    0.05    0.05    0.05    0.05    0.05    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
42,1583,44,3.71832,4.181222,{'was_impossible': False}
0,1583,0,3.71832,4.179845,{'was_impossible': False}
41,1583,43,3.71832,4.145347,{'was_impossible': False}
38,1583,40,3.71832,4.122619,{'was_impossible': False}
32,1583,34,3.71832,4.060287,{'was_impossible': False}


## CoClustering

In [29]:
np.random.seed(1)
svdpp_algo = CoClustering(n_cltr_u=10, n_cltr_i=8, random_state=None)
cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
predictions = svdpp_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1686  1.2793  1.2301  1.1832  1.1896  1.2102  0.0401  
MAE (testset)     0.9144  0.9950  0.9590  0.9420  0.9355  0.9492  0.0270  
Fit time          0.39    0.40    0.40    0.40    0.39    0.40    0.00    
Test time         0.00    0.01    0.01    0.01    0.01    0.01    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
12,1583,13,3.71832,5.0,{'was_impossible': False}
52,1583,54,3.71832,5.0,{'was_impossible': False}
40,1583,42,3.71832,5.0,{'was_impossible': False}
48,1583,50,3.71832,5.0,{'was_impossible': False}
47,1583,49,3.71832,5.0,{'was_impossible': False}


## SlopeOne

In [30]:
np.random.seed(1)
svdpp_algo = SlopeOne()
cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
predictions = svdpp_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1578  1.1471  1.1601  1.1436  1.1561  1.1529  0.0064  
MAE (testset)     0.9090  0.8977  0.9002  0.9141  0.9250  0.9092  0.0098  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
53,1583,55,3.71832,5.0,{'was_impossible': False}
40,1583,42,3.71832,5.0,{'was_impossible': False}
24,1583,25,3.71832,5.0,{'was_impossible': False}
29,1583,31,3.71832,5.0,{'was_impossible': False}
21,1583,22,3.71832,5.0,{'was_impossible': False}


## Benchmarks

## Hyper Parameter Tuning using GridSearchCV

## i) KNN Baseline With Parameters

In [31]:
knn_baseline_param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [1, 2],
    },
    'sim_options': {
        'min_support': [True],
        'user_based': [True],
        'min_k':[2]
    },
}

grid_search_knn_baseline = GridSearchCV(KNNBaseline,knn_baseline_param_grid, measures=['rmse','mae'], cv=5)
grid_search_knn_baseline.fit(data) 
print(grid_search_knn_baseline.best_params['rmse'])
print(grid_search_knn_baseline.best_score['rmse'])
print(grid_search_knn_baseline.best_score['mae'])
algo_grid_search_knn_baseline = grid_search_knn_baseline.best_estimator['rmse']
cross_validate(algo_grid_search_knn_baseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = algo_grid_search_knn_baseline.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

Unnamed: 0,uid,iid,r_ui,est,details
32,1583,34,3.71832,5.0,"{'actual_k': 1, 'was_impossible': False}"
12,1583,13,3.71832,4.468005,"{'actual_k': 2, 'was_impossible': False}"
34,1583,36,3.71832,4.345822,"{'actual_k': 5, 'was_impossible': False}"
44,1583,46,3.71832,4.333324,"{'actual_k': 6, 'was_impossible': False}"
26,1583,28,3.71832,4.311824,"{'actual_k': 12, 'was_impossible': False}"


## ii) KNNBasic With Parameters

In [32]:
knn_basic_params = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [1, 2],
    },
    'sim_options': {
        'name': ['msd', 'cosine'],
        'user_based': [True],
        'min_support': [True],

    },
}

grid_search_knn_basic = GridSearchCV(KNNBasic,knn_basic_params, measures=['rmse','mae'], cv=5)
grid_search_knn_basic.fit(data) 
print(grid_search_knn_basic.best_params['rmse'])
print(grid_search_knn_basic.best_score['rmse'])
print(grid_search_knn_basic.best_score['mae'])

algo_grid_search_knn_basic = grid_search_knn_basic.best_estimator['rmse']
cross_validate(algo_grid_search_knn_basic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = algo_grid_search_knn_basic.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix.

Unnamed: 0,uid,iid,r_ui,est,details
50,1583,52,3.71832,4.375,"{'actual_k': 8, 'was_impossible': False}"
0,1583,0,3.71832,4.307692,"{'actual_k': 13, 'was_impossible': False}"
47,1583,49,3.71832,4.25,"{'actual_k': 4, 'was_impossible': False}"
12,1583,13,3.71832,4.25,"{'actual_k': 4, 'was_impossible': False}"
21,1583,22,3.71832,4.125,"{'actual_k': 8, 'was_impossible': False}"


## iii) SVD with Parameters

In [33]:
svd_params = {'n_factors': [100,150],
              'n_epochs': [5,10,15],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}

grid_search_svd = GridSearchCV(SVD,svd_params,measures=['rmse','mae'], cv=5)
grid_search_svd.fit(data)
print(grid_search_svd.best_params['rmse'])
print(grid_search_svd.best_score['rmse'])
print(grid_search_svd.best_score['mae'])

algo_grid_search_svd = grid_search_svd.best_estimator['rmse']
cross_validate(algo_grid_search_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = algo_grid_search_svd.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

{'n_factors': 100, 'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.1}
0.9135234055481452
0.7455193141380174
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9275  0.8878  0.9296  0.9166  0.9022  0.9128  0.0158  
MAE (testset)     0.7626  0.7202  0.7582  0.7455  0.7470  0.7467  0.0148  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
22,1583,23,3.71832,3.978228,{'was_impossible': False}
32,1583,34,3.71832,3.939025,{'was_impossible': False}
20,1583,21,3.71832,3.930562,{'was_impossible': False}
35,1583,37,3.71832,3.914379,{'was_impossible': False}
53,1583,55,3.71832,3.898651,{'was_impossible': False}


## iv) SVDpp with Parameters

In [34]:
svdpp_params = {'n_factors': [100,150],
              'n_epochs': [5,10,15],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}

grid_search_svdpp = GridSearchCV(SVDpp,svdpp_params,measures=['rmse','mae'], cv=5)
grid_search_svdpp.fit(data)
print(grid_search_svdpp.best_params['rmse'])
print(grid_search_svdpp.best_score['rmse'])
print(grid_search_svdpp.best_score['mae'])

algo_grid_search_svdpp = grid_search_svdpp.best_estimator['rmse']
cross_validate(algo_grid_search_svdpp, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

predictions = algo_grid_search_svdpp.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],ascending = False).head(5)

{'n_factors': 150, 'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.1}
0.9131157522250813
0.7478184267117285
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9042  0.9223  0.9170  0.9167  0.9192  0.9159  0.0062  
MAE (testset)     0.7430  0.7552  0.7480  0.7545  0.7458  0.7493  0.0048  
Fit time          0.06    0.06    0.06    0.06    0.06    0.06    0.00    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


Unnamed: 0,uid,iid,r_ui,est,details
13,1583,14,3.71832,4.060574,{'was_impossible': False}
44,1583,46,3.71832,4.019352,{'was_impossible': False}
27,1583,29,3.71832,3.965149,{'was_impossible': False}
26,1583,28,3.71832,3.939545,{'was_impossible': False}
30,1583,32,3.71832,3.937279,{'was_impossible': False}


## Model Evaluation on Anti test set

In [35]:
np.random.seed(1)
svd_algo = SVD(n_factors=150,n_epochs=5,lr_all=0.005,reg_all=0.1)
cross_validate(svd_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
predictions = svd_algo.test(anti_testset_user)
print("Validation RMSE:",accuracy.rmse(predictions))

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8956  0.9291  0.9204  0.9076  0.9142  0.9134  0.0113  
MAE (testset)     0.7422  0.7624  0.7476  0.7399  0.7559  0.7496  0.0084  
Fit time          0.03    0.02    0.03    0.02    0.02    0.03    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
RMSE: 0.1133
Validation RMSE: 0.11327732621571301
