## Libraries

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from IPython.display import display, HTML
from sqlite3 import connect
from surprise import Dataset,accuracy
from surprise import Reader
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import NormalPredictor
from surprise import KNNBaseline
from surprise import SVD
import numpy as np
from surprise import NMF
from surprise import CoClustering
from surprise import BaselineOnly
from surprise import KNNWithZScore
from surprise import SlopeOne
from surprise import SVDpp
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse as rmse_sp
import pandas as pd
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
# Notebook display configuration: pandas repr width and a wider notebook container.
pd.options.display.width = 100
# NOTE(review): a max column width of 1 truncates nearly every cell in the repr;
# confirm this is intended (None disables truncation).
pd.options.display.max_colwidth = 1
display(HTML("<style>.container { width:80% !important; }</style>"))

## Modeling

In [None]:
# Rating scale handed to Surprise when the ratings DataFrame is loaded.
reader = Reader(rating_scale=(1, 5))

# Pre-processed ratings table read from the working directory.
model_df = pd.read_csv("processed_data.csv")

## Getting User Data

In [None]:
# Raw id of the student we generate recommendations for.
student_id = 1583

# Surprise expects exactly (user, item, rating) columns, in that order.
rating_columns = ['student_id', 'course_name', 'course_rating']
data = Dataset.load_from_df(model_df[rating_columns], reader)

# Trainset built from every available rating (no hold-out).
trainSet = data.build_full_trainset()

In [None]:
# Lookup table of distinct courses, in order of first appearance in model_df.
course_df = pd.DataFrame(model_df['course_name'].unique(), columns=["course_name"])
# course_id is simply the positional index of the course in this table.
# NOTE(review): later cells compare course_id with Surprise inner item ids;
# that only lines up if Surprise numbers items in the same first-appearance
# order - confirm before relying on it.
course_df["course_id"] = course_df.index

## Create Anti-Testset

In [None]:
# Build the anti-testset for the target student: one (raw_uid, raw_iid, fill)
# tuple for every item in the trainset that the student has NOT rated.
inner_uid = trainSet.to_inner_uid(student_id)
targetUser = inner_uid  # inner id of the target user
fillValue = trainSet.global_mean  # placeholder "true" rating for unrated items

# Inner ids of the items the student has already rated.
user_item_ratings = trainSet.ur[inner_uid]
user_items = [item for (item, _) in user_item_ratings]

# Set copy gives O(1) membership tests; user_items stays a list for later cells.
rated_items = set(user_items)
raw_uid = trainSet.to_raw_uid(targetUser)

anti_testset_user = [
    (raw_uid, trainSet.to_raw_iid(iid), fillValue)
    for iid in trainSet.all_items()
    if iid not in rated_items
]

## Courses Enrolled

In [None]:
# Courses the target student has already rated. user_items holds Surprise
# inner item ids, so translate them back to raw course names instead of
# assuming the DataFrame index happens to match Surprise's internal numbering.
enrolled_course_names = [trainSet.to_raw_iid(inner_iid) for inner_iid in user_items]
course_df.loc[course_df['course_name'].isin(enrolled_course_names)]

# Distance Based Algos

In [None]:
# Similarity configuration shared by the KNN-style models below:
# user-user similarities computed with the 'pearson_baseline' measure.
sim_options = dict(
    name='pearson_baseline',
    user_based=True,
)

## i) KNN Basic

In [None]:
np.random.seed(1)
knn_basic_algo = KNNBasic(min_k=2, k=10, sim_options=sim_options)

# K-fold cross-validation needs at least 2 folds (cv=1 is rejected by
# Surprise); use 5 folds like the other models in this notebook.
cross_validate(knn_basic_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
knn_basic_algo.fit(trainSet)
predictions = knn_basic_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## ii) KNN with means

In [None]:
np.random.seed(1)
knn_means_algo = KNNWithMeans(min_k=3, sim_options=sim_options)

# 5-fold cross-validation to report RMSE / MAE.
cross_validate(knn_means_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
knn_means_algo.fit(trainSet)
predictions = knn_means_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)



## iii) KNNBaseline

# Matrix Factorization Algos

## i) SVD

In [None]:
np.random.seed(1)
svd_algo = SVD(n_factors=150, n_epochs=5, lr_all=0.005, reg_all=0.1)

# 5-fold cross-validation to report RMSE / MAE.
cross_validate(svd_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
svd_algo.fit(trainSet)
predictions = svd_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

In [None]:
# Absolute gap between each estimate and the r_ui stored with the prediction.
# NOTE(review): these predictions come from anti_testset_user, whose r_ui is
# the global-mean fill value rather than a real rating, so 'err' measures
# distance from the global mean, not a true prediction error.
pred['err'] = (pred['est'] - pred['r_ui']).abs()
ranked_by_err = pred.sort_values(by='err')
best_predictions = ranked_by_err.head(5)
best_predictions

## ii) SVDpp

In [None]:
np.random.seed(1)
svdpp_algo = SVDpp()

# 5-fold cross-validation to report RMSE / MAE.
cross_validate(svdpp_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
svdpp_algo.fit(trainSet)
predictions = svdpp_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## CoClustering

In [None]:
np.random.seed(1)
# Named for the algorithm it holds (previously reused the SVD++ variable name).
co_clustering_algo = CoClustering(n_cltr_u=10, n_cltr_i=8, random_state=None)

# 5-fold cross-validation to report RMSE / MAE.
cross_validate(co_clustering_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
co_clustering_algo.fit(trainSet)
predictions = co_clustering_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## SlopeOne

In [None]:
np.random.seed(1)
# Named for the algorithm it holds (previously reused the SVD++ variable name).
slope_one_algo = SlopeOne()

# 5-fold cross-validation to report RMSE / MAE.
cross_validate(slope_one_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate leaves the model fitted on the last fold only; refit on the
# full trainset so recommendations use every available rating.
slope_one_algo.fit(trainSet)
predictions = slope_one_algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## Hyper Parameter Tuning using GridSearchCV

## i) KNN Baseline With Parameters

In [None]:
# Hyper-parameter grid for KNNBaseline.
knn_baseline_param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        # NOTE(review): per the Surprise baseline docs 'reg' is read by the
        # sgd method; als uses reg_i / reg_u - confirm this grid is intended.
        'reg': [1, 2],
    },
    'sim_options': {
        'min_support': [1],  # integer count of common items (was the bool True)
        'user_based': [True],
    },
    # min_k is an algorithm argument, not a similarity option; nested inside
    # sim_options it was silently ignored.
    'min_k': [2],
}

grid_search_knn_baseline = GridSearchCV(
    KNNBaseline, knn_baseline_param_grid, measures=['rmse', 'mae'], cv=5
)
grid_search_knn_baseline.fit(data)
print(grid_search_knn_baseline.best_params['rmse'])
print(grid_search_knn_baseline.best_score['rmse'])
print(grid_search_knn_baseline.best_score['mae'])

# best_estimator is an unfitted algorithm configured with the best parameters.
algo_grid_search_knn_baseline = grid_search_knn_baseline.best_estimator['rmse']
cross_validate(algo_grid_search_knn_baseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on the full trainset so recommendations use every available rating
# instead of only the last cross-validation fold.
algo_grid_search_knn_baseline.fit(trainSet)
predictions = algo_grid_search_knn_baseline.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## ii) KNNBasic With Parameters

In [None]:
# Hyper-parameter grid for KNNBasic. The previous 'bsl_options' entries were
# dropped: with the msd / cosine similarities no baseline estimates are used,
# so they only multiplied the number of (identical) fits by four.
knn_basic_params = {
    'sim_options': {
        'name': ['msd', 'cosine'],
        'user_based': [True],
        'min_support': [1],  # integer count of common items (was the bool True)
    },
}

grid_search_knn_basic = GridSearchCV(
    KNNBasic, knn_basic_params, measures=['rmse', 'mae'], cv=5
)
grid_search_knn_basic.fit(data)
print(grid_search_knn_basic.best_params['rmse'])
print(grid_search_knn_basic.best_score['rmse'])
print(grid_search_knn_basic.best_score['mae'])

# best_estimator is an unfitted algorithm configured with the best parameters.
algo_grid_search_knn_basic = grid_search_knn_basic.best_estimator['rmse']
cross_validate(algo_grid_search_knn_basic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on the full trainset so recommendations use every available rating
# instead of only the last cross-validation fold.
algo_grid_search_knn_basic.fit(trainSet)
predictions = algo_grid_search_knn_basic.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## iii) SVD with Parameters

In [None]:
# Hyper-parameter grid for SVD (2 * 3 * 3 * 3 = 54 combinations, 5 folds each).
svd_params = {
    'n_factors': [100, 150],
    'n_epochs': [5, 10, 15],
    'lr_all': [0.005, 0.01, 0.1],
    'reg_all': [0.02, 0.05, 0.1],
}

grid_search_svd = GridSearchCV(SVD, svd_params, measures=['rmse', 'mae'], cv=5)
grid_search_svd.fit(data)
print(grid_search_svd.best_params['rmse'])
print(grid_search_svd.best_score['rmse'])
print(grid_search_svd.best_score['mae'])

# best_estimator is an unfitted algorithm configured with the best parameters.
algo_grid_search_svd = grid_search_svd.best_estimator['rmse']
cross_validate(algo_grid_search_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on the full trainset so recommendations use every available rating
# instead of only the last cross-validation fold.
algo_grid_search_svd.fit(trainSet)
predictions = algo_grid_search_svd.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## iv) SVDpp with Parameters

In [None]:
# Hyper-parameter grid for SVD++ (2 * 3 * 3 * 3 = 54 combinations, 5 folds each).
svdpp_params = {
    'n_factors': [100, 150],
    'n_epochs': [5, 10, 15],
    'lr_all': [0.005, 0.01, 0.1],
    'reg_all': [0.02, 0.05, 0.1],
}

grid_search_svdpp = GridSearchCV(SVDpp, svdpp_params, measures=['rmse', 'mae'], cv=5)
grid_search_svdpp.fit(data)
print(grid_search_svdpp.best_params['rmse'])
print(grid_search_svdpp.best_score['rmse'])
print(grid_search_svdpp.best_score['mae'])

# best_estimator is an unfitted algorithm configured with the best parameters.
algo_grid_search_svdpp = grid_search_svdpp.best_estimator['rmse']
cross_validate(algo_grid_search_svdpp, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on the full trainset so recommendations use every available rating
# instead of only the last cross-validation fold.
algo_grid_search_svdpp.fit(trainSet)
predictions = algo_grid_search_svdpp.test(anti_testset_user)
pred = pd.DataFrame(predictions)
# Top 5 unrated courses by estimated rating.
pred.sort_values(by=['est'], ascending=False).head(5)

## Model Evaluation

In [None]:
# Mean observed rating per course, keyed by 'iid' so it can be joined with the
# prediction table (whose item column is also named 'iid').
# NOTE(review): the original read from `student_df`, which is never defined in
# this notebook; `model_df` carries the same course_name / course_rating
# columns - confirm it is the intended source.
rating_df = (
    model_df.groupby('course_name')['course_rating']
    .mean()
    .reset_index()
    .rename(columns={'course_name': 'iid', 'course_rating': 'avg_rating'})
)

In [None]:
# Estimated and stored "true" ratings from the most recent predictions list.
# NOTE(review): when `predictions` comes from anti_testset_user, r_ui is the
# global-mean fill value rather than a real rating, so metrics computed from
# ytrue do not measure real prediction error.
ypred = [prediction.est for prediction in predictions]
ytrue = [prediction.r_ui for prediction in predictions]

In [None]:
# Mean squared error between the stored ratings and the model estimates.
mean_squared_error(y_true=ytrue, y_pred=ypred)

In [None]:
# Per-course comparison of the estimated rating and the observed average.
# course_df only has course_name / course_id, so plot the prediction table
# joined with the per-course averages instead.
plot_df = pred.merge(rating_df, on='iid')

# One 30x5 figure (two separate plt.figure() calls created two figures and
# sized each along a single dimension only).
plt.figure(figsize=(30, 5))
plt.xticks(rotation=90)
sns.scatterplot(data=plot_df, x="iid", y="est")
sns.scatterplot(data=plot_df, x="iid", y="avg_rating")
plt.show()

In [None]:
# Estimated rating against the observed average rating of each course.
# course_df only has course_name / course_id, so plot the prediction table
# joined with the per-course averages instead.
plot_df = pred.merge(rating_df, on='iid')

# One 30x5 figure (two separate plt.figure() calls created two figures and
# sized each along a single dimension only).
plt.figure(figsize=(30, 5))
plt.xticks(rotation=90)
sns.scatterplot(data=plot_df, x="avg_rating", y="est")
plt.show()

In [None]:
# Benchmark every Surprise algorithm with default settings using 3-fold CV.
benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average each reported metric over the folds, then tag the row with the
    # algorithm's class name. pd.concat replaces Series.append, which was
    # removed in pandas 2.0.
    mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
    algorithm_name = type(algorithm).__name__
    tmp = pd.concat([mean_scores, pd.Series([algorithm_name], index=['Algorithm'])])
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

In [None]:
# One row per algorithm, indexed by name and ordered by test RMSE (ascending).
benchmark_df = pd.DataFrame(benchmark)
surprise_results = benchmark_df.set_index('Algorithm').sort_values('test_rmse')

In [None]:
# Show the benchmark table as the cell output.
surprise_results