In [3]:
import pandas as pd
import numpy as np
import surprise

In [83]:
k_df = pd.read_csv(r"12. Recommender Systems\ml-100k\u.data",
                 sep="\t",
                 names = ["uid", "iid", "rating", "timestamp"])

k_df.drop("timestamp", axis =1, inplace = True)

In [84]:
k_df

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [85]:
lowest_rating = k_df['rating'].min()
print("Lowest Rating: ", lowest_rating)


highest_rating = k_df['rating'].max()
print("Highest Rating: ", highest_rating)

Lowest Rating:  1
Highest Rating:  5


In [86]:
print(" Ratings range between {0} and {1} ".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

 Ratings range between 1 and 5 


Converting the data into surprise format

In [87]:
data = surprise.Dataset.load_from_df(k_df, reader)
type(data)

surprise.dataset.DatasetAutoFolds

Similarity options:

In case of Item based filtering, 'user_based' value will be False


In [88]:
similarity_options = {'name': 'cosine', 'user_based': True}

Default K = 40

In [89]:
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


The above .fit() calculates expected rating for all the users

Say we want expected rating of user=880 for item 377


In [91]:
pred = algo.predict( uid = '880', iid = '476')
score = pred.est
print(score)

3.52986


All the users

In [97]:
iids = k_df['iid'].unique()
iids

array([ 242,  302,  377, ..., 1637, 1630, 1641], dtype=int64)

Consider ratings given by user=880

In [94]:
rec_880 = k_df[k_df['uid'] == 880 ]
iids880 = rec_880['iid']
print("List of iid that uid = {0} has rated:".format(880))
print(iids880)

List of iid that uid = 880 has rated:
60191    386
60259    824
60290    307
60404    299
60414    398
        ... 
99270    294
99706      3
99776    110
99806      5
99995    476
Name: iid, Length: 368, dtype: int64


In [99]:
iids_to_predict = np.setdiff1d(iids, iids880)
print("List of iid which uid={0} did not rate(in all {1}) :".format(880,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=880 did not rate(in all 1314) :
[   6    9   10 ... 1680 1681 1682]


Create a testset for getting the expected rating

In [103]:
testset = [[880,iid,0.] for iid in iids_to_predict]
testset

[[880, 6, 0.0],
 [880, 9, 0.0],
 [880, 10, 0.0],
 [880, 13, 0.0],
 [880, 14, 0.0],
 [880, 15, 0.0],
 [880, 16, 0.0],
 [880, 18, 0.0],
 [880, 19, 0.0],
 [880, 20, 0.0],
 [880, 26, 0.0],
 [880, 30, 0.0],
 [880, 32, 0.0],
 [880, 34, 0.0],
 [880, 35, 0.0],
 [880, 36, 0.0],
 [880, 37, 0.0],
 [880, 43, 0.0],
 [880, 45, 0.0],
 [880, 46, 0.0],
 [880, 48, 0.0],
 [880, 51, 0.0],
 [880, 52, 0.0],
 [880, 57, 0.0],
 [880, 58, 0.0],
 [880, 59, 0.0],
 [880, 60, 0.0],
 [880, 61, 0.0],
 [880, 66, 0.0],
 [880, 73, 0.0],
 [880, 74, 0.0],
 [880, 75, 0.0],
 [880, 76, 0.0],
 [880, 77, 0.0],
 [880, 78, 0.0],
 [880, 83, 0.0],
 [880, 84, 0.0],
 [880, 86, 0.0],
 [880, 89, 0.0],
 [880, 101, 0.0],
 [880, 102, 0.0],
 [880, 103, 0.0],
 [880, 104, 0.0],
 [880, 106, 0.0],
 [880, 107, 0.0],
 [880, 108, 0.0],
 [880, 112, 0.0],
 [880, 113, 0.0],
 [880, 114, 0.0],
 [880, 115, 0.0],
 [880, 116, 0.0],
 [880, 119, 0.0],
 [880, 125, 0.0],
 [880, 126, 0.0],
 [880, 129, 0.0],
 [880, 130, 0.0],
 [880, 131, 0.0],
 [880, 132, 0.0

Generate predictions on testset

In [104]:
predictions = algo.test(testset)
predictions[5]

Prediction(uid=880, iid=15, r_ui=0.0, est=3.7987600126079766, details={'actual_k': 40, 'was_impossible': False})

In [105]:
type(predictions)

list

In [106]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([3.5950755 , 4.30018859, 4.14789564, ..., 2.        , 3.        ,
       3.        ])

In [107]:
iids_to_predict

array([   6,    9,   10, ..., 1680, 1681, 1682], dtype=int64)

Finding the index of maximum predicted rating

In [109]:
i_max = pred_ratings.argmax()
i_max

529

In [110]:
iids_to_predict[i_max] 

814

Recommending the item with maximum predicted rating

In [112]:
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(880,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 880 is 814 with predicted rating as 5.0


Getting top 10 items to be recommended for uid = 880

In [114]:
import heapq
i_sorted_10 = heapq.nlargest(10,  range(len(pred_ratings)), pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

[ 814 1189 1201 1293 1500 1536 1599 1653 1122 1467]


Tuning with different K

In [116]:
np.arange(30,110,10)

array([ 30,  40,  50,  60,  70,  80,  90, 100])

In [117]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'k': np.arange(30,110,10)}

kfold = KFold(n_splits=5,
              random_state=2021,
              shuffle=True)

gs = GridSearchCV(surprise.KNNBasic,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=kfold)


Running the Grid Search CV

In [118]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [119]:
# Best Score
print(gs.best_score['rmse'])

0.9770385408008142


In [120]:
# Best Param
print(gs.best_params['rmse'])

{'k': 30}
