In [1]:
import pandas as pd
import numpy as np
import surprise
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

In [2]:
ratings = pd.read_csv("D:/Training/Academy/ML(Python)/20. Recommender Systems/filmtrust/ratings.txt",sep=' ',
                      names = ['uid','iid','rating'])
ratings.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [3]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))

Ratings range between 0.5 and 4.0


In [4]:
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [5]:
similarity_options = {'name': 'cosine', 'user_based': True}
# Default k = 40
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset()) # Calculates expected rating for all the users

Computing the cosine similarity matrix...
Done computing similarity matrix.


List of User IDs

In [6]:
ratings['uid'].unique()

array([   1,    2,    3, ..., 1506, 1507, 1508], dtype=int64)

Expected rating for any specific user for a specific item:

In [7]:
pred = algo.predict(uid='20',iid='101')
print(pred.est)

3.0028030537791928


In [8]:
pred

Prediction(uid='20', iid='101', r_ui=None, est=3.0028030537791928, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

Total Items:

In [9]:
iids = ratings['iid'].unique()
print(iids)

[   1    2    3 ... 2069 2070 2071]


The list of items rated by user 60:

In [10]:
u_iid = list(ratings[ratings['uid']==60]['iid'])
print("List of items rated by user 60:", u_iid)
print("No. of items rated by user {0}: {1}".format(60, len(u_iid)))

List of items rated by user 60: [13, 8, 341, 318, 215]
No. of items rated by user 60: 5


`np.setdiff1d`

In [11]:
a = [ 2,4,6,7,8 ]
b = [ 4,8]
np.setdiff1d(a,b)

array([2, 6, 7])

List of the items not rated by user 60:

In [12]:
iids_to_predict = np.setdiff1d(iids, u_iid)
print("Items not rated by 60 or those items for which the expected ratings are to be predicted:",iids_to_predict )

Items not rated by 60 or those items for which the expected ratings are to be predicted: [   1    2    3 ... 2069 2070 2071]


In [13]:
len(iids_to_predict)

2066

Extracting the estimated rating for iids_to_predict

In [14]:
testset = [[60,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)
exp_ratings = pd.DataFrame(predictions)[['iid','est']]
exp_ratings.sort_values('est', ascending=False).iloc[:10]

Unnamed: 0,iid,est
1033,1039,4.0
196,199,4.0
1640,1646,4.0
1227,1233,4.0
1214,1220,4.0
552,558,4.0
1209,1215,4.0
556,562,4.0
194,197,4.0
1637,1643,4.0


## Tuning for best K

### User-Based Filtering

In [21]:
param_grid = {'k': np.arange(30,70,10),  'user_based':[True]}
param_grid

{'k': array([30, 40, 50, 60]), 'user_based': [True]}

In [16]:
kfold = KFold(n_splits=5, random_state=25, shuffle=True)
gs = GridSearchCV(surprise.KNNWithZScore, param_grid,measures=['rmse', 'mae'], cv=kfold)

In [17]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Best Score & Paramter:

In [18]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8290575638213372
{'k': 60, 'user_based': True}


In [19]:
pd.DataFrame(gs.cv_results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_user_based
0,0.829677,0.830395,0.838368,0.834178,0.841363,0.834796,0.004513,4,0.625128,0.633264,...,0.632302,0.003743,4,0.6569,0.072234,2.114172,0.079602,"{'k': 30, 'user_based': True}",30,True
1,0.826448,0.828719,0.834653,0.830702,0.838866,0.831878,0.004413,3,0.622651,0.632149,...,0.629652,0.003756,3,0.746811,0.039599,2.32187,0.1132,"{'k': 40, 'user_based': True}",40,True
2,0.825377,0.827037,0.833835,0.828185,0.836202,0.830127,0.00416,2,0.621732,0.630082,...,0.628003,0.003521,2,0.643567,0.050068,2.276059,0.110682,"{'k': 50, 'user_based': True}",50,True
3,0.824213,0.826114,0.831701,0.827574,0.835686,0.829058,0.004129,1,0.620618,0.629406,...,0.626905,0.003486,1,0.638207,0.052776,2.370609,0.026003,"{'k': 60, 'user_based': True}",60,True


We can now use the algorithm that yields the best rmse:

In [22]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x2496bce8880>

The recommendations can be generated for any user with the object **algo**.

### Item-Based Filtering

In [23]:
param_grid = {'k': np.arange(30,70,10), 'user_based':[False]}
param_grid

{'k': array([30, 40, 50, 60]), 'user_based': [False]}

In [24]:
kfold = KFold(n_splits=5, random_state=25, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid=param_grid,measures=['rmse', 'mae'], cv=kfold)

In [25]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Best Score:

In [26]:
print(gs.best_score['rmse'])

0.8631869024815704


Best Parameter:

In [27]:
print(gs.best_params['rmse'])

{'k': 60, 'user_based': False}


We can now use the algorithm that yields the best rmse:

In [None]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7dec6c162c30>

The recommendations can be generated for any user with the object **algo**.