In [31]:
import pandas as pd

In [32]:
data = pd.read_csv('train.csv')

In [33]:
data.shape

(16731, 3)

In [34]:
data.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


In [35]:
data.isna().sum()

user_id       0
article_id    0
rating        0
dtype: int64

In [36]:
any(data.duplicated(subset=['user_id', 'article_id']).values)

False

### User Based CF

In [37]:
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans

In [38]:
data.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


In [39]:
data.columns = ['user_id', 'item_id', 'rating']

In [40]:
reader = Reader(rating_scale=(1,5))

In [41]:
data_ = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], reader)

In [42]:
param_grid={'k': list(range(1,30,2)), 'sim_options':{'name':['cosine', 'pearson']}}

In [43]:
gs_user = GridSearchCV(KNNWithMeans, param_grid, measures = ['rmse'], cv = 5, n_jobs=-1)

In [44]:
gs_user.fit(data_)

Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.

In [45]:
print(gs_user.best_params['rmse'])

{'k': 5, 'sim_options': {'name': 'pearson', 'user_based': True}}


In [46]:
print(gs_user.best_score['rmse'])

1.008811153859624


In [47]:
RMSE = {}
RMSE['user'] = gs_user.best_score['rmse']
RMSE

{'user': 1.008811153859624}

### Item based

In [48]:
param_grid={'k': list(range(1,30,2)), 'sim_options':{'name':['cosine', 'pearson'], 'user_based':[False]}}

In [49]:
gs_item = GridSearchCV(KNNWithMeans, param_grid, measures = ['rmse'], cv = 5, n_jobs=-1)

In [50]:
gs_item.fit(data_)

Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing s

In [51]:
print(gs_item.best_params['rmse'])

{'k': 23, 'sim_options': {'name': 'cosine', 'user_based': False}}


In [52]:
print(gs_item.best_score['rmse'])

1.0648345171504499


In [53]:
RMSE['item'] = gs_item.best_score['rmse']
RMSE

{'user': 1.008811153859624, 'item': 1.0648345171504499}

### Matrix Factorization

In [55]:
from surprise import SVD

In [56]:
param_grid={'n_factors': list(range(1,50,5)), 'n_epochs': list(range(5,50,5)), 'random_state': [69]}

In [57]:
gs_svd = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv = 5, n_jobs=-1)

In [58]:
gs_svd.fit(data_)

In [59]:
print(gs_svd.best_params['rmse'])

{'n_factors': 1, 'n_epochs': 10, 'random_state': 69}


In [60]:
print(gs_svd.best_score['rmse'])

0.932188940000341


In [61]:
RMSE['svd'] = gs_svd.best_score['rmse']
RMSE

{'user': 1.008811153859624,
 'item': 1.0648345171504499,
 'svd': 0.932188940000341}

Matrix Factorisation performed the best with just 1 factor!