## Collaborative Filtering
#### Model Based Approach

In [1]:
! pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.8 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633950 sha256=9a492e15b75224a64db8f3bb4b2d3920483dec2fee8d9cdfb47e2c7ed799bdec
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
import pandas as pd
# import SVD from surprise
from surprise import SVD

# # import dataset from surprise
from surprise import Dataset
from surprise import Reader


# import accuracy from surprise
from surprise import accuracy

# import train_test_split from surprise.model_selection
from surprise.model_selection import train_test_split
# import GridSearchCV from surprise.model_selection
from surprise.model_selection import GridSearchCV
# import cross_validate from surprise.model_selection
from surprise.model_selection import cross_validate

In [5]:
book_ratings = pd.read_csv('/content/sample_data/BX-Book-Ratings.csv',sep=";", encoding="latin")

In [6]:
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
lower_rating = min(book_ratings['Book-Rating'])
upper_rating = max(book_ratings['Book-Rating'])

print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

Review range: 0 to 10


In [8]:
book_ratings.shape

(1149780, 3)

In [9]:
# create surprise dataset from book_ratings

reader = Reader(rating_scale=(0, 10))

# Loads Pandas dataframe
data = Dataset.load_from_df(book_ratings, reader)

In [18]:
data_train, data_test = train_test_split(data, test_size = 0.15)

In [19]:
# fit SVD model on training set
alg = SVD()
alg.fit(data_train)
predictions = alg.test(data_test)

In [20]:
# print default model's rmse that was computed on the test set (using object accuracy we imported in the beginning)
accuracy.rmse(predictions)

RMSE: 3.5030


3.5030274620776063

In [21]:
# tune the model
param_grid = {'n_factors': [110, 120, 140, 160], 'reg_all': [0.08, 0.1, 0.15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# print combination of parameters that gave best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

{'n_factors': 160, 'reg_all': 0.15}
{'n_factors': 110, 'reg_all': 0.08}


In [22]:
# fit SVD model on training set with updated params
alg = SVD(n_factors = 160, reg_all = 0.15)
alg.fit(data_train)
predictions = alg.test(data_test)
accuracy.rmse(predictions)

RMSE: 3.4264


3.4264395268224304