In [None]:
# run in colab

In [1]:
! pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 2.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1634003 sha256=f552604c07796a3615aab39797180dbfdf0beaf9c65d5926bd27ceae720e7711
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [9]:
import numpy as np
import pandas as pd
import urllib
import io
import zipfile
import surprise

In [6]:
dataset = pd.read_csv('/content/sample_data/ratings.txt', sep = ' ', names= ['uid','iid','rating'])
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,1,2.0
1,1,2,4.0
2,1,3,3.5
3,1,4,3.0
4,1,5,4.0


In [7]:
# first check the range of the reviews for this dataset
lower_rating = dataset['rating'].min()
upper_rating = dataset['rating'].max()
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

Review range: 0.5 to 4.0


In [10]:
# load in our dataset and change scale
reader = surprise.Reader(rating_scale = (0.5, 4.))
data = surprise.Dataset.load_from_df(dataset, reader)

In [11]:
# fit SVD model on whole dataset (normally should just fit on training set)
alg = surprise.SVDpp()
output = alg.fit(data.build_full_trainset())

In [12]:
# check the predicted score of, for example, user 50 on a music artist 52 
pred = alg.predict(uid = '50', iid='52')
score = pred.est
print(score)

3.0028030537791928


In [13]:
# Get a list of all movie ids
iids = dataset['iid'].unique()

# Get a list of iids that uid 50 has rated
iids50 = dataset.loc[dataset['uid'] == 50, 'iid']

# Remove the iids that uid 50 has rated from teh list of all movie ids
iids_to_pred = np.setdiff1d(iids, iids50)

In [14]:
# create another dataset with the iids we want to predict in the sparse format as before of: uid, iid, rating. We'll just arbitrarily set all the ratings of this test set to 4, as they are not needed
testset = [[50, iid, 4.] for iid in iids_to_pred]
predictions = alg.test(testset)
predictions[0]

Prediction(uid=50, iid=14, r_ui=4.0, est=3.1665397480736064, details={'was_impossible': False})

In [16]:
# convert this object into an array of the predicted ratings. We’ll then use this to find the iid with the best predicted rating
pred_ratings = np.array([pred.est for pred in predictions])

# find the index of the maximum predicted rating
i_max = pred_ratings.argmax()

# use this to find the corresponding iid to recommend
iid = iids_to_pred[i_max]
print('Top item for user 50 has iid {0} with predicted rating {1}'.format(iid, pred_ratings[i_max]))

Top item for user 50 has iid 218 with predicted rating 4.0


In [17]:
# tune the model
param_grid = {'lr_all': [.001, .01], 'reg_all': [.1, .5]}
gs = surprise.model_selection.GridSearchCV(surprise.SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# print combination of parameters that gave best RMSE score
print(gs.best_params['rmse'])

{'lr_all': 0.01, 'reg_all': 0.1}


In [18]:
alg = surprise.SVDpp(lr_all = .01, reg_all = 0.1) # parameter choices can be added here
output = surprise.model_selection.cross_validate(alg, data, verbose = True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7907  0.8111  0.8013  0.7979  0.7886  0.7979  0.0081  
MAE (testset)     0.6135  0.6235  0.6197  0.6198  0.6084  0.6170  0.0054  
Fit time          16.17   16.44   17.33   16.95   18.25   17.03   0.73    
Test time         0.49    0.52    0.54    0.54    0.66    0.55    0.06    
