In [10]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow import keras

In [7]:
!pip install scikit-surprise
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163490 sha256=ff1491f8e1bfa98cba3302a02e82ea185b00690cd9ed221497982fe6f1132f8e
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [14]:
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)

movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)

keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
tags_file = movielens_dir / "tags.csv"
movies_file = movielens_dir / "movies.csv"

df_rating = pd.read_csv(ratings_file)
tags = pd.read_csv(tags_file)
movies = pd.read_csv(movies_file)


In [15]:
df_rating.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


SVD algorithm

In [18]:
svd = SVD()

In [17]:
reader = Reader()

In [19]:
data = Dataset.load_from_df(df_rating[['userId', 'movieId', 'rating']], reader)

cross-validation and print the results

In [20]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8724  0.8694  0.8748  0.8708  0.8760  0.8727  0.0024  
MAE (testset)     0.6722  0.6699  0.6733  0.6686  0.6726  0.6713  0.0018  
Fit time          1.48    1.45    1.49    1.44    2.33    1.64    0.35    
Test time         0.12    0.18    0.14    0.12    0.20    0.15    0.03    


{'test_rmse': array([0.8723896 , 0.86944087, 0.87476297, 0.87075104, 0.87601913]),
 'test_mae': array([0.6721622 , 0.6699408 , 0.67328827, 0.66863475, 0.67261708]),
 'fit_time': (1.4757578372955322,
  1.4533376693725586,
  1.4853723049163818,
  1.4448726177215576,
  2.3268790245056152),
 'test_time': (0.12429285049438477,
  0.17807292938232422,
  0.14011645317077637,
  0.12296795845031738,
  0.2039639949798584)}

 train dataset and predict

In [21]:
trainset = data.build_full_trainset()

In [22]:
# Train the algorithm on trainset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ddcd940f040>

pick user with user Id 3 and check the ratings he has given

In [24]:
df_rating[df_rating['userId'] == 3]

Unnamed: 0,userId,movieId,rating,timestamp
261,3,31,0.5,1306463578
262,3,527,0.5,1306464275
263,3,647,0.5,1306463619
264,3,688,0.5,1306464228
265,3,720,0.5,1306463595
266,3,849,5.0,1306463611
267,3,914,0.5,1306463567
268,3,1093,0.5,1306463627
269,3,1124,0.5,1306464216
270,3,1263,0.5,1306463569


algorithm to predict his/her score for move_id of 506

In [27]:
# predict ratings for the testset
svd.predict(uid=3, iid=506, r_ui=None)

Prediction(uid=3, iid=506, r_ui=None, est=2.9062309117236205, details={'was_impossible': False})

In [29]:
# directly grab the estimated ratings for the testset
svd.predict(uid=3, iid=506, r_ui=None).est

2.9062309117236205

For movie with ID 506, we get an estimated prediction of 2.90. One feature of this recommender system is that it does not care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have predicted the movie.