In [8]:
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNWithMeans, SVDpp, SVD

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [3]:
movies = pd.read_csv('ml-1m/movies.dat', header=None, sep='::', engine='python',
                      names=['MovieID', 'Title', 'Genres'])

In [6]:
movies_with_ratings = movies.join(ratings.set_index('MovieID'), on='MovieID').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [7]:
movies_with_ratings.head()

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,978237008.0
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,978233496.0
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,978225952.0
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,978226474.0


In [11]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.UserID,
    'iid': movies_with_ratings.Title,
    'rating': movies_with_ratings.Rating
})

In [12]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [13]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [14]:
algo_svd = SVD()

In [15]:
result_svd = cross_validate(algo_svd, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8752  0.8782  0.8722  0.8734  0.8722  0.8742  0.0023  
MAE (testset)     0.6871  0.6893  0.6848  0.6864  0.6843  0.6864  0.0018  
Fit time          46.39   45.20   45.31   43.24   44.74   44.98   1.02    
Test time         1.92    1.60    1.86    1.63    1.87    1.78    0.13    


In [16]:
np.mean(result_svd['test_rmse'])

0.8742343474529782