### Automatic cross-validation

In [3]:
from surprise import Dataset, SVD
from surprise.model_selection import cross_validate


# MovieLens-100k is one of the built-in datasets (it is downloaded if needed).
data = Dataset.load_builtin("ml-100k")

# The estimator under evaluation: SVD, constructed with no arguments.
algo = SVD()

# Cross-validate over 5 folds, scoring each fold with RMSE and MAE.
# verbose=True prints the summary; the returned value is kept as the cell's
# last expression so the notebook displays it too.
cv_results = cross_validate(
    algo,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)
cv_results

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9490  0.9349  0.9310  0.9366  0.9274  0.9358  0.0073  
MAE (testset)     0.7480  0.7393  0.7352  0.7367  0.7291  0.7377  0.0062  
Fit time          0.70    0.73    0.72    0.72    0.70    0.71    0.01    
Test time         0.14    0.13    0.09    0.13    0.12    0.12    0.02    


{'test_rmse': array([0.94895533, 0.93485383, 0.93101334, 0.93655514, 0.92738631]),
 'test_mae': array([0.74796064, 0.73933601, 0.73515609, 0.73671914, 0.72908893]),
 'fit_time': (0.7009801864624023,
  0.7341670989990234,
  0.7211644649505615,
  0.7167453765869141,
  0.7019352912902832),
 'test_time': (0.13703227043151855,
  0.1290299892425537,
  0.09202146530151367,
  0.1260054111480713,
  0.12499189376831055)}

### Use a custom dataset

In [19]:
import os

from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate

# Location of the ratings file on disk ("~" is expanded to the home directory).
file_path = os.path.expanduser("~/.surprise_data/ml-100k/ml-100k/u.data")

# A custom file needs a Reader that describes its layout. Each line of the
# movielens-100k file reads 'user item rating timestamp', with the fields
# separated by '\t' characters.
reader = Reader(sep="\t", line_format="user item rating timestamp")
data = Dataset.load_from_file(file_path, reader=reader)

# From here on the dataset behaves like any other, e.g. with cross_validate.
# No measures or cv are passed, so the library defaults apply (the recorded
# output reports RMSE and MAE over 5 splits).
baseline_results = cross_validate(BaselineOnly(), data, verbose=True)
baseline_results

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9453  0.9362  0.9550  0.9387  0.9474  0.9445  0.0067  
MAE (testset)     0.7468  0.7435  0.7560  0.7446  0.7525  0.7487  0.0048  
Fit time          0.14    0.17    0.16    0.16    0.16    0.16    0.01    
Test time         0.05    0.05    0.06    0.05    0.05    0.05    0.00    


{'test_rmse': array([0.94534981, 0.93615582, 0.95503278, 0.93874078, 0.94743453]),
 'test_mae': array([0.74684604, 0.74347603, 0.75601081, 0.74461369, 0.75252522]),
 'fit_time': (0.13503289222717285,
  0.1690373420715332,
  0.1560354232788086,
  0.16303730010986328,
  0.15703582763671875),
 'test_time': (0.05000948905944824,
  0.05101203918457031,
  0.056012868881225586,
  0.050011396408081055,
  0.04901123046875)}

### Use cross-validation iterators

In [10]:
from surprise import accuracy, Dataset, SVD
from surprise.model_selection import KFold

# Built-in movielens-100k ratings.
data = Dataset.load_builtin("ml-100k")

# Cross-validation iterator producing 3 splits, and the model to evaluate.
kf = KFold(n_splits=3)
algo = SVD()

# One pass per split: refit the same SVD instance on the split's trainset,
# predict its testset, then let accuracy.rmse print the Root Mean Squared Error.
for trainset, testset in kf.split(data):
    # fit() hands back the estimator, so the test() call can be chained.
    predictions = algo.fit(trainset).test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9412
RMSE: 0.9509
RMSE: 0.9513


### Cross-validation with folds predefined in files

In [11]:
import os

from surprise import accuracy, Dataset, Reader, SVD
from surprise.model_selection import PredefinedKFold

# Folder that holds the movielens-100k files, including the predefined folds.
files_dir = os.path.expanduser("~/.surprise_data/ml-100k/ml-100k/")

# This time, we'll use the built-in reader.
reader = Reader("ml-100k")

# folds_files is a list of (train path, test path) tuples, one per fold:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# The file name is formatted first and only then joined to the folder, so the
# result does not depend on files_dir ending with a separator and a '%'
# character in the home directory cannot interfere with the formatting.
folds_files = [
    (
        os.path.join(files_dir, "u%d.base" % i),
        os.path.join(files_dir, "u%d.test" % i),
    )
    for i in range(1, 6)
]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

# One pass per predefined fold: fit on the fold's trainset, predict its
# testset, then compute and print the Root Mean Squared Error.
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9504
RMSE: 0.9424
RMSE: 0.9309
RMSE: 0.9345
RMSE: 0.9369


### Train-test split and fit() method

In [8]:
from surprise import accuracy, Dataset, SVD
from surprise.model_selection import train_test_split

# Built-in movielens-100k ratings (downloaded if needed).
data = Dataset.load_builtin("ml-100k")

# Random split: 25% of the ratings go to the test set, the rest to the trainset.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit SVD on the trainset and predict the ratings of the testset. fit() hands
# back the estimator, so the two calls are chained.
algo = SVD()
predictions = algo.fit(trainset).test(testset)

# RMSE of those predictions; as the last expression, the returned value is
# also shown as the cell's output.
accuracy.rmse(predictions)

RMSE: 0.9395


0.9395339744660888

### Train on a whole trainset and the predict() method

In [9]:
from surprise import Dataset, KNNBasic

# Build a trainset from all of the movielens-100k ratings (no test split).
data = Dataset.load_builtin("ml-100k")
trainset = data.build_full_trainset()

# Train a basic k-NN algorithm on it.
algo = KNNBasic()
algo.fit(trainset)

# predict() works with raw ids, i.e. the ids as they appear in the ratings
# file. Because this dataset was read from a file, those raw ids are
# **strings**, even though they look like numbers.
uid = "196"  # raw user id
iid = "302"  # raw item id

# Predict the rating of this user for this item. r_ui is the true, known
# rating; verbose=True prints the prediction.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


### Tune algorithm parameters with GridSearchCV

In [12]:
from surprise import Dataset, SVD
from surprise.model_selection import GridSearchCV

# Built-in movielens-100K ratings.
data = Dataset.load_builtin("ml-100k")

# Two candidate values for each of three SVD parameters: 8 combinations.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
}

# Score every combination with 3-fold cross-validation on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# First the best RMSE score, then the parameter combination that achieved it.
for best_by_measure in (gs.best_score, gs.best_params):
    print(best_by_measure["rmse"])

0.9633656595058318
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [13]:
# Take the algorithm that yields the best rmse in the grid search and fit it
# on a trainset built from the whole dataset.
full_trainset = data.build_full_trainset()
algo = gs.best_estimator["rmse"]
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28de8c5b800>

In [17]:
import pandas as pd
# Tabulate the grid-search results (gs.cv_results) as a DataFrame and show it.
results_df = pd.DataFrame(gs.cv_results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.993824,0.998935,0.998765,0.997175,0.00237,7,0.802444,0.808015,0.807708,0.806056,0.002557,7,0.169706,0.000943,0.187709,0.023982,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,0.999967,1.005216,1.004401,1.003195,0.002306,8,0.811082,0.816832,0.816111,0.814675,0.002558,8,0.150368,0.002626,0.187042,0.026324,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.969914,0.976222,0.974347,0.973495,0.002645,3,0.777794,0.785082,0.782946,0.781941,0.003059,2,0.146905,0.000182,0.18411,0.024808,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.978679,0.985357,0.983013,0.98235,0.002766,5,0.788419,0.796213,0.794061,0.792897,0.003287,5,0.146367,0.000472,0.200712,0.000943,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.974362,0.980554,0.978855,0.977924,0.002612,4,0.781975,0.789086,0.78701,0.786024,0.002985,4,0.289596,0.001123,0.188061,0.023345,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4
5,0.98254,0.988541,0.986812,0.985964,0.002522,6,0.79246,0.799303,0.79749,0.796417,0.002895,6,0.289399,0.002358,0.193044,0.03128,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}",10,0.002,0.6
6,0.959311,0.966822,0.963964,0.963366,0.003096,1,0.767837,0.776586,0.772908,0.772444,0.003587,1,0.286867,0.001967,0.186039,0.026326,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",10,0.005,0.4
7,0.969237,0.976691,0.97379,0.973239,0.003068,2,0.779464,0.788192,0.784966,0.784207,0.003604,3,0.285495,0.001007,0.185693,0.026633,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}",10,0.005,0.6
