In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9418  0.9357  0.9405  0.9288  0.9315  0.9356  0.0050  
MAE (testset)     0.7410  0.7361  0.7420  0.7348  0.7324  0.7373  0.0037  
Fit time          5.60    5.28    5.53    6.31    6.09    5.76    0.38    
Test time         0.18    0.25    0.28    0.19    0.22    0.22    0.04    


{'fit_time': (5.602244853973389,
  5.279819965362549,
  5.5315070152282715,
  6.308753252029419,
  6.093297958374023),
 'test_mae': array([ 0.74097639,  0.73608936,  0.74202444,  0.73482773,  0.73244639]),
 'test_rmse': array([ 0.94175006,  0.93572622,  0.9404656 ,  0.92878403,  0.93148775]),
 'test_time': (0.17873597145080566,
  0.2543649673461914,
  0.2761049270629883,
  0.1928722858428955,
  0.21847987174987793)}

In [None]:
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [11]:
import os
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9517
RMSE: 0.9412
RMSE: 0.9333
RMSE: 0.9293
RMSE: 0.9368


In [21]:
from collections import defaultdict
n = 10
top_n = defaultdict(list)
for uid, iid, r_ui, est, _ in predictions:
    top_n[uid].append((r_ui,iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

top_n

defaultdict(list,
            {'1': [(2.0, '94', 2.8908537313289338),
              (4.0, '88', 3.084747017370038),
              (3.0, '83', 3.8733369986833535),
              (5.0, '59', 4.4529425345601021),
              (4.0, '52', 4.2702829686750183),
              (4.0, '3', 2.6974607300942042),
              (1.0, '29', 2.8173446666565161),
              (4.0, '28', 3.5558006993874125),
              (2.0, '244', 3.1892706586364943),
              (1.0, '231', 2.9126778343929143)],
             '10': [(4.0, '93', 4.1398282894208727),
              (4.0, '9', 4.2403671012334971),
              (4.0, '85', 3.664994509109412),
              (5.0, '707', 4.2438046479321727),
              (4.0, '700', 3.3628013298918407),
              (4.0, '686', 4.1046768162426286),
              (4.0, '610', 4.2429408306815146),
              (4.0, '604', 4.3404214170177973),
              (5.0, '525', 4.2150786309268113),
              (5.0, '484', 4.5687198141389063)],
             '100': [(4.

In [23]:
"""
This module illustrates how to retrieve the top-10 items with highest rating
prediction. We first train an SVD algorithm on the MovieLens dataset, and then
predict all the ratings for the pairs (user, item) that are not in the training
set. We then retrieve the top-10 prediction for each user.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

196 ['408', '169', '483', '318', '603', '178', '513', '963', '50', '357']
186 ['114', '318', '408', '169', '746', '22', '357', '210', '313', '513']
22 ['483', '179', '135', '98', '182', '496', '169', '318', '185', '603']
244 ['98', '483', '178', '134', '654', '14', '187', '513', '285', '12']
166 ['169', '174', '408', '641', '513', '172', '483', '96', '657', '50']
298 ['64', '12', '480', '272', '408', '114', '169', '520', '313', '963']
115 ['408', '175', '169', '134', '179', '514', '474', '200', '154', '603']
253 ['174', '169', '480', '272', '408', '178', '513', '114', '172', '923']
305 ['513', '114', '515', '647', '659', '607', '657', '137', '1142', '661']
6 ['603', '179', '654', '657', '647', '651', '963', '641', '1203', '1194']
62 ['169', '657', '408', '427', '603', '496', '178', '488', '515', '42']
286 ['12', '496', '487', '64', '519', '1121', '921', '485', '480', '603']
200 ['427', '480', '603', '144', '272', '190', '12', '178', '651', '963']
210 ['169', '408', '318', '178', '603',

In [33]:
#dir(trainset)
user_ratings


[('318', 5),
 ('114', 4.9002875978282594),
 ('50', 4.8558934307056916),
 ('483', 4.8316001291956132),
 ('64', 4.8266241821097182),
 ('169', 4.7960824340158474),
 ('480', 4.7890227270565262),
 ('923', 4.7488090274442616),
 ('8', 4.7194256377797315),
 ('174', 4.6911146167899318)]

In [34]:


from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import KNNBasic
from surprise import Dataset

# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')

# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

# we can now query for specific predicions
uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}
