In [1]:
from collections import defaultdict

import pandas as pd

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split

In [27]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's best-estimated items.

    Args:
        predictions(iterable of 5-tuples): Each element unpacks as
            (user id, item id, true rating, estimated rating, details); only
            the user id, item id and estimated rating are used.
        n(int): Maximum number of items to keep per user. Default is 10.

    Returns:
    A defaultdict(list) keyed by user id. Each value is a list of
        (item id, estimated rating) tuples ordered from highest to lowest
        estimate and holding at most n entries. Items with equal estimates
        keep the order in which they appeared in ``predictions``.
    '''

    # Bucket every (item, estimate) pair under the user it was predicted for.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each bucket by estimate, best first, and truncate it to n entries.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user


# Train an SVD model on the first n_ratings rows of the MovieLens ratings file
# and score it on a held-out split.
n_ratings = 1000000
# Fixed seed so the train/test split, the SVD initialisation, and therefore
# the reported RMSE are reproducible across kernel restarts.
random_seed = 42

# nrows stops parsing after n_ratings rows instead of loading the whole file
# and slicing it afterwards.
data_df = pd.read_csv("ratings.csv", nrows=n_ratings)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(data_df[['userId', 'movieId', 'rating']], reader)

# Hold out 25% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=.25, random_state=random_seed)

algo = SVD(random_state=random_seed)
algo.fit(trainset)

# Predict only the held-out (user, item) pairs. To recommend items a user has
# not rated at all, train on data.build_full_trainset() and predict on
# trainset.build_anti_testset() instead.
predictions = algo.test(testset)

# Per-user top 10 items among the held-out predictions.
top_n = get_top_n(predictions, n=10)

# RMSE on the held-out split; kept as the last expression so the notebook
# also displays the returned value.
accuracy.rmse(predictions)

RMSE: 0.8384


0.8383937046435391

In [6]:
from surprise import BaselineOnly
from surprise import SVD
from surprise import Dataset
from surprise import Reader
import os
import pandas as pd
from surprise.model_selection import cross_validate

n_ratings = 1000000
# Fixed seed shared by the fold splitter and the SVD initialisation so that
# repeated runs (e.g. when comparing n_jobs settings) score the same folds.
random_seed = 42

# Path intended for the Dataset.load_from_file route; the DataFrame-based load
# below does not read it.
file_path = os.path.expanduser('~/PROJS/hallmark/surprise_code/12_12_recosys/ratings_new.csv')

# nrows stops parsing after n_ratings rows instead of loading the whole file
# and slicing it afterwards.
data_df = pd.read_csv("ratings.csv", nrows=n_ratings)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(data_df[['userId', 'movieId', 'rating']], reader)


# 5-fold cross-validation (the cross_validate default), spelled out as a KFold
# so the shuffle can be seeded. Reports RMSE and MAE per fold plus timings.
cross_validate(
    SVD(random_state=random_seed),
    data,
    cv=KFold(n_splits=5, random_state=random_seed, shuffle=True),
    verbose=True,
    n_jobs=2,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8368  0.8336  0.8319  0.8355  0.8333  0.8342  0.0017  
MAE (testset)     0.6396  0.6382  0.6365  0.6387  0.6381  0.6382  0.0010  
Fit time          35.58   34.20   36.55   35.26   35.83   35.48   0.77    
Test time         1.39    1.34    1.38    1.43    1.41    1.39    0.03    


{'test_rmse': array([0.83682816, 0.83358649, 0.8318936 , 0.83549357, 0.83332664]),
 'test_mae': array([0.63955108, 0.63824305, 0.63648125, 0.63866097, 0.63813269]),
 'fit_time': (35.58294677734375,
  34.2040376663208,
  36.54542112350464,
  35.25814771652222,
  35.834367513656616),
 'test_time': (1.3891146183013916,
  1.3417341709136963,
  1.3804545402526855,
  1.4315814971923828,
  1.4110774993896484)}

In [None]:
# Recorded result of the cross_validate(SVD(), data, ...) call, run with n_jobs=5:

#{'test_rmse': array([0.83286841, 0.83403583, 0.83453788, 0.837163  , 0.83377609]),
# 'test_mae': array([0.63797554, 0.63812647, 0.6379149 , 0.63997025, 0.6375087 ]),
# 'fit_time': (34.25440835952759,
#  37.010011434555054,
#  36.49393129348755,
#  36.3558554649353,
#  33.53095293045044),
# 'test_time': (1.2990314960479736,
#  1.3195006847381592,
#  1.2744128704071045,
#  1.252169132232666,
#  1.2351505756378174)}

In [None]:
# Recorded result of the cross_validate(SVD(), data, ...) call, run with n_jobs=1:


#{'test_rmse': array([0.83318748, 0.8343154 , 0.83587293, 0.83275152, 0.83569748]),
# 'test_mae': array([0.63789076, 0.63755536, 0.63905574, 0.63709123, 0.63979902]),
# 'fit_time': (34.5315260887146,
#  35.73594927787781,
#  35.420734882354736,
#  35.074718952178955,
#  35.664207458496094),
# 'test_time': (2.5257678031921387,
#  2.1665899753570557,
#  2.3544838428497314,
#  2.5684831142425537,
#  2.2046148777008057)}

In [None]:
# Recorded result of the cross_validate(SVD(), data, ...) call, run with n_jobs=2:

#{'test_rmse': array([0.83682816, 0.83358649, 0.8318936 , 0.83549357, 0.83332664]),
# 'test_mae': array([0.63955108, 0.63824305, 0.63648125, 0.63866097, 0.63813269]),
# 'fit_time': (35.58294677734375,
#  34.2040376663208,
#  36.54542112350464,
#  35.25814771652222,
#  35.834367513656616),
# 'test_time': (1.3891146183013916,
#  1.3417341709136963,
#  1.3804545402526855,
#  1.4315814971923828,
#  1.4110774993896484)}