In [5]:
import csv

import numpy as np

from surprise import BaselineOnly
from surprise import Dataset
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise import Reader
from surprise import SVD
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [6]:
# Load the book ratings and a bookID -> title lookup table.
def load_dataset(ratings_path='./ratings_no_quotes_smallest.csv',
                 books_path='./clubs_book.csv',
                 rating_scale=(1, 5)):
    """Load the ratings as a Surprise dataset together with a book-title lookup.

    Args:
        ratings_path: Path to the ratings file. It is read as ';'-separated
            'user item rating' lines, skipping the first (header) line.
        books_path: Path to the books CSV (Latin-1 encoded, one header row).
            Column 0 is parsed as the integer book ID, column 1 is the title.
        rating_scale: (min, max) rating bounds passed to the Surprise Reader.
            The default (1, 5) is Surprise's own default, i.e. what the
            Reader used before this parameter existed.
            NOTE(review): the cross-validation runs recorded in this notebook
            report RMSE around 3-3.8, which is large for a 1-5 scale. If the
            ratings file actually uses a wider scale (e.g. 0-10), pass it
            here -- confirm against the data.

    Returns:
        A tuple (ratings_dataset, bookID_to_name): the Surprise dataset and a
        dict mapping int book ID -> book title.
        NOTE(review): the dict keys are ints, while Surprise keeps raw ids
        read from a file as strings; convert with int(raw_id) before looking
        a title up -- confirm against the Surprise documentation.

    Raises:
        ValueError: If the first column of a non-blank book row is not an
            integer.
    """
    reader = Reader(line_format='user item rating', sep=';', skip_lines=1,
                    rating_scale=rating_scale)
    ratings_dataset = Dataset.load_from_file(ratings_path, reader=reader)

    # Look up a book's name with its bookID as key.
    book_names = {}
    with open(books_path, newline='', encoding='Latin1') as csvfile:
        book_reader = csv.reader(csvfile)
        # Skip the header row; the None default keeps an empty file from
        # raising StopIteration.
        next(book_reader, None)
        for row in book_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is no ID to parse.
                continue
            book_names[int(row[0])] = row[1]

    # Return both the dataset and the lookup dict as a tuple.
    return (ratings_dataset, book_names)


dataset, bookID_to_name = load_dataset()

In [7]:
# SVD (matrix factorisation).
# Author's note: SVD provides a more accurate prediction, but only if applied
# on preprocessed data.
# In the run recorded below, SVD has a low RMSE and MAE but takes longer to fit.
# random_state pins SVD's random initialisation so that re-running the cell
# reproduces the same model.
algo = SVD(random_state=42)

# Run 5-fold cross-validation and print results.
# A seeded KFold is used instead of a bare cv=5 so the (shuffled) fold
# assignment is identical on every run; the output recorded below predates the
# seeding, so the exact figures will differ slightly after a re-run.
cross_validate(algo, dataset, measures=['RMSE', 'MAE'],
               cv=KFold(n_splits=5, random_state=42), verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.0175  2.9506  2.9009  2.9175  2.9668  2.9507  0.0407  
MAE (testset)     2.3536  2.3035  2.2528  2.2979  2.3067  2.3029  0.0320  
Fit time          0.47    0.45    0.51    0.48    0.48    0.48    0.02    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([3.01747995, 2.95064598, 2.90092093, 2.91747023, 2.96678781]),
 'test_mae': array([2.35361127, 2.30352981, 2.25276447, 2.297918  , 2.30665268]),
 'fit_time': (0.47173285484313965,
  0.44814515113830566,
  0.5117897987365723,
  0.4792921543121338,
  0.4777069091796875),
 'test_time': (0.00850820541381836,
  0.008734941482543945,
  0.009173870086669922,
  0.00945591926574707,
  0.009185075759887695)}

In [8]:
# KNNBasic (neighbourhood-based collaborative filtering).
# Author's note: KNN is typically better when less data can be provided.
# In the run recorded below, RMSE and MAE are higher than SVD's; fit time is
# far lower (about 0.02 s vs 0.48 s per fold), while test time is of the same
# order (about 0.02 s vs 0.01 s).
algo = KNNBasic()

# Run 5-fold cross-validation and print results.
# A seeded KFold is used instead of a bare cv=5 so the (shuffled) fold
# assignment is identical on every run; the output recorded below predates the
# seeding, so the exact figures will differ slightly after a re-run.
cross_validate(algo, dataset, measures=['RMSE', 'MAE'],
               cv=KFold(n_splits=5, random_state=42), verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.3721  3.4380  3.4807  3.4900  3.3945  3.4351  0.0463  
MAE (testset)     2.8699  2.9088  2.9262  2.9369  2.8772  2.9038  0.0264  
Fit time          0.02    0.02    0.01    0.01    0.01    0.02    0.00    
Test time         0.02    0.02    0.01    0.02    0.01    0.02    0.00    


{'test_rmse': array([3.37211301, 3.43802177, 3.48067869, 3.48996633, 3.39454957]),
 'test_mae': array([2.86993876, 2.90876116, 2.92616937, 2.93685889, 2.87718134]),
 'fit_time': (0.015182018280029297,
  0.01885390281677246,
  0.014732122421264648,
  0.014858007431030273,
  0.01488804817199707),
 'test_time': (0.017633914947509766,
  0.017277956008911133,
  0.014986038208007812,
  0.015547037124633789,
  0.014621973037719727)}

In [10]:
# Use the Normal Predictor algorithm as a baseline: per the Surprise
# documentation it predicts random ratings drawn from a normal distribution
# estimated from the training set.
# NormalPredictor takes no random_state argument; seeding NumPy's global RNG
# is the approach the Surprise FAQ gives for reproducible experiments.
# Note that this also affects later cells that use np.random.
np.random.seed(42)
algo = NormalPredictor()

# Run 5-fold cross-validation and print results.
# A seeded KFold is used instead of a bare cv=5 so the (shuffled) fold
# assignment is identical on every run; the output recorded below predates the
# seeding, so the exact figures will differ slightly after a re-run.
cross_validate(algo, dataset, measures=['RMSE', 'MAE'],
               cv=KFold(n_splits=5, random_state=42), verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.8775  3.8204  3.8340  3.8345  3.8331  3.8399  0.0195  
MAE (testset)     3.2137  3.1839  3.1959  3.1858  3.1855  3.1930  0.0112  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.02    0.02    0.02    0.02    0.01    0.02    0.00    


{'test_rmse': array([3.87753075, 3.8203611 , 3.83401368, 3.83451573, 3.83312496]),
 'test_mae': array([3.21373374, 3.18392392, 3.1958944 , 3.18581506, 3.1855204 ]),
 'fit_time': (0.01129293441772461,
  0.014464855194091797,
  0.014043331146240234,
  0.012397050857543945,
  0.011802911758422852),
 'test_time': (0.018342256546020508,
  0.021129131317138672,
  0.01959395408630371,
  0.017589807510375977,
  0.01464080810546875)}