# Evaluate offline performance metrics of RS
Collaborative filtering: matrix factorization with SVD algorithm

## load package

In [1]:
from surprise import SVD, KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut

## put utils folder in the same directory as the notebook
from utils.MovieLens import MovieLens
from utils.RecommenderMetrics import RecommenderMetrics

## read data

In [2]:
ml = MovieLens()

print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
type(data)

Loading movie ratings...


surprise.dataset.DatasetAutoFolds

## compute movie popularity ranks to measure novelty later

In [6]:
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
type(rankings)


Computing movie popularity ranks so we can measure novelty later...


collections.defaultdict

In [7]:
len(rankings.keys())

9066

In [13]:
## items by popularity ranks
for i in range(5):
    print(rankings[i])

0
8
127
344
1790


## compute item similarities to measure diversity later

In [18]:
print("\nComputing item similarities so we can measure diversity later...")
fullTrainSet = data.build_full_trainset()

sim_options = {'name': 'pearson_baseline', 'user_based': False}
# SVD in 'surprise' lib doesn't compute item similarity itself
simsAlgo = KNNBaseline(sim_options=sim_options) 
simsAlgo.fit(fullTrainSet)


Computing item similarities so we can measure diversity later...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fc446d27400>

In [19]:
type(simsAlgo)

surprise.prediction_algorithms.knns.KNNBaseline

## Collaborative filtering: matrix factorization with SVD algorithm

In [15]:
## to save run time, not doing k-fold CV
print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)
type(trainSet)


Building recommendation model...


surprise.trainset.Trainset

In [17]:
algo = SVD(random_state=10)
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc446c8f040>

In [20]:
type(algo)

surprise.prediction_algorithms.matrix_factorization.SVD

In [21]:
print("\nComputing recommendations...")
predictions = algo.test(testSet) # get rating predictions
type(predictions)


Computing recommendations...


list

In [22]:
predictions[:5]

[Prediction(uid='387', iid='3801', r_ui=4.0, est=3.8238706669975406, details={'was_impossible': False}),
 Prediction(uid='534', iid='507', r_ui=4.0, est=3.7701088018860327, details={'was_impossible': False}),
 Prediction(uid='480', iid='8874', r_ui=5.0, est=4.375936145730298, details={'was_impossible': False}),
 Prediction(uid='575', iid='3469', r_ui=4.0, est=3.666028378036622, details={'was_impossible': False}),
 Prediction(uid='214', iid='1219', r_ui=4.0, est=4.648309363932954, details={'was_impossible': False})]

## RMSE and MAE measurement

In [23]:
print("\nEvaluating accuracy of model...")
print("RMSE: ", RecommenderMetrics.RMSE(predictions))
print("MAE: ", RecommenderMetrics.MAE(predictions))


Evaluating accuracy of model...
RMSE:  0.9033701087151801
MAE:  0.6977882196132263


## evaluate top-N recommendations

In [24]:
# Set aside one rating item per user for testing
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

for trainSet, testSet in LOOCV.split(data):
    print("Computing recommendations with leave-one-out...")

    # Train model without left-out ratings
    algo.fit(trainSet)

    # Predicts ratings for left-out ratings only
    print("Predict ratings for left-out set...")
    leftOutPredictions = algo.test(testSet)

    # Build predictions for all items not in the training set
    print("Predict all missing ratings...")
    bigTestSet = trainSet.build_anti_testset()
    allPredictions = algo.test(bigTestSet)

    # Compute top 10 recs for each user
    print("Compute top 10 recs per user...")
    topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n=10)

    # See how often we recommended a movie the user actually rated
    print("\nHit Rate: ", RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions))

    # Break down hit rate by rating value
    print("\nrHR (Hit Rate by Rating value): ")
    RecommenderMetrics.RatingHitRate(topNPredicted, leftOutPredictions)

    # See how often we recommended a movie the user actually liked
    print("\ncHR (Cumulative Hit Rate, rating >= 4): ", RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))

    # Compute ARHR
    print("\nARHR (Average Reciprocal Hit Rank): ", RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions))

Computing recommendations with leave-one-out...
Predict ratings for left-out set...
Predict all missing ratings...
Compute top 10 recs per user...

Hit Rate:  0.029806259314456036

rHR (Hit Rate by Rating value): 
3.5 0.017241379310344827
4.0 0.0425531914893617
4.5 0.020833333333333332
5.0 0.06802721088435375

cHR (Cumulative Hit Rate, rating >= 4):  0.04960835509138381

ARHR (Average Reciprocal Hit Rank):  0.0111560570576964


## compute complete recommendations without hold-out set

In [25]:
print("\nComputing complete recommendations, no hold outs...")
algo.fit(fullTrainSet)

bigTestSet = fullTrainSet.build_anti_testset()
allPredictions = algo.test(bigTestSet)

topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n=10)


Computing complete recommendations, no hold outs...


In [27]:
type(topNPredicted)

collections.defaultdict

In [28]:
topNPredicted.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 208, 209, 210, 211, 212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,

In [31]:
for i in [2, 3, 4]:
    print(topNPredicted[i])

[(750, 4.35280339944793), (7502, 4.352442956492622), (926, 4.342936228026204), (1945, 4.338353198673341), (1035, 4.3211799931232795), (194, 4.316813430150751), (905, 4.3089507079940725), (968, 4.294282510828192), (3462, 4.286268842687708), (1089, 4.281743479404668)]
[(926, 4.4149856280480275), (3462, 4.364463968021669), (1136, 4.324506591977236), (50, 4.230061037677276), (7502, 4.229965552561011), (899, 4.220991684692096), (2064, 4.19789633024895), (905, 4.179668227514134), (741, 4.1671423380333525), (1147, 4.151154032890661)]
[(50, 5), (318, 5), (1221, 5), (111, 5), (534, 5), (4973, 5), (608, 5), (1060, 5), (1147, 5), (1203, 5)]


In [32]:
# Print user coverage with a minimum predicted rating of 4.0:
print("\nUser coverage: ", RecommenderMetrics.UserCoverage(topNPredicted, fullTrainSet.n_users, ratingThreshold=4.0))

# Measure diversity of recommendations:
print("\nDiversity: ", RecommenderMetrics.Diversity(topNPredicted, simsAlgo))

# Measure novelty (average popularity rank of recommendations):
print("\nNovelty (average popularity rank): ", RecommenderMetrics.Novelty(topNPredicted, rankings))


User coverage:  0.9552906110283159
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Diversity:  0.9665208258150911

Novelty (average popularity rank):  491.5767777960256
