# SVD recommendations

In [1]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np

In [2]:
# paths of the local MovieLens CSV files loaded below
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

# Reader describing the ratings file layout: comma-separated columns
# "user item rating timestamp", with one header line to skip
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# load the ratings file into a Surprise dataset
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)

# build movieID <-> movie title lookup tables from movies.csv
movieID_to_name = {}
name_to_movieID = {}
# Decode as UTF-8: reading this file as ISO-8859-1 garbles accented titles
# (the recommendation output showed "AmÃ©lie" instead of "Amélie").
# NOTE(review): ml-latest-small is documented as UTF-8 - confirm if another
# MovieLens release is dropped into this path.
with open(moviesPath, newline='', encoding='utf-8') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip the header row
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        # if two movies share a title, the later row wins in this reverse map
        name_to_movieID[movieName] = movieID

In [3]:
def getUserRatings(user):
    """Collect the (movieID, rating) pairs of one user from the ratings CSV.

    The file at ``ratingsPath`` is scanned from the top (header skipped).
    Rows whose first column equals ``user`` are collected; scanning stops at
    the first non-matching row seen after at least one match, so only the
    first contiguous run of the user's rows is returned.

    Parameters:
        user: integer user id, compared against ``int()`` of the first column.

    Returns:
        list of ``(int movieID, float rating)`` tuples in file order;
        empty if the user never appears.
    """
    collected = []
    with open(ratingsPath, newline='') as handle:
        rows = csv.reader(handle)
        next(rows)  # skip the header row
        for fields in rows:
            owner = int(fields[0])
            if user == owner:
                collected.append((int(fields[1]), float(fields[2])))
            elif collected:
                # the user's run of rows has ended
                break

    return collected

In [4]:
# Popularity ranks are passed to EvaluationData further down:
# `ratings` counts how many rating rows each movie has, and
# `rankings` maps movieID -> rank, where rank 1 is the most-rated movie.
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)  # skip the header row
    for row in ratingReader:
        ratings[int(row[1])] += 1

# most-rated first; the stable sort keeps first-seen order among ties
byPopularity = sorted(ratings.items(), key=lambda item: item[1], reverse=True)
for rank, (movieID, ratingCount) in enumerate(byPopularity, start=1):
    rankings[movieID] = rank

In [5]:
import sys
sys.path.append('..')
from Framework.EvaluationData import EvaluationData
from Framework.RecommenderMetrics import RecommenderMetrics

from surprise import SVD, SVDpp
from surprise import NormalPredictor

In [6]:
# build the evaluation splits from the ratings dataset and the popularity ranks
evaluationData = EvaluationData(ratingsDataset, rankings)

# --- plain SVD: fit on the train split, score on the test split ---
svd = SVD()
svd.fit(evaluationData.GetTrainSet())
predictions = svd.test(evaluationData.GetTestSet())
svdRmse = RecommenderMetrics.RMSE(predictions)
print("RMSE SVD", svdRmse)
svdMae = RecommenderMetrics.MAE(predictions)
print("MAE SVD", svdMae)
print()

# --- SVD++: same procedure ---
svdpp = SVDpp()
svdpp.fit(evaluationData.GetTrainSet())
predictions = svdpp.test(evaluationData.GetTestSet())
svdppRmse = RecommenderMetrics.RMSE(predictions)
print("RMSE SVD++", svdppRmse)
svdppMae = RecommenderMetrics.MAE(predictions)
print("MAE SVD++", svdppMae)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE SVD 0.8777448025673252
MAE SVD 0.6734751254012374

RMSE SVD++ 0.869060784972803
MAE SVD++ 0.6652134391099813


In [8]:
def getMovieName(movieID):
    """Return the title stored for ``movieID`` in ``movieID_to_name``.

    Returns an empty string when the id is not in the lookup table.
    """
    return movieID_to_name.get(movieID, "")

In [9]:
# let's see some recommendations
testSubject = 85
k = 10

# NOTE(review): trainSet is not used in this cell; kept in case other cells rely on it
trainSet = evaluationData.GetFullTrainSet()
testSet = evaluationData.GetAntiTestSetForUser(testSubject)


def rankPredictions(predictions):
    """Convert predictions into (movieID, estimatedRating) pairs, best first.

    Each prediction is unpacked as a 5-tuple
    (user, movie, actual rating, estimated rating, details); the movie id is
    converted with ``int()``. The result is sorted by estimated rating,
    descending (stable for ties).
    """
    ranked = [(int(rawMovieID), estimate)
              for _user, rawMovieID, _actual, estimate, _details in predictions]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


def printTopRecommendations(title, ranked, topN):
    """Print a banner, then the first ``topN`` entries as "<movie name> <estimate>"."""
    print("#"*10, title, "#"*10)
    # Local names only: the previous loop variable `ratings` overwrote the
    # module-level `ratings` popularity-count dict.
    for movieID, estimate in ranked[:topN]:
        print(getMovieName(movieID), estimate)
    print()


predictions = svd.test(testSet)
recommendations = rankPredictions(predictions)
printTopRecommendations("SVD recommendations", recommendations, k)

predictions = svdpp.test(testSet)
recommendations = rankPredictions(predictions)
printTopRecommendations("SVD++ recommendations", recommendations, k)

########## SVD recommendations ##########
One Flew Over the Cuckoo's Nest (1975) 4.417029617121035
Fight Club (1999) 4.408305974911228
Whiplash (2014) 4.361119454402982
Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001) 4.359927624585454
There Will Be Blood (2007) 4.353894110746785
Godfather: Part II, The (1974) 4.348907103152077
Shawshank Redemption, The (1994) 4.336963693958108
Miller's Crossing (1990) 4.32633344844112
Usual Suspects, The (1995) 4.32242557087096
Apocalypse Now (1979) 4.3215054934201165

########## SVD++ recommendations ##########
Shawshank Redemption, The (1994) 4.395018314668186
Streetcar Named Desire, A (1951) 4.36247970343162
Forrest Gump (1994) 4.346441338369155
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.313417345424422
Princess Bride, The (1987) 4.289633522110627
Dark Knight, The (2008) 4.27066547142905
Solaris (Solyaris) (1972) 4.267801986881965
Spotlight (2015) 4.266282832953542
Monty Python and the Holy Grail (1975) 4.245125648499546
Dr. Strangelove o

## Tuning SVD

In [10]:
from surprise.model_selection import GridSearchCV

In [14]:
# hyper-parameter grid for SVD: every combination below is cross-validated
param_grid = dict(
    n_epochs=[20, 30],
    lr_all=[0.005, 0.010],
    n_factors=[50, 100],
)
# 3-fold cross-validation, tracking both RMSE and MAE
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(ratingsDataset)

In [15]:
# report the best RMSE found by the grid search and the parameters that produced it
bestRmse = gs.best_score['rmse']
bestRmseParams = gs.best_params['rmse']
print("Best RMSE score attained: ", bestRmse)
print(bestRmseParams)

Best RMSE score attained:  0.8776478697999123
{'n_epochs': 20, 'lr_all': 0.005, 'n_factors': 50}
