# Movie Lens exploration
This notebook is just an exploration on this dataset used to start learning about recommendation systems. It is one of the datasets available by `surprise`.

In [1]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np

In [2]:
# define some paths to download data
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

In [3]:
# define reader instance to download data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# download dataset to path
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)

In [4]:
# now parse movies dataset
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [5]:
# get user ratings from ratingsDataset based on user
def getUserRatings(user):
    userRatings = []
    hitUser = False
    with open(ratingsPath, newline='') as csvfile:
        ratingReader = csv.reader(csvfile)
        next(ratingReader)
        for row in ratingReader:
            userID = int(row[0])
            if (user == userID):
                movieID = int(row[1])
                rating = float(row[2])
                userRatings.append((movieID, rating))
                hitUser = True
            if (hitUser and (user != userID)):
                break

    return userRatings

In [6]:
# now it is important to get popularity ranks to get some metrics
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)
    for row in ratingReader:
        movieID = int(row[1])
        ratings[movieID] += 1
    rank = 1
    for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
        rankings[movieID] = rank
        rank +=1

In [7]:
# get movie genres
genres = defaultdict(list)
genreIDs = {}
maxGenreID = 0
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        genreList = row[2].split('|')
        genreIDList = []
        for genre in genreList:
            if genre in genreIDs:
                genreID = genreIDs[genre]
            else:
                genreID = maxGenreID
                genreIDs[genre] = genreID
                maxGenreID += 1
            genreIDList.append(genreID)
        genres[movieID] = genreIDList

# genres has a dict that has genres encoded as integers
# so the goal is to convert those to bitfields to it can be
# treated as vectors
for (movieID, genreIDList) in genres.items():
    bitfield = [0] * maxGenreID
    for genreID in genreIDList:
        bitfield[genreID] = 1
    genres[movieID] = bitfield

In [8]:
# get movie years
p = re.compile(r'(?:\((\d{4})\))?\s*$')
years = defaultdict(int)
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)
    for row in movieReader:
        movieID = int(row[0])
        title = row[1]
        m = p.search(title)
        year = m.group(1)
        if year:
            years[movieID] = int(year)

In [9]:
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

def getMovieID(movieName):
        if movieName in name_to_movieID:
            return name_to_movieID[movieName]
        else:
            return 0

# Define some modeling utils
such a metrics and features

In [10]:
import itertools
from surprise import accuracy

In [19]:
def MAE(predictions):
    return accuracy.mae(predictions, verbose=False)

def RMSE(predictions):
    return accuracy.rmse(predictions, verbose=False)

def GetTopN(predictions, n=10, minimumRating=4.0):
    """
    Get top n predictions based on rating threshold
    """
    topN = defaultdict(list)

    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if (estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating))
    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]
    
    return topN

def HitRate(topNPredicted, leftOutPredictions):
    """
    Calculate hit rate which is the rate of how many recommendations does
    the user alredy hit before
    """
    hits = 0
    total = 0

    # for each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # is it on the top 10 predicted for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID)==int(movieID)):
                hit=True
                break
        if (hit):
            hits += 1
        
        total += 1
    
    return hits/total

def CummulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
    hits = 0
    total = 0
    # for each left-out rating
    for userID, leftOutMovieID, actualRating, estatimatedRating, _ in leftOutPredictions:
        # only look at ability to recommend things the user actually liked
        if (actualRating >= ratingCutoff):
            # is it in the predicted top 10 for this user?
            hit = False
            for movieID, predictedRating in topNPredicted[int(userID)]:
                if (int(leftOutMovieID)==movieID):
                    hit = True
                    break
            if (hit):
                hits += 1
            total += 1
    return hits/total

def RatingHitRate(topNPredicted, leftOutPredictions):
    hits = defaultdict(float)
    total = defaultdict(float)

    for userID, leftOutMovieID, actualRating, estatimatedRating, _ in leftOutPredictions:
        # is it in the predicted top N for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == movieID):
                hit = True
                break
        if (hit):
            hits[actualRating] += 1
        
        total[actualRating] += 1
    
    # compute overral precision
    for rating in sorted(hits.keys()):
        print(rating, hits[rating]/total[rating])


def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
    summation = 0
    total = 0
    # for each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # is it the predicted top N for this user?
        hitRank = 0
        rank = 0
        for movieID, predictedRating in topNPredicted[int(userID)]:
            rank = rank + 1
            if (int(leftOutMovieID)==movieID):
                hitRank = rank
                break
        if (hitRank > 0):
            summation += 1.0 / hitRank
        
        total += 1

    return summation/ total

def userCoverage(topNPredicted, numUsers, ratingThreshold=0):
    hits = 0
    for userID in topNPredicted.keys():
        hit = False
        for movieID, predictedRating in topNPredicted[userID]:
            if (predictedRating >= ratingThreshold):
                hit = True
                break
        if (hit):
            hit += 1
    
    return hits / numUsers

def Diversity(topNpredicted, simsAlgo):
    n = 0
    total = 0
    simsMatrix = simsAlgo.compute_similarities()
    for userID in topNpredicted.keys():
        pairs = itertools.combinations(topNpredicted[userID], 2)
        for pair in pairs:
            movie1 = pair[0][0]
            movie2 = pair[1][0]
            innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
            innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
            similarity = simsMatrix[innerID1][innerID2]
            total += similarity
    
    # S = (total/n)
    return (1 - (total/n))

def Novelty(topNPredicted, rankings):
    n = 0
    total = 0
    for userID in topNPredicted.keys():
        for rating in topNPredicted[userID]:
            movieID = rating[0]
            rank = rankings[movieID]
            total += rank
            n += 1
            
    return total / n


## Model training
The most popular one is SVD which is a matrix factorization-based technique. 

In [14]:
from surprise.model_selection import train_test_split, LeaveOneOut
from surprise import SVD, NormalPredictor, KNNBaseline

In [16]:
# build dataset from loaded surprise dataset
fullTrainSet = ratingsDataset.build_full_trainset()
# define options for KNN baseline, we need the similarities
# for further steps
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False
}
# define a KNN baseline to get similarities
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(fullTrainSet)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x115363700>

In [17]:
# split 75/25
trainSet, testSet = train_test_split(ratingsDataset, test_size=0.25, random_state=1)
algo = SVD(random_state=10)
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11538f190>

In [18]:
# compute recomendations on test
predictions = algo.test(testSet)
print('RMSE: ', RMSE(predictions))
print('MAE', MAE(predictions))

RMSE:  0.87790565300794
MAE 0.6731720779996845


In [21]:
# evaluate top-10 recommendations with cross-fold
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
for trainSet, testSet in LOOCV.split(ratingsDataset):
    algo.fit(trainSet)
    leftOutPredictions = algo.test(testSet)
    # this creates a set for movies that the user has not rate before
    bigTestSet = trainSet.build_anti_testset()
    # rank the previous with the algorithm 
    allPredictions = algo.test(bigTestSet)
    # get top-n recommendations
    topNPredicted = GetTopN(allPredictions, n=10)
    print("top-n: ", topNPredicted)
    # see how often the algorithm predicted something the user already rate
    print("hit-rate: ", HitRate(topNPredicted, leftOutPredictions))
    # break down hit rate by rating value
    print("rating hit-rate: ")
    RatingHitRate(topNPredicted, leftOutPredictions)
    # see how often the model recommended something the user actually liked
    print("Cummulative hit rate >= 4", CummulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=4.0))

top-n:  defaultdict(<class 'list'>, {1: [(318, 5), (79132, 5), (898, 5), (904, 5), (908, 5), (910, 5), (912, 5), (1079, 5), (750, 5), (5618, 5)], 2: [(951, 4.510394186164916), (1221, 4.466989923196604), (858, 4.457507712283533), (1225, 4.4542693950427985), (1945, 4.449507740575167), (50, 4.434674495799477), (1196, 4.423228780991947), (7153, 4.423048531378662), (1198, 4.390831043268471), (260, 4.371317091575608)], 4: [(16, 4.501868828592104), (122926, 4.492593421897621), (3578, 4.484570691003636), (1193, 4.4788377384676465), (7153, 4.460953887080198), (45720, 4.375282987669873), (89745, 4.37406791643794), (48516, 4.373263474467198), (1104, 4.367137487558025), (1201, 4.361950387420735)], 5: [(2571, 4.415665601094694), (750, 4.370287560478859), (260, 4.336332017049061), (912, 4.31841422496164), (908, 4.315353838743837), (7153, 4.300614731331705), (4993, 4.289647412087848), (5952, 4.287381634586103), (3451, 4.283756706070486), (1221, 4.2465795232923265)], 6: [(1197, 4.606914978867989), (90

Interpretation of these metrics:
- Recommendations of 4.5 stars and above were the most common. So that's good.
- The cummulative hit-rate is greater than the raw hit-rate so, it's a good sign that the model is recommending more titles based on the fixed threshold.

In [22]:
# now build metrics for all predictions as a whole
algo.fit(fullTrainSet)
bigTestSet = fullTrainSet.build_anti_testset()
allPredictions = algo.test(bigTestSet)
topNPredicted = GetTopN(allPredictions, n=10)
print("Coverage: ", userCoverage(topNPredicted, fullTrainSet.n_users, ratingThreshold=4.0))
print("Diversity: ", Diversity(topNPredicted, simsAlgo))
print("Novelty: ", Novelty(topNPredicted, rankings))

Coverage:  0.0
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Diversity:  -inf
Novelty:  504.3873857062885


  return (1 - (total/n))


A good thing is to have novelty and diversity on lower values just to be sure that the model is not recommending anything obscure to the user. Novelty can be a good thing, but keeping it more controlled could be better, so the user can be more familiar with the content the model is recommending.