# Content-based recommendations
Explore how genre similarity, combined with release-year similarity and a KNN predictor, works for recommending movies.

In [11]:
import os
import csv
import sys
import re
import math

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np
import heapq

## Load data

In [2]:
# Locations of the MovieLens "latest-small" CSV files that the cells below read.
moviesPath = "../data/ml-latest-small/movies.csv"
ratingsPath = "../data/ml-latest-small/ratings.csv"

In [3]:
# Reader describing ratings.csv: comma-separated "user item rating timestamp"
# rows, with the first (header) line skipped.
reader = Reader(sep=',', skip_lines=1, line_format='user item rating timestamp')
# Load all ratings from disk and build a trainset out of every one of them.
# NOTE: despite its name, ratingsDataset ends up bound to that full trainset
# (the result of build_full_trainset), not to the Dataset object itself.
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader).build_full_trainset()

# Popularity statistics:
#   ratings[movieID]  -> how many rating rows mention the movie
#   rankings[movieID] -> popularity rank, 1 being the most-rated movie
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)  # skip the header row
    for row in ratingReader:
        ratings[int(row[1])] += 1  # the second column holds the movie id

# Rank movies from most to least rated; the sort is stable, so movies with the
# same count keep the order in which they were first seen in the file.
for rank, movieID in enumerate(sorted(ratings, key=ratings.get, reverse=True), start=1):
    rankings[movieID] = rank

In [4]:
# Extra per-movie visual features that accompany the MovieLens data.
# mes[movieID] is a list of seven floats taken from columns 1..7 of the CSV,
# in this order: avgShotLength, meanColorVariance, stddevColorVariance,
# meanMotion, stddevMotion, meanLightingKey, numShots.
mes = defaultdict(list)
with open("../data/ml-latest-small/LLVisualFeatures13K_Log.csv", newline='') as csvfile:
    mesReader = csv.reader(csvfile)
    next(mesReader)  # skip the header row
    for row in mesReader:
        movieID = int(row[0])
        mes[movieID] = [float(row[column]) for column in range(1, 8)]

In [5]:
# Read the genres of every movie, giving each distinct genre string a
# sequential integer id in order of first appearance.
genres = defaultdict(list)
genreIDs = {}
maxGenreID = 0
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip the header row
    for row in movieReader:
        movieID = int(row[0])
        # setdefault returns the id already assigned to the genre, or
        # registers the next free id (the current number of known genres).
        genres[movieID] = [genreIDs.setdefault(genre, len(genreIDs))
                           for genre in row[2].split('|')]
# One past the largest id handed out, i.e. the number of distinct genres.
maxGenreID = len(genreIDs)

# At this point genres maps movieID -> list of integer genre ids.
# Replace each list with a fixed-length 0/1 bitfield (one slot per genre)
# so that movies can be compared as vectors.
for (movieID, genreIDList) in genres.items():
    genres[movieID] = [1 if genreID in genreIDList else 0
                       for genreID in range(maxGenreID)]

In [6]:
# Extract the release year from movie titles that end in "(YYYY)".
# The pattern's year group is optional, so it matches every title; group(1)
# is None when the title carries no trailing four-digit year.
p = re.compile(r"(?:\((\d{4})\))?\s*$")
# Movies without a parsable year are simply not stored; because years is a
# defaultdict(int), looking such a movie up later yields 0.
years = defaultdict(int)
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip the header row
    for row in movieReader:
        movieID = int(row[0])
        yearText = p.search(row[1]).group(1)
        if yearText:
            years[movieID] = int(yearText)

In [7]:
# define a function to compute genre similarity
def computeGenreSimilarity(movie1, movie2, genres):
    """
    Cosine similarity between the genre vectors of two movies.

    Args:
        movie1: key of the first movie in ``genres``.
        movie2: key of the second movie in ``genres``.
        genres: mapping from movie key to its genre vector (a sequence of
            numbers, 0/1 bitfields in this notebook).

    Returns:
        sum(x*y) / sqrt(sum(x*x) * sum(y*y)) as a float, or 0.0 when either
        vector has no non-zero component (including an empty vector), since
        the cosine is undefined there and such movies share no genre.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for x, y in zip(genres[movie1], genres[movie2]):
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    denominator = math.sqrt(sumxx * sumyy)
    if denominator == 0:
        # An all-zero or missing genre vector would otherwise divide by zero.
        return 0.0
    return sumxy / denominator

In [8]:
# similarity of two movies based on how far apart their release years are
def computeYearSimilarity(movie1, movie2, years):
    """
    Exponentially decaying similarity of two movies' release years.

    The absolute difference between years[movie1] and years[movie2] is
    divided by 10 and fed through exp(-x): identical years give 1.0 and
    the score shrinks towards 0 as the gap grows.
    """
    yearGap = abs(years[movie1] - years[movie2])
    return math.exp(-yearGap / 10.0)

In [9]:
# Build the symmetric n_items x n_items item-item similarity matrix, indexed
# by the trainset's inner item ids. Each off-diagonal entry is the product of
# the genre similarity and the year similarity of the two movies; the diagonal
# is left at 0.
nItems = ratingsDataset.n_items
similarities = np.zeros((nItems, nItems))
# Translate every inner item id to its integer raw movie id once up front,
# instead of repeating the conversion for each of the ~n^2/2 pairs below.
rawMovieIDs = [int(ratingsDataset.to_raw_iid(innerID)) for innerID in range(nItems)]
for thisRating in range(nItems):
    if thisRating % 100 == 0:
        # progress report, the pairwise loop is slow
        print(thisRating, " of ", nItems)
    thisMovieID = rawMovieIDs[thisRating]
    # only visit the upper triangle and mirror each value into the lower one
    for otherRating in range(thisRating + 1, nItems):
        otherMovieID = rawMovieIDs[otherRating]
        genreSimilarity = computeGenreSimilarity(thisMovieID, otherMovieID, genres)
        yearSimilarity = computeYearSimilarity(thisMovieID, otherMovieID, years)
        # final similarity score combining both genre and year similarity
        similarity = genreSimilarity * yearSimilarity
        similarities[thisRating, otherRating] = similarity
        similarities[otherRating, thisRating] = similarity

0  of  9724
100  of  9724
200  of  9724
300  of  9724
400  of  9724
500  of  9724
600  of  9724
700  of  9724
800  of  9724
900  of  9724
1000  of  9724
1100  of  9724
1200  of  9724
1300  of  9724
1400  of  9724
1500  of  9724
1600  of  9724
1700  of  9724
1800  of  9724
1900  of  9724
2000  of  9724
2100  of  9724
2200  of  9724
2300  of  9724
2400  of  9724
2500  of  9724
2600  of  9724
2700  of  9724
2800  of  9724
2900  of  9724
3000  of  9724
3100  of  9724
3200  of  9724
3300  of  9724
3400  of  9724
3500  of  9724
3600  of  9724
3700  of  9724
3800  of  9724
3900  of  9724
4000  of  9724
4100  of  9724
4200  of  9724
4300  of  9724
4400  of  9724
4500  of  9724
4600  of  9724
4700  of  9724
4800  of  9724
4900  of  9724
5000  of  9724
5100  of  9724
5200  of  9724
5300  of  9724
5400  of  9724
5500  of  9724
5600  of  9724
5700  of  9724
5800  of  9724
5900  of  9724
6000  of  9724
6100  of  9724
6200  of  9724
6300  of  9724
6400  of  9724
6500  of  9724
6600  of  9724
6700  o

In [29]:
from surprise import AlgoBase
from surprise import PredictionImpossible

In [26]:
# build the "anti test set" of a single user: every item they have not rated
def GetAntiTestSetForUser(testSubject):
    """
    Return (raw user id, raw item id, fill) tuples for every item in the
    ratings file that the given user has not rated.

    The ratings file is reloaded into a fresh full trainset on each call.
    testSubject is converted with str() before being looked up as a raw user
    id, and fill is that trainset's global mean rating, used as a placeholder
    rating for the unrated items.
    """
    trainset = Dataset.load_from_file(ratingsPath, reader=reader).build_full_trainset()
    innerUser = trainset.to_inner_uid(str(testSubject))
    rawUser = trainset.to_raw_uid(innerUser)
    # inner ids of the items this user already rated
    ratedItems = {item for (item, _) in trainset.ur[innerUser]}
    placeholderRating = trainset.global_mean

    return [(rawUser, trainset.to_raw_iid(item), placeholderRating)
            for item in trainset.all_items()
            if item not in ratedItems]

In [13]:
# KNN rating estimate for one (user, item) pair, based on content similarity
k = 40 # number of most-similar rated items taken into account
def estimate(u, i):
    """
    Predict the rating of inner user id u for inner item id i.

    Every item the user rated is scored by its precomputed similarity to i;
    the k highest-scoring ones are kept and the prediction is the average of
    the user's ratings for them, weighted by similarity (only strictly
    positive similarities contribute).

    Raises PredictionImpossible when the user or the item is unknown to
    ratingsDataset, or when no kept neighbor has a positive similarity.
    """
    if not ratingsDataset.knows_user(u) or not ratingsDataset.knows_item(i):
        raise PredictionImpossible('User and/or items unkown')

    # (similarity to item i, rating the user gave) for each item the user rated
    neighbors = [(similarities[i, ratedItem], userRating)
                 for (ratedItem, userRating) in ratingsDataset.ur[u]]

    # keep only the k candidates with the highest similarity
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # similarity-weighted average of the user's ratings over those neighbors
    simTotal = 0
    weightedSum = 0
    for (simScore, rating) in k_neighbors:
        if not (simScore > 0):
            continue
        simTotal += simScore
        weightedSum += simScore * rating

    if simTotal == 0:
        raise PredictionImpossible('No neighbors')

    return weightedSum / simTotal

In [None]:
# see recommendations for some user
class ContentKNNAlgorithm(AlgoBase):
    """Surprise algorithm whose predictions come from the estimate() function
    defined in this notebook."""

    def estimate(self, u, i):
        # AlgoBase.predict() hands over inner user/item ids, which is what the
        # notebook-level estimate() expects.
        return estimate(u, i)


testSubject = 85
testSet = GetAntiTestSetForUser(testSubject)
# fit() is an instance method that takes a trainset; calling it on the
# AlgoBase class with the test set (as before) raises a TypeError.
content_knn = ContentKNNAlgorithm()
# ratingsDataset already holds the full trainset that estimate() and the
# similarities matrix are indexed by.
content_knn.fit(ratingsDataset)
# One prediction per item the test subject has not rated yet.
predictions = content_knn.test(testSet)