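# User-based collaborative filtering with random exploration recommendations
Explore how to tackle the cold-start problem by injecting a randomly chosen new movie into each user's recommendation list.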

In [1]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np

In [2]:
# locations of the ratings and movies CSV files, relative to this notebook
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

# describe the ratings file layout (comma separated, one header line to skip)
# and load it from disk into a Surprise dataset -- nothing is downloaded here
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)

# build two lookups from the movies file: id -> title and title -> id
# NOTE(review): the file is decoded as ISO-8859-1; if it is actually UTF-8,
# accented titles will come out garbled -- confirm against the dataset README
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as moviesFile:
    movieRows = csv.reader(moviesFile)
    next(movieRows)  # skip the header row
    for row in movieRows:
        movieID, movieName = int(row[0]), row[1]
        movieID_to_name[movieID] = movieName
        # when two rows share a title, the later row wins
        name_to_movieID[movieName] = movieID

In [3]:
# get every (movieID, rating) pair a user has in the ratings CSV file
def getUserRatings(user, path=None):
    """Return all ratings given by one user, read directly from a ratings CSV.

    The whole file is scanned, so the result is complete even when the user's
    rows are not stored contiguously.

    Args:
        user: Raw user id. It is compared against ``int()`` of the first
            column, so pass an int.
        path: Optional CSV file to read. Defaults to the notebook-level
            ``ratingsPath``. The first row is treated as a header and skipped;
            columns 0, 1 and 2 are read as user id, movie id and rating.

    Returns:
        A list of ``(movieID, rating)`` tuples (``int``, ``float``) in file
        order; empty when the user has no rows.
    """
    if path is None:
        path = ratingsPath

    userRatings = []
    with open(path, newline='') as csvfile:
        ratingReader = csv.reader(csvfile)
        # default of None keeps an empty file from raising StopIteration
        next(ratingReader, None)
        for row in ratingReader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            if int(row[0]) == user:
                userRatings.append((int(row[1]), float(row[2])))

    return userRatings

In [4]:
# Popularity ranks used later by the metrics: first count how many ratings
# each movie received, then number the movies 1, 2, 3, ... from most- to
# least-rated.
ratings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)  # skip the header row
    for row in ratingReader:
        ratings[int(row[1])] += 1

rankings = defaultdict(int)
byPopularity = sorted(ratings.items(), key=lambda pair: pair[1], reverse=True)
for rank, (movieID, ratingCount) in enumerate(byPopularity, start=1):
    rankings[movieID] = rank

In [5]:
def getYears(path=None):
    """Map each movie id to the release year found at the end of its title.

    Args:
        path: Optional movies CSV to read. Defaults to the notebook-level
            ``moviesPath``. The first row is treated as a header and skipped;
            column 0 is read as the movie id and column 1 as the title.

    Returns:
        A ``defaultdict(int)`` of ``movieID -> year``. Only titles ending in
        ``(YYYY)`` (optionally followed by whitespace) get an entry; because
        it is a defaultdict, looking up any other id yields 0.
    """
    if path is None:
        path = moviesPath

    # the year group is optional, so search() always matches and group(1) is
    # simply None for titles without a trailing four-digit year in parentheses
    p = re.compile(r"(?:\((\d{4})\))?\s*$")
    years = defaultdict(int)
    with open(path, newline='', encoding='ISO-8859-1') as csvfile:
        movieReader = csv.reader(csvfile)
        # default of None keeps an empty file from raising StopIteration
        next(movieReader, None)
        for row in movieReader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            year = p.search(row[1]).group(1)
            if year:
                years[int(row[0])] = int(year)
    return years

In [6]:
# collect the movies whose release year is the most recent one on record
def getNewMovies():
    """Return the ids of all movies released in the latest year seen.

    Years come from ``getYears()``; the latest year is printed as a side
    effect before the matching movie ids are returned as a list.
    """
    yearByMovie = getYears()
    latestYear = max(yearByMovie.values())
    print("Newest year is ", latestYear)
    return [movieID for movieID, year in yearByMovie.items() if year == latestYear]

In [13]:
sys.path.append('..')
import heapq
import random
from Framework.EvaluationData import EvaluationData
from Framework.RecommenderMetrics import RecommenderMetrics
from surprise import KNNBasic
from operator import itemgetter

In [10]:
# train a simple user-based KNN with cosine similarity
evalData = EvaluationData(ratingsDataset, rankings)
trainSet = evalData.GetLOOCVTrainSet()
sim_options = {
    'name': 'cosine',
    'user_based': True
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
# fit() has already built the user-user similarity matrix and keeps it on the
# model as `sim`; reuse it rather than calling compute_similarities() again,
# which would rebuild the same matrix a second time
simsMatrix = model.sim

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [11]:
# Gather the inputs for the evaluation step
# 0-based position in each recommendation list that is filled with a randomly
# picked new movie instead of the neighbour-based candidate
explorationSlot = 9
# ids of the movies released in the newest year (also prints that year)
newMovies = getNewMovies()
# test set paired with the LOOCV train set used to fit the model
leftOutTestSet = evalData.GetLOOCVTestSet()

Newest year is  2018


In [14]:
# build up recommendations
topN = defaultdict(list)
k = 10
# Dedicated, seeded generator for the exploration picks so that the lists (and
# the hit rate computed from them) are identical on every run.
explorationSeed = 42
explorationRng = random.Random(explorationSeed)
for uiid in range(trainSet.n_users):
    # the k users most similar to the current one (the user itself excluded)
    similarityRow = simsMatrix[uiid]
    kNeighbors = heapq.nlargest(
        k,
        ((innerID, score) for innerID, score in enumerate(similarityRow) if innerID != uiid),
        key=itemgetter(1),
    )

    # score every item the neighbours rated: each rating is divided by 5.0
    # (presumably the top of the rating scale) and weighted by how similar
    # that neighbour is to the current user
    candidates = defaultdict(float)
    for neighborID, userSimilarityScore in kNeighbors:
        for itemID, rating in trainSet.ur[neighborID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    # items the user already rated in the training set are never recommended
    watched = {itemID for itemID, _ in trainSet.ur[uiid]}

    # walk the unseen candidates from best to worst score
    rawUserID = int(trainSet.to_raw_uid(uiid))
    pos = 0
    for itemID, _ in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID in watched:
            continue
        if pos == explorationSlot and newMovies:
            # exploration: this slot's candidate is replaced by a random new
            # movie; with no new movies available the candidate is kept
            movieID = explorationRng.choice(newMovies)
        else:
            movieID = trainSet.to_raw_iid(itemID)
        # the second tuple field is always 0.0 here
        topN[rawUserID].append((int(movieID), 0.0))
        pos += 1
        # NOTE(review): this stops once 41 items were added; if a 40-item
        # list was intended the test should be `pos >= 40` -- confirm
        if pos > 40:
            break

In [15]:
# score the recommendation lists against the left-out test set and report it
hitRate = RecommenderMetrics.HitRate(topN, leftOutTestSet)
print("HR: ", hitRate)

HR:  0.047540983606557376
