# Remove outliers to evaluate impact on final results

In [15]:
import re
import csv
import sys

import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader

from collections import defaultdict

In [3]:
# Paths to the ratings and movies CSV files used throughout the notebook
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

# Load the raw ratings; outlier users are filtered out before the Surprise
# dataset is built.
ratingsDataset = 0  # placeholder; rebound to the Surprise dataset in a later cell
ratings = pd.read_csv(ratingsPath, encoding='latin-1')

# Number of ratings submitted by each user
ratingsByUser = ratings.groupby('userId', as_index=False).agg({"rating": "count"})

# A user is an outlier when their rating count lies more than `outlierStdDev`
# standard deviations away from the mean count, in either direction.
# The absolute value must wrap the deviation itself: wrapping the comparison
# (as before) takes abs() of a boolean and only ever flags high-side outliers.
outlierStdDev = 3
ratingCountDeviation = (ratingsByUser.rating - ratingsByUser.rating.mean()).abs()
ratingsByUser['outlier'] = ratingCountDeviation > ratingsByUser.rating.std() * outlierStdDev
# Keep only userId + outlier flag (reassign instead of mutating in place so the
# cell is safe to re-run)
ratingsByUser = ratingsByUser.drop(columns=['rating'])

# Attach the per-user outlier flag to every individual rating row
combined = ratings.merge(ratingsByUser, on='userId', how='left')

# Keep only ratings from non-outlier users, then drop the helper and timestamp
# columns
filtered = combined.loc[combined['outlier'] == False]
filtered = filtered.drop(columns=['outlier', 'timestamp'])

In [4]:
# Surprise reader describing the rating scale of the data.
# MovieLens "latest-small" ratings run from 0.5 to 5.0 stars in half-star
# steps, so the scale has to start at 0.5 rather than 1.
reader = Reader(rating_scale=(0.5, 5))
# Build the Surprise dataset from the outlier-filtered ratings frame
ratingsDataset = Dataset.load_from_df(filtered, reader)

In [7]:
# Popularity ranks are needed by some of the evaluation metrics.
# Counts come from the full ratings file at `ratingsPath` (not from `filtered`),
# so ratings by outlier users are included here.
# NOTE: `ratings` is rebound from the ratings DataFrame to a per-movie counter.
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)  # skip the header row
    for row in ratingReader:
        # the second column holds the movie id
        ratings[int(row[1])] += 1

# Rank 1 goes to the most-rated movie; the sort is stable, so ties keep the
# order in which movies were first seen in the file
mostRatedFirst = sorted(ratings.items(), key=lambda pair: pair[1], reverse=True)
for rank, (movieID, ratingCount) in enumerate(mostRatedFirst, start=1):
    rankings[movieID] = rank

In [6]:
# Two-way lookup between movie ids and movie titles, read from the movies file
movieID_to_name = {}
name_to_movieID = {}

with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # skip the header row
    for row in movieReader:
        # first column: movie id, second column: title
        movieID, movieName = int(row[0]), row[1]
        movieID_to_name[movieID] = movieName
        # if a title occurs more than once, the last id read wins
        name_to_movieID[movieName] = movieID

In [18]:
def getYears():
    """Return a mapping of movie id -> release year parsed from the movie titles.

    Reads the CSV at the module-level `moviesPath` (first column: movie id,
    second column: title) and extracts a four-digit year written in parentheses
    at the very end of the title, ignoring trailing whitespace. Movies whose
    title has no such suffix are left out of the result.

    Returns:
        defaultdict(int): movie id -> year; looking up a missing id yields 0.
    """
    yearPattern = re.compile(r"(?:\((\d{4})\))?\s*$")
    yearByMovie = defaultdict(int)
    with open(moviesPath, newline='', encoding='ISO-8859-1') as handle:
        records = csv.reader(handle)
        next(records)  # skip the header row
        for record in records:
            movieID = int(record[0])
            # The pattern always matches (the year group is optional), so the
            # match object is never None; group(1) is None when no year is found.
            found = yearPattern.search(record[1]).group(1)
            if found:
                yearByMovie[movieID] = int(found)
    return yearByMovie

# function to get new movies
def getNewMovies():
    """Return the ids of all movies released in the newest year found in the data.

    Uses getYears() to obtain the movie id -> year mapping and prints the
    newest year that was found.

    Returns:
        list: movie ids whose year equals the newest year, in the iteration
        order of the mapping returned by getYears(). Empty when no movie has a
        parsable year (previously this case raised ValueError from max()).
    """
    years = getYears()
    if not years:
        # Nothing to pick from: max() on an empty sequence would raise
        return []
    # What's the newest year in our data?
    latestYear = max(years.values())
    print ("Newest year is ", latestYear)
    return [movieID for movieID, year in years.items() if year == latestYear]

In [21]:
import sys
import random
from operator import itemgetter
sys.path.append('..')
from Framework.EvaluationData import EvaluationData
from Framework.RecommenderMetrics import RecommenderMetrics
from surprise import KNNBasic
import heapq

In [10]:
# Wrap the filtered dataset and the popularity rankings in the evaluation helper
evalData = EvaluationData(ratingsDataset, rankings)

# Training set of the leave-one-out cross-validation split
trainSet = evalData.GetLOOCVTrainSet()

# Similarity settings for the KNN model: user-to-user cosine similarity
sim_options = dict(name='cosine', user_based=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [11]:
# Fit a basic KNN model on the leave-one-out training set
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
# fit() already builds the similarity matrix and stores it on the model, so
# reuse it rather than calling compute_similarities() a second time (the
# previous version computed the full matrix twice).
simsMatrix = model.sim

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
# Zero-based position in each user's recommendation list that is handed over
# to a randomly chosen new movie
explorationSlot = 9

# Ratings held out by the leave-one-out split, used to score the recommendations
leftOutTestSet = evalData.GetLOOCVTestSet()

# New movies that need data (movies from the newest year in the catalogue)
newMovies = getNewMovies()

Newest year is  2018


In [22]:
# Build up recommendations: for every user, score the items rated by their k
# most similar users and keep the best-scoring items the user has not rated,
# with one slot reserved for a random new movie.
# NOTE(review): random.choice is unseeded, so the exploration picks (and any
# metric computed from topN) can differ from run to run.
topN = defaultdict(list)
k = 10
for uiid in range(trainSet.n_users):
    # Similarity of the current user to every other user (self excluded)
    similarityRow = simsMatrix[uiid]
    similarUsers = [
        (innerID, score)
        for innerID, score in enumerate(similarityRow)
        if innerID != uiid
    ]

    # The k users most similar to the current one
    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Add up the neighbours' ratings per item, normalised by 5 and weighted by
    # how similar each neighbour is to the current user
    candidates = defaultdict(float)
    for innerID, userSimilarityScore in kNeighbors:
        for itemID, rating in trainSet.ur[innerID]:
            candidates[itemID] += (rating / 5.0) * userSimilarityScore

    # Items the current user has already rated (only membership is needed)
    watched = {itemID for itemID, _rating in trainSet.ur[uiid]}

    # The raw user id does not change per item, so convert it once
    rawUserID = int(trainSet.to_raw_uid(uiid))

    # Walk the candidates from best to worst score, skipping watched items
    pos = 0
    for itemID, _ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID in watched:
            continue
        if pos == explorationSlot and newMovies:
            # Exploration slot: swap in a random new movie. When there are no
            # new movies, fall back to the regular candidate instead of
            # crashing in random.choice.
            movieID = random.choice(newMovies)
        else:
            movieID = trainSet.to_raw_iid(itemID)
        topN[rawUserID].append((int(movieID), 0.0))
        pos += 1
        # NOTE(review): this stops after 41 entries (positions 0..40); confirm
        # whether a top-40 list was intended.
        if pos > 40:
            break

In [23]:
# Hit rate of the recommendations against the left-out test ratings
hitRate = RecommenderMetrics.HitRate(topN, leftOutTestSet)
print("HR: ", hitRate)

HR:  0.0587248322147651
