# Collaborative Filtering Item Based

In [1]:
import csv
import heapq
from surprise import KNNBasic

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
from operator import itemgetter

## Load data

In [2]:
# define some paths to download data
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

In [3]:
# define reader instance to download data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# download dataset to path
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)
ratingsDataset = ratingsDataset.build_full_trainset()

In [4]:
# read movies info
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  #Skip header line
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [5]:
# define function to get movie name
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

In [6]:
# define similarity options and get similarity matrix
sim_options = {
    'name': 'cosine',
    # set to flase cause this one is item based
    'user_based': False
}

model = KNNBasic(sim_options=sim_options)
model.fit(ratingsDataset)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [7]:
# now get top-N recommendations for a test user
testSubject = '85'
k = 10

testUserInnerID = ratingsDataset.to_inner_uid(testSubject)
# get top K the test subject rated
testUserRatings = ratingsDataset.ur[testUserInnerID]
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

# get similar items to thing the test subject liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating/5.0)

In [8]:
# build a dict of stuff the test user already seen
watched = {}
for itemID, rating in ratingsDataset.ur[testUserInnerID]:
    watched[itemID] = 1

# get top-rated items from similar items
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = ratingsDataset.to_raw_iid(itemID)
        print(getMovieName(int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

Trust (1990) 8.993195870740706
Night Porter, The (Portiere di notte, Il) (1974) 8.991602228017534
Daytrippers, The (1996) 8.978892697759282
Living in Oblivion (1995) 8.973090760365785
Melvin and Howard (1980) 8.970142500145332
Hate (Haine, La) (1995) 8.967270202229166
Presidio, The (1988) 8.96429313204702
Stop Making Sense (1984) 8.954691793826552
Color Purple, The (1985) 8.954276360003217
Opposite of Sex, The (1998) 8.95170801390616
Clue (1985) 8.949644625955322


### Let's see how results change by defining a threshold on items rating

In [9]:
# take all items with a rate of 4.0 or greater
kNeighbors = []
for rating in testUserRatings:
    if rating[1] > 4.0:
        kNeighbors.append(rating)

# get similar items to thing the test subject liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating/5.0)

# get top-rated items from similar items
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = ratingsDataset.to_raw_iid(itemID)
        print(getMovieName(int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

Opposite of Sex, The (1998) 10.94172656584623
American in Paris, An (1951) 10.936427200000246
One Fine Day (1996) 10.934890689782453
Last Picture Show, The (1971) 10.931806486265977
Gilda (1946) 10.929912059633953
Dead Calm (1989) 10.916981536391042
Streetcar Named Desire, A (1951) 10.91594253451208
Five Easy Pieces (1970) 10.914690389589753
Out of the Past (1947) 10.910660929932783
Killer, The (Die xue shuang xiong) (1989) 10.90933848448976
Born Yesterday (1950) 10.909227888131605


## Evaluate approach

In [10]:
import sys
sys.path.append('..')
from Framework.EvaluationData import EvaluationData
from Framework.RecommenderMetrics import RecommenderMetrics

from surprise.model_selection import LeaveOneOut

In [11]:
# get rankings and ratings based on data
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(ratingsPath, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)
    for row in ratingReader:
        movieID = int(row[1])
        ratings[movieID] += 1
    rank = 1
    for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
        rankings[movieID] = rank
        rank +=1

# define evaluation data based on original data
evalData = EvaluationData(ratingsDataset, rankings)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [12]:
# train on leave-one-out
trainSet = evalData.GetLOOCVTrainSet()
sim_options = {
    'name': 'cosine',
    'user_based': False
}
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [13]:
leftOutTestSet = evalData.GetLOOCVTestSet()
# build dict to lists of (int(movieID), predictedrating) pairs
topN = defaultdict(list)
k = 10
for uiid in range(trainSet.n_users):
    # get top N similar users to test subject
    userRatings = trainSet.ur[uiid]
    kNeighbors = heapq.nlargest(k, userRatings, key=lambda t: t[1])
    # get candidates weighted by user rating
    candidates = defaultdict(float)
    for itemID, rating in kNeighbors:
        similarityRow = simsMatrix[itemID]
        for innerID, score in enumerate(similarityRow):
            candidates[innerID] += score * (rating / 5.0)
    
    # build a dict of stuff the user already watched
    watched = {}
    for itemID, rating in trainSet.ur[uiid]:
        watched[itemID] = 1
    
    # get top-rated items from similar users
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            topN[int(trainSet.to_raw_uid(uiid))].append((int(movieID), 0.0))
            pos += 1
            if (pos > 40):
                break

In [14]:
# print hit-rate or mesure of interest
print("hit-rate", RecommenderMetrics.HitRate(topN, leftOutTestSet))

hit-rate 0.0


Data could be driven to worse results based on item based approach. Even though this needs to be teste online with A/B test.