# Item based collaborative filtering with KNN

## load package

In [1]:
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from utils.MovieLens import MovieLens

## read data

In [2]:
testSubject = '85'
k = 10

ml = MovieLens()
data = ml.loadMovieLensLatestSmall()

trainSet = data.build_full_trainset()

## Item based collaborative filtering with KNN

In [3]:
sim_options = {'name': 'cosine',
               'user_based': False
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [4]:
# item - item similarity
simsMatrix.shape

(9066, 9066)

In [None]:
testUserInnerID = trainSet.to_inner_uid(testSubject)

## method 1: weighted average rating without neighbour rating filtering

In [5]:
# Get the top K items we rated
testUserRatings = trainSet.ur[testUserInnerID]
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

In [6]:
# Get similar items to stuff we liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating / 5.0)

In [7]:
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1
    
# Get top-rated items:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(ml.getMovieName(int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

James Dean Story, The (1957) 10.0
Get Real (1998) 9.987241120712646
Kiss of Death (1995) 9.966881877751941
Set It Off (1996) 9.963732215657119
How Green Was My Valley (1941) 9.943984081065269
Amos & Andrew (1993) 9.93973694500253
My Crazy Life (Mi vida loca) (1993) 9.938290487546041
Grace of My Heart (1996) 9.926255896645218
Fanny and Alexander (Fanny och Alexander) (1982) 9.925699671455906
Wild Reeds (Les roseaux sauvages) (1994) 9.916226404418774
Edge of Seventeen (1998) 9.913028764691676


## method 2: weighted average rating with neighbour rating filtering

In [8]:
testUserRatings = trainSet.ur[testUserInnerID]

kNeighbors = []
for rating in testUserRatings:
    if rating[1] > 4.0: # only keep high rating items from neighbours
        kNeighbors.append(rating) 

In [11]:
len(kNeighbors)

17

In [9]:
# Get similar items to stuff we liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating / 5.0)

In [10]:
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1
    
# Get top-rated items:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(ml.getMovieName(int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

Kiss of Death (1995) 16.910437073265502
Amos & Andrew (1993) 16.861270021975354
Edge of Seventeen (1998) 16.853845983977223
Get Real (1998) 16.840092759084882
Grace of My Heart (1996) 16.83866418909583
Relax... It's Just Sex (1998) 16.825893097731395
My Crazy Life (Mi vida loca) (1993) 16.825163372963015
Set It Off (1996) 16.820045947032426
Bean (1997) 16.81043113102984
Joe's Apartment (1996) 16.804698282071367
Lost & Found (1999) 16.78956315445952
