# Collaborative Filtering User Based

In [4]:
import heapq
from surprise import KNNBasic

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
from operator import itemgetter

## Load data

In [2]:
# define some paths to download data
ratingsPath = "../data/ml-latest-small/ratings.csv"
moviesPath = "../data/ml-latest-small/movies.csv"

In [3]:
# define reader instance to download data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# download dataset to path
ratingsDataset = Dataset.load_from_file(ratingsPath, reader=reader)
ratingsDataset = ratingsDataset.build_full_trainset()

In [9]:
# read movies info
movieID_to_name = {}
name_to_movieID = {}
with open(moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  #Skip header line
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        name_to_movieID[movieName] = movieID

In [10]:
# define function to get movie name
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""

In [5]:
# define similarity options and get similarity matrix
sim_options = {
    'name': 'cosine',
    'user_based': True
}

model = KNNBasic(sim_options=sim_options)
model.fit(ratingsDataset)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [7]:
# now get top-N recommendations for a test user
testSubject = '85'
k = 10

testUserInnerID = ratingsDataset.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

# loop over all users and start appending ids and scores
similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        similarUsers.append((innerID, score))

# get neighbors based on defined k
kNeighbors = heapq.nlargest(k, similarUsers, key= lambda t: t[1])

In [8]:
# get what users rated, and add up ratings for each item, weight by user similarity
candidates = defaultdict(float)
for similarUsers in kNeighbors:
    innerID = similarUsers[0]
    userSimilarityScore = similarUsers[1]
    theirRatings = ratingsDataset.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1]/0.5) * userSimilarityScore

In [11]:
# build a dictionary of what the test subject has already seen
watched = {}
for itemID, rating in ratingsDataset.ur[testUserInnerID]:
    watched[itemID] = 1

# get top-rated items from similar users
# making exceptions on what the user already watched
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = ratingsDataset.to_raw_iid(itemID)
        print(getMovieName(int(movieID)), ratingSum)
        # this is just a counter to know how many recomendations we want to get
        pos += 1
        if (pos>10):
            break

Gladiator (2000) 46.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 45.0
Forrest Gump (1994) 41.0
Shawshank Redemption, The (1994) 41.0
Pulp Fiction (1994) 36.0
Matrix, The (1999) 36.0
Seven (a.k.a. Se7en) (1995) 35.0
Groundhog Day (1993) 35.0
Silence of the Lambs, The (1991) 33.0
Saving Private Ryan (1998) 33.0
Inception (2010) 33.0
