# A class for accessing data

This class could live in a separate module, but for now it is convenient to define it here.

In [1]:
import os
import csv
import sys
import re
import heapq
from collections import defaultdict
from operator import itemgetter
from surprise import Dataset, Reader, SVD, accuracy, KNNBasic

In [2]:
class MovieLens:
    """Loads the ratings file into a Surprise dataset and keeps movie ID <-> title lookups.

    Attributes:
        movieID_to_name: maps an integer movie ID to its title.
        name_to_movieID: maps a movie title to its integer movie ID.
        ratingsPath: path of the ratings CSV handed to Surprise.
        moviesPath: path of the movies CSV (first column ID, second column title).
    """

    # Class-level defaults; loadMovieLensLatestSmall() replaces both dicts with
    # fresh per-instance ones, so lookups before loading simply find nothing.
    movieID_to_name = {}
    name_to_movieID = {}
    ratingsPath = 'ratings ut.csv'
    moviesPath = 'movies.csv'

    def loadMovieLensLatestSmall(self):
        """Read the ratings and movie files and return the ratings dataset.

        Side effects: may change the process working directory (see below) and
        rebuilds ``movieID_to_name`` / ``name_to_movieID`` on this instance.

        Returns:
            The object returned by ``Dataset.load_from_file`` for ``ratingsPath``.
        """
        # Look for files relative to the directory we are running from.
        # dirname() is '' when argv[0] has no directory component, and
        # os.chdir('') raises FileNotFoundError, so only chdir when there is
        # an actual directory to move to.
        runDir = os.path.dirname(sys.argv[0])
        if runDir:
            os.chdir(runDir)

        self.movieID_to_name = {}
        self.name_to_movieID = {}

        # The ratings file is comma separated with one header line.
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
            movieReader = csv.reader(csvfile)
            next(movieReader, None)  # Skip header line (tolerate an empty file)
            for row in movieReader:
                movieID = int(row[0])
                movieName = row[1]
                self.movieID_to_name[movieID] = movieName
                self.name_to_movieID[movieName] = movieID

        return ratingsDataset

    def getMovieName(self, movieID):
        """Return the title for ``movieID``, or an empty string if it is unknown."""
        return self.movieID_to_name.get(movieID, "")

    def getMovieID(self, movieName):
        """Return the movie ID for ``movieName``, or 0 if it is unknown."""
        return self.name_to_movieID.get(movieName, 0)

# Data, parameters and others

Reading data

In [3]:
# Create the loader and read the ratings; loading also fills the movie
# ID <-> title lookup tables on `ml`.
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()

Build the training set (all ratings are used; there is no train/test split)

In [4]:
# Build the training set from the whole dataset; nothing is held out for testing.
trainSet = data.build_full_trainset()

# Model user-based

In [5]:
# User-based KNN with cosine similarity between users.
model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
# fit() already computes the similarity matrix and stores it on the model as
# `sim`, so reuse it instead of calling compute_similarities() a second time
# (which recomputed the whole matrix and printed the progress message twice).
model.fit(trainSet)
simsMatrix = model.sim

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [6]:
# Inspect the similarity matrix (the repr elides the middle of a large array).
simsMatrix

array([[1.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 1.        , 0.95561425, ..., 0.89144204, 0.97993672,
        0.        ],
       [0.        , 0.95561425, 1.        , ..., 0.94592126, 0.98448284,
        1.        ],
       ...,
       [0.        , 0.89144204, 0.94592126, ..., 1.        , 0.96183401,
        0.93334561],
       [1.        , 0.97993672, 0.98448284, ..., 0.96183401, 1.        ,
        0.96388092],
       [0.        , 0.        , 1.        , ..., 0.93334561, 0.96388092,
        1.        ]])

In [86]:
# Raw ID of the user to recommend for, as a string; it is converted to an
# inner ID with trainSet.to_inner_uid() in a later cell.
testSubject = '672'
# Neighbourhood size. NOTE(review): no active code in the visible cells reads
# `k`; the neighbours are currently chosen by a similarity threshold instead.
k = 10

Find the similarity row for our user

In [87]:
# Get top N similar users to our test subject
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

Add them to a list

In [88]:
# Pair every other user's inner ID with its similarity to the test user,
# leaving out the test user itself.
similarUsers = [
    (otherInnerID, similarity)
    for otherInnerID, similarity in enumerate(similarityRow)
    if otherInnerID != testUserInnerID
]

Keep only the users whose similarity to our user exceeds a threshold

In [89]:
# Keep every (innerID, score) pair whose similarity score is above 0.97.
# Alternative, for a fixed-size neighbourhood instead of a threshold:
#   kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
kNeighbors = [neighbor for neighbor in similarUsers if neighbor[1] > 0.97]

Select item candidates 

In [90]:
# Score every item rated by a neighbour: each rating is divided by 5.0 and
# weighted by that neighbour's similarity to the test user, then summed per item.
# `candidates` was never initialised in the notebook, so on a fresh kernel this
# cell raised NameError; start from an empty float accumulator.
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        # rating is an (inner item ID, rating value) pair
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

Items the user has already seen

In [91]:
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1

## Generate top-N

In [92]:
# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(ml.getMovieName(int(movieID)))#, ratingSum)
        pos += 1
        if (pos > 10):
            break

Forrest Gump (1994)
Pulp Fiction (1994)
Shawshank Redemption, The (1994)
Star Wars: Episode V - The Empire Strikes Back (1980)
Silence of the Lambs, The (1991)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Schindler's List (1993)
Back to the Future (1985)
Terminator 2: Judgment Day (1991)
Godfather, The (1972)
Jurassic Park (1993)
