In [38]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from collections import defaultdict
from operator import itemgetter
import heapq

import os
import csv

In [39]:
# Load in the book ratings and return a dataset.
def load_dataset():
    reader = Reader(line_format='user item rating', sep=';', skip_lines=1)
    ratings_dataset = Dataset.load_from_file('./ratings_no_quotes_smallest.csv', reader=reader)

    # Lookup a book's name with it's bookID as key
    bookID_to_name = {}
    with open('./clubs_book.csv', newline='', encoding='Latin1') as csvfile:
            book_reader = csv.reader(csvfile)
            next(book_reader)
            for row in book_reader:
                bookID = int(row[0]) 
                book_name = row[1]
                bookID_to_name[bookID] = book_name
    # Return both the dataset and lookup dict in tuple
    return (ratings_dataset, bookID_to_name)

dataset, bookID_to_name = load_dataset()

# Build a full Surprise training set from dataset
trainset = dataset.build_full_trainset()

In [40]:
similarity_matrix = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })\
        .fit(trainset)\
        .compute_similarities()

similarity_matrix

Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1., nan, nan],
       [ 0.,  0.,  0., ..., nan,  1., nan],
       [ 0.,  0.,  0., ..., nan, nan,  1.]])

In [41]:
# Pick a random user ID, has to be a numeric string.
# Play around and see how the final recommendations change
# depending on the user!
test_subject = '276725'

# Get the top K items user rated
k = 10


In [42]:
# When using Surprise, there are RAW and INNER IDs.
# Raw IDs are the IDs, strings or numbers, you use when
# creating the trainset. The raw ID will be converted to
# an unique integer Surprise can more easily manipulate
# for computations.
#
# So in order to find an user inside the trainset, you
# need to convert their RAW ID to the INNER Id. Read
# here for more info https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
test_subject_iid = trainset.to_inner_uid(test_subject)

# Get the top K items we rated
test_subject_ratings = trainset.ur[test_subject_iid]
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])

In [43]:
# Default dict is basically a standard dictionary,
# the difference beeing that it doesn't throw an error
# when trying to access a key which does not exist,
# instead a new entry, with that key, is created.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue

In [44]:
# Utility we'll use later.
def getBookName(bookID):
  if (bookID) in bookID_to_name:
    return bookID_to_name[bookID]
  else:
      return ""

In [45]:
# Build a dictionary of books the user has read
read = {}
for itemID, rating in trainset.ur[test_subject_iid]:
  read[itemID] = 1

# Add items to list of user's recommendations
# If they are similar to their favorite books,
# AND have not already been read.
recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in read:
    recommendations.append(getBookName(itemID))
    position += 1
    if (position > 10): break # We only want top 10

for rec in recommendations:
  print("book: ", rec)

book:  0393310779
book:  0399136487
book:  1551051729
book:  1402201435
book:  0060926546
book:  0844239062
book:  0425068145
book:  0373078188
book:  0446604623
book:  1585671274
book:  006091498X
