In [1]:
# Libraries for data preparation & visualization
import numpy as np
import pandas as pd

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

# pip install scikit-surprise
# Importing libraries for model building & evaluation
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import accuracy
import random

In [2]:
import os
os.getcwd()

'C:\\Users\\Legion\\Downloads\\b\\1INTRO2AI\\GithubDesktop\\book-rec\\ngan'

In [3]:
book_path = "../dataset/BX-Books"

In [4]:
# Fix the &amp; and other HTML escape sequences
import html

raw_book_path = book_path
book_path = "../dataset/BX-Books-HTMLfixed"
with open(f'{raw_book_path}.csv', 'r', encoding='latin-1') as f, open(f'{book_path}.csv', 'w') as g:
    content = html.unescape(f.read())
    g.write(content)

In [5]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';', encoding='latin-1', escapechar='\\',\
                    error_bad_lines=False, warn_bad_lines=False,)
    return df

# Use your path in local
book   = loaddata(book_path)
user   = loaddata("../dataset/BX-Users")
rating = loaddata("../dataset/BX-Book-Ratings")

In [6]:
rating_users = rating['User-ID'].value_counts().reset_index().\
               rename({'Index':'User-ID','User-ID':'Rating'}, axis=1)
rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'Index':'ISBN','ISBN':'Rating'}, axis=1)
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 100 ratings & books that have received at least 50 ratings

rating = rating[rating['User-ID'].isin(rating_users[rating_users['Rating']>=100]['index'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['Rating']>=50]['index'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
413,276925,002542730X,10
426,276925,0316666343,0
427,276925,0345391810,0
429,276925,0385504209,8
448,276925,0679745580,0
...,...,...,...
1149604,276680,0743486226,6
1149616,276680,0812969812,0
1149628,276680,1573222267,0
1149629,276680,1573229083,7


In [7]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
rating               

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276925,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
2,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
3,10030,002542730X,7,Politically Correct Bedtime Stories: Modern Ta...
4,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
118771,238781,0743406184,10,If Only It Were True
118772,246156,0743406184,0,If Only It Were True
118773,246617,0743406184,0,If Only It Were True
118774,274308,0743406184,0,If Only It Were True


In [8]:
ratings_explicit=rating[rating['Book-Rating']!=0]
ratings_implicit=rating[rating['Book-Rating']==0]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

(32125, 4)
(86651, 4)


In [9]:
# creating a surprise object

reader = Reader(rating_scale=(1, 10))
data_nonzero   = Dataset.load_from_df(ratings_explicit[['User-ID','ISBN','Book-Rating']], reader)
data  = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)

# Split the data into training & testing sets. 

raw_ratings_nonzero = data_nonzero.raw_ratings
random.shuffle(raw_ratings_nonzero)                 # shuffle dataset

threshold   = int(len(raw_ratings_nonzero)*0.8)

train_raw_ratings = raw_ratings_nonzero[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings_nonzero[threshold:] # 20% of data is testset

data_nonzero.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data_nonzero.build_full_trainset() 
testset          = data_nonzero.construct_testset(test_raw_ratings)


In [10]:
# Trying KNN (K-Nearest Neighbors) with data

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [11]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.702589,3.384237,0.885369,1.478405
knns.KNNBaseline,2.708998,3.384759,1.05996,1.758614
knns.KNNWithZScore,2.680494,3.402472,0.91891,1.526584
knns.KNNBasic,2.818774,3.568546,0.728536,1.385616


In [12]:
# Trying KNN (K-Nearest Neighbors) with nonzero rating data

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data_nonzero, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [13]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithZScore,1.276195,1.732035,0.165443,0.108825
knns.KNNWithMeans,1.291869,1.734978,0.100701,0.097673
knns.KNNBaseline,1.313658,1.73644,0.111025,0.145468
knns.KNNBasic,1.44987,1.896146,0.095221,0.09162


In [14]:
# Hyperparameter tuning - KNNBasic with data_nonzero

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['mae', 'rmse'], cv=5, n_jobs=-1)
                                    
gridsearchKNNBasic.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNBasic.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBasic.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBasic.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
MAE Best Score:       1.2512630283201538

RMSE Best Parameters: {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      1.7042492584577709



In [15]:
# Hyperparameter tuning - KNNWithMeans with data_nonzero

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['mae', 'rmse'], cv=5, n_jobs=-1)
                                    
gridsearchKNNWithMeans.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.2526812726387833

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.6831247699122418



In [16]:
# Hyperparameter tuning - KNNWithZScore

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNZScore = GridSearchCV(KNNWithZScore, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNZScore.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNZScore.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNZScore.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNZScore.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNZScore.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.238699406540445

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.666235025531833



In [17]:
# Hyperparameter tuning - KNNBaseLine

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBaseLine = GridSearchCV(KNNBaseline, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNBaseLine.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNBaseLine.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBaseLine.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBaseLine.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBaseLine.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}
MAE Best Score:       1.1902293965587902

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}
RMSE Best Score:      1.556415089369489



In [18]:
best_options       = gridsearchKNNWithMeans.best_params["rmse"]["sim_options"]
print(best_options)

# # We'll use the KNNWithMeans
algo = KNNWithMeans(sim_options=best_options)

# Run 5-fold cross-validation and print results
CV_score = cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

{'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6807  1.6740  1.6846  1.6737  1.7066  1.6839  0.0121  
MAE (testset)     1.2484  1.2342  1.2540  1.2382  1.2704  1.2490  0.0128  
Fit time          0.13    0.17    0.13    0.13    0.13    0.

In [19]:
best_options       = gridsearchKNNBaseLine.best_params["rmse"]["sim_options"]
print(best_options)

# # We'll use the KNNBaseline
algo = KNNBaseline(sim_options=best_options)

# Run 5-fold cross-validation and print results
CV_score = cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

{'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5470  1.5490  1.5462  1.5537  1.5631  1.5518  0.0062  
MAE (testset)     1.1902  1.1876  1.1821  1.1865  1.1902  1.1873  0.0030  
Fit time          0.27    0.22    0.23    0.26    0.21    0.

In [20]:
best_options       = gridsearchKNNBasic.best_params["rmse"]["sim_options"]
print(best_options)

# # We'll use the KNNBasic
algo = KNNBasic(sim_options=best_options)

# Run 5-fold cross-validation and print results
CV_score = cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

{'name': 'cosine', 'min_support': 1, 'user_based': False}
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6684  1.6911  1.6908  1.7289  1.6898  1.6938  0.0195  
MAE (testset)     1.2329  1.2459  1.2461  1.2511  1.2455  1.2443  0.0060  
Fit time          0.20    0.19    0.24    0.19    0.24    0.21    0.02    
Test time         0.14    0.14    0.15    0.14    0.16    0.15    0.01    


In [21]:
# Model fit & prediction - KNNWithMeans

best_withmeans_sim_options = gridsearchKNNWithMeans.best_params["rmse"]["sim_options"]
final_model = KNNWithMeans(sim_options=best_withmeans_sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.2859
RMSE: 1.7228
MAE: 1.2859322810168499, RMSE: 1.7227735140051783


In [22]:
# Model fit & prediction - KNNBaseline
# Best options for this model
best_baseline_sim_options = gridsearchKNNBaseLine.best_params["rmse"]["sim_options"]
final_model = KNNBaseline(sim_options=best_baseline_sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.1593
RMSE: 1.5434
MAE: 1.1593353731875786, RMSE: 1.543406282842105


In [23]:
# Model fit & prediction - KNNBasic

best_basic_sim_options = gridsearchKNNBasic.best_params["rmse"]["sim_options"]
final_model = KNNBasic(sim_options=best_basic_sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.2179
RMSE: 1.6488
MAE: 1.2178757544896452, RMSE: 1.6487990545636337


<p>We can see that KNNBaseline performs best.</p>

In [24]:
def generate_recommendationsKNN(similarity_matrix, userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [25]:
def printRecommendations(similarity_matrix, user_id, k, get_top):
    recommendationsKNN = generate_recommendationsKNN(similarity_matrix, userID=user_id, like_recommend=k, get_recommend=get_top)

    print("\nRecommended Books for user {0} (item-based):".format(user_id))
    red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
    red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
    return red_

In [26]:
# Set the user and other values
base_user_id = 13552
base_k = 40
base_recommend = 10

In [27]:
# Compute item based similarity matrix
# KNNBasic
best_basic_sim_options['user_based'] = False
similarities = KNNBasic(sim_options = best_basic_sim_options).fit(trainset).\
                    compute_similarities() 

printRecommendations(similarities, base_user_id, base_k, base_recommend)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.

Recommended Books for user 13552 (item-based):


Unnamed: 0,ISBN,Book-Title
0,0316666343,The Lovely Bones: A Novel
1,0440211727,A Time to Kill
2,044021145X,The Firm
3,0345337662,Interview with the Vampire
4,0440220602,The Chamber
5,0312966091,Three To Get Deadly : A Stephanie Plum Novel (...
6,0671793489,All Around the Town
7,0439064872,Harry Potter and the Chamber of Secrets (Book 2)
8,0553213148,Anne of Avonlea (Anne of Green Gables Novels (...
9,0439136369,Harry Potter and the Prisoner of Azkaban (Book 3)


In [28]:
# Compute item based similarity matrix
# KNNWithMeans
best_withmeans_sim_options['user_based'] = False
similarities = KNNWithMeans(sim_options = best_withmeans_sim_options).fit(trainset).\
                    compute_similarities()

printRecommendations(similarities, base_user_id, base_k, base_recommend)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Recommended Books for user 13552 (item-based):


Unnamed: 0,ISBN,Book-Title
0,044021145X,The Firm
1,0440241073,The Summons
2,0446364193,Along Came a Spider (Alex Cross Novels)
3,0451172817,Needful Things
4,0440236673,The Brethren
5,0440211727,A Time to Kill
6,0345370775,Jurassic Park
7,0440213525,The Client
8,051513287X,Face the Fire (Three Sisters Island Trilogy)
9,0440220602,The Chamber


In [29]:
# Compute item based similarity matrix
# KNNBaseline
best_baseline_sim_options['user_based'] = False
similarities = KNNBaseline(sim_options = best_baseline_sim_options).fit(trainset).\
                    compute_similarities() 

printRecommendations(similarities, base_user_id, base_k, base_recommend)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Recommended Books for user 13552 (item-based):


Unnamed: 0,ISBN,Book-Title
0,044021145X,The Firm
1,0440241073,The Summons
2,0446364193,Along Came a Spider (Alex Cross Novels)
3,0451172817,Needful Things
4,0440236673,The Brethren
5,0440211727,A Time to Kill
6,0345370775,Jurassic Park
7,0440213525,The Client
8,051513287X,Face the Fire (Three Sisters Island Trilogy)
9,0440220602,The Chamber


# Using sklearn NearestNeighbor

In [None]:
ratings_matrix = ratings_explicit.pivot(index = 'User-ID', columns = 'ISBN', values = 'Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

In [None]:
# Importing libraries for model building & evaluation 
from sklearn.neighbors import NearestNeighbors
import sklearn
import re

In [None]:
# KNN Function
#setting global variables
global metric,k
k=10
global_metric='cosine'
global_algorithm = 'brute'

In [None]:
def findksimilarusers(user_id, ratings, metric=global_metric, algo=global_algorithm,k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = algo)
    model_knn.fit(ratings.values)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1 - distances.flatten()
    
    return similarities, indices

In [None]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = global_metric, algorithm = global_algorithm, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, algorithm, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc, :].mean() #to adjust for zero based indexing
    sum_wt = np.sum(abs(similarities))
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings
    #which are handled here as below
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [None]:
def recommendedItem(user_id, ratings, metric=global_metric):
    if (user_id not in ratings.index.values) or type(user_id) is not int:
        print("User id should be a valid integer from this list : \n\n {}".format(re.sub('[\[\]]', '', np.array_str(ratings_matrix.index.values))))
    else:
        prediction = []

        total = ratings.shape[1]
        for i in range(ratings.shape[1]):
            if (ratings[str(ratings.columns[i])][user_id] != 0): #not rated already
                print("\r{0:<0.2f}% Finished".format(i*100/total), end='')
                prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))
            else:                    
                prediction.append(-1) #for books that user already rated
#         else:
#             print("Item-based not implemented yet")
        print("\r100.00% Finished")
        
        prediction = pd.Series(prediction)
        prediction = prediction.sort_values(ascending = False)
        print("...")
        print(prediction)
        
        recommended = prediction[:10]
        print("\nFor the User-based ({0} - {1}) approach, the following books are recommended: \n".format(global_algorithm, global_metric))
        for i in range(len(recommended)):
            print("{0}. Rated {1}: {2}".format(i+1, prediction.values[i], book['Book-Title'][recommended.index[i]]))
            

In [None]:
global_algorithm = 'brute'
global_metric = 'cosine'
recommendedItem(13552, ratings_matrix)

In [None]:
global_algorithm = 'brute'
global_metric = 'euclidean'
recommendedItem(13552, ratings_matrix)

In [None]:
global_algorithm = 'brute'
global_metric = 'correlation'
recommendedItem(13552, ratings_matrix)