In [89]:
# Libraries for data preparation & visualization
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

# pip install scikit-surprise
# Importing libraries for model building & evaluation
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import accuracy
import random

In [90]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df
# Use your path in local
book   = loaddata("../../BX-Books")
user   = loaddata("../../BX-Users")
rating = loaddata("../../BX-Book-Ratings")

In [91]:
rating_users = rating['User-ID'].value_counts().reset_index().\
               rename({'Index':'User-ID','User-ID':'Rating'}, axis=1)
rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'Index':'ISBN','ISBN':'Rating'}, axis=1)
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 100 ratings & books that have received at least 50 ratings

rating = rating[rating['User-ID'].isin(rating_users[rating_users['Rating']>=100]['index'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['Rating']>=50]['index'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
413,276925,002542730X,10
426,276925,0316666343,0
427,276925,0345391810,0
429,276925,0385504209,8
448,276925,0679745580,0
...,...,...,...
1149604,276680,0743486226,6
1149616,276680,0812969812,0
1149628,276680,1573222267,0
1149629,276680,1573229083,7


In [92]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
rating               

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276925,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
2,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
3,10030,002542730X,7,Politically Correct Bedtime Stories: Modern Ta...
4,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
118771,238781,0743406184,10,If Only It Were True
118772,246156,0743406184,0,If Only It Were True
118773,246617,0743406184,0,If Only It Were True
118774,274308,0743406184,0,If Only It Were True


In [93]:
ratings_explicit=rating[rating['Book-Rating']!=0]
ratings_implicit=rating[rating['Book-Rating']==0]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

(32125, 4)
(86651, 4)


In [94]:
# creating a surprise object

reader = Reader(rating_scale=(1, 10))
data_nonzero   = Dataset.load_from_df(ratings_explicit[['User-ID','ISBN','Book-Rating']], reader)
data  = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)


# Split the data into training & testing sets. Python's surprise documentation has the steps detailed out
# https://surprise.readthedocs.io/en/stable/FAQ.html

raw_ratings_nonzero = data_nonzero.raw_ratings
random.shuffle(raw_ratings_nonzero)                 # shuffle dataset

threshold   = int(len(raw_ratings_nonzero)*0.8)

train_raw_ratings = raw_ratings_nonzero[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings_nonzero[threshold:] # 20% of data is testset

data_nonzero.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data_nonzero.build_full_trainset() 
testset          = data_nonzero.construct_testset(test_raw_ratings)



In [95]:
# Trying KNN (K-Nearest Neighbors) with data

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [96]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.704184,3.388098,0.413353,2.334252
knns.KNNBaseline,2.715056,3.392585,0.735178,4.491177
knns.KNNWithZScore,2.68887,3.411428,0.621295,3.125594
knns.KNNBasic,2.819927,3.56894,0.382429,2.028248


In [97]:
# Trying KNN (K-Nearest Neighbors) with data nonzero

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data_nonzero, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [98]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,1.275442,1.717533,0.12449,0.232515
knns.KNNWithZScore,1.274988,1.728506,0.320817,0.322542
knns.KNNBaseline,1.30805,1.731568,0.091116,0.176107
knns.KNNBasic,1.435886,1.884052,0.062674,0.146085


In [99]:
# Hyperparameter tuning - KNNWithMeans with data_nonzero

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNWithMeans.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.2477344142760778

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.6757927514357989



In [100]:
# Hyperparameter tuning - KNNBasic

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNBasic.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNNBasic.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBasic.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBasic.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
MAE Best Score:       1.2455148592739367

RMSE Best Parameters: {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      1.694018748657123



In [101]:
# Hyperparameter tuning - KNNWithZScore

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNWithZScore, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.234916795291311

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.6600244966827442



In [102]:
# Hyperparameter tuning - KNNBaseLine

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNBaseline, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data_nonzero)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}
MAE Best Score:       1.1832349103678226

RMSE Best Parameters: {'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': False}}
RMSE Best Score:      1.5521472879073785



In [103]:
sim_options       = {'name':'cosine','min_support':1,'user_based':False}

# # We'll use the KNNWithMeans
algo = KNNWithMeans(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6721  1.6853  1.6911  1.6996  1.6945  1.6885  0.0094  
MAE (testset)     1.2511  1.2628  1.2632  1.2672  1.2701  1.2629  0.0065  
Fit time          0.51    0.41    0.31    0.30    0.36    0.38    0.08    
Test time         0.57    0.36    0.31    0.35    0.42    0.40    0.09    


{'test_rmse': array([1.67208105, 1.68530907, 1.69108683, 1.69958188, 1.69453005]),
 'test_mae': array([1.25112429, 1.26280903, 1.26316416, 1.26715816, 1.27008289]),
 'fit_time': (0.5065946578979492,
  0.40538501739501953,
  0.3063511848449707,
  0.30307674407958984,
  0.3615427017211914),
 'test_time': (0.5738024711608887,
  0.36075878143310547,
  0.30837392807006836,
  0.35179853439331055,
  0.4212770462036133)}

In [104]:
sim_options       = {'name':'pearson_baseline','min_support':5,'user_based':False}

# # We'll use the KNNBaseline
algo = KNNBaseline(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5339  1.5378  1.5451  1.5658  1.5830  1.5531  0.0186  
MAE (testset)     1.1708  1.1679  1.1818  1.1813  1.2140  1.1832  0.0164  
Fit time          0.39    0.48    0.46    0.63    0.52    0.49    0.08    
Test time         0.42    0.45    0.59    0.50    0.5

{'test_rmse': array([1.53392498, 1.53776159, 1.54508982, 1.56575544, 1.58302514]),
 'test_mae': array([1.17084367, 1.16790857, 1.1817965 , 1.1813213 , 1.21398481]),
 'fit_time': (0.38599348068237305,
  0.4778268337249756,
  0.4590017795562744,
  0.6307759284973145,
  0.5197701454162598),
 'test_time': (0.41881370544433594,
  0.4460623264312744,
  0.5936911106109619,
  0.4970061779022217,
  0.5285868644714355)}

In [105]:
sim_options       = {'name':'cosine','min_support':1,'user_based':False}

# # We'll use the KNNBasic
algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data_nonzero, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6990  1.6863  1.7281  1.6807  1.6968  1.6982  0.0164  
MAE (testset)     1.2495  1.2281  1.2648  1.2338  1.2607  1.2474  0.0144  
Fit time          0.50    0.41    0.41    0.38    0.39    0.42    0.04    
Test time         0.53    0.57    0.54    0.54    0.52    0.54    0.02    


{'test_rmse': array([1.69904603, 1.68628104, 1.72808602, 1.68068698, 1.6967573 ]),
 'test_mae': array([1.24946506, 1.22813823, 1.26483163, 1.23383546, 1.26074072]),
 'fit_time': (0.5019655227661133,
  0.4075937271118164,
  0.4054257869720459,
  0.3833637237548828,
  0.39145565032958984),
 'test_time': (0.5328042507171631,
  0.5725240707397461,
  0.5433354377746582,
  0.5399112701416016,
  0.5161399841308594)}

In [106]:
# Model fit & prediction - KNNBasic

sim_options = {'name':'cosine','min_support':1,'user_based':False}
final_model = KNNBasic(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.2070
RMSE: 1.6351
MAE: 1.2069549632455576, RMSE: 1.6350534087065443


In [107]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'cosine','min_support':1,'user_based':False}
final_model = KNNWithMeans(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.2205
RMSE: 1.6299
MAE: 1.220476817580134, RMSE: 1.6299293946102147


In [108]:
# Model fit & prediction - KNNBaseline

sim_options = {'name':'pearson_baseline','min_support':5,'user_based':False}
final_model = KNNBaseline(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.1711
RMSE: 1.5516
MAE: 1.1711008026549403, RMSE: 1.5516426687758436


In [109]:
# KNNBasic

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'cosine','min_support':1,'user_based':False}
    similarity_matrix = KNNBasic(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [110]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


['0312966970',
 '0440226430',
 '0842329129',
 '0451172817',
 '0440224764',
 '0312990456',
 '0671003755',
 '0515131229',
 '043935806X',
 '0553569910',
 '0385511612']

In [111]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0312966970,Four To Score (A Stephanie Plum Novel)
1,0440226430,Summer Sisters
2,0842329129,Left Behind: A Novel of the Earth's Last Days ...
3,0451172817,Needful Things
4,0440224764,The Partner
5,0312990456,One for the Money (A Stephanie Plum Novel)
6,0671003755,She's Come Undone (Oprah's Book Club (Paperback))
7,0515131229,Dance upon the Air (Three Sisters Island Trilogy)
8,043935806X,Harry Potter and the Order of the Phoenix (Boo...
9,0553569910,The Ugly Duckling


In [112]:
# KNNBaseline

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':5,'user_based':False}
    similarity_matrix = KNNBaseline(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [113]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['044021145X',
 '0312195516',
 '0439139600',
 '0385335881',
 '0440213525',
 '0515131229',
 '0515128554',
 '0142001740',
 '0440226430',
 '080410526X',
 '0425163407']

In [114]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,044021145X,The Firm
1,0312195516,The Red Tent (Bestselling Backlist)
2,0439139600,Harry Potter and the Goblet of Fire (Book 4)
3,0385335881,Shopaholic Takes Manhattan (Summer Display Opp...
4,0440213525,The Client
5,0515131229,Dance upon the Air (Three Sisters Island Trilogy)
6,0515128554,Heart of the Sea (Irish Trilogy)
7,0142001740,The Secret Life of Bees
8,0440226430,Summer Sisters
9,080410526X,All I Really Need to Know


In [115]:
# KNNWithMeans

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'cosine','min_support':1,'user_based':False}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [116]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


['0312966970',
 '0440226430',
 '0842329129',
 '0451172817',
 '0440224764',
 '0312990456',
 '0671003755',
 '0515131229',
 '043935806X',
 '0553569910',
 '0385511612']

In [117]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0312966970,Four To Score (A Stephanie Plum Novel)
1,0440226430,Summer Sisters
2,0842329129,Left Behind: A Novel of the Earth's Last Days ...
3,0451172817,Needful Things
4,0440224764,The Partner
5,0312990456,One for the Money (A Stephanie Plum Novel)
6,0671003755,She's Come Undone (Oprah's Book Club (Paperback))
7,0515131229,Dance upon the Air (Three Sisters Island Trilogy)
8,043935806X,Harry Potter and the Order of the Phoenix (Boo...
9,0553569910,The Ugly Duckling


# Using sklearn NearestNeighbor

In [118]:
ratings_matrix = ratings_explicit.pivot(index = 'User-ID', columns = 'ISBN', values = 'Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

(1684, 2160)


ISBN,000649840X,0007110928,002026478X,0020442203,002542730X,0028604199,006000438X,0060008032,0060008776,006001203X,...,1860492592,1878424319,1885171080,1931561648,3257228007,3257229534,3404148665,3423202327,3442541751,3492045170
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Importing libraries for model building & evaluation 
from sklearn.neighbors import NearestNeighbors
import sklearn
import re

In [120]:
# KNN Function
#setting global variables
global metric,k
k=10
global_metric='cosine'
global_algorithm = 'brute'

In [121]:
def findksimilarusers(user_id, ratings, metric=global_metric, algo=global_algorithm,k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = algo)
    model_knn.fit(ratings.values)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1 - distances.flatten()
    
    return similarities, indices

In [122]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = global_metric, algorithm = global_algorithm, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, algorithm, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc, :].mean() #to adjust for zero based indexing
    sum_wt = np.sum(abs(similarities))
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings
    #which are handled here as below
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [123]:
def recommendedItem(user_id, ratings, metric=global_metric):
    if (user_id not in ratings.index.values) or type(user_id) is not int:
        print("User id should be a valid integer from this list : \n\n {}".format(re.sub('[\[\]]', '', np.array_str(ratings_matrix.index.values))))
    else:
        prediction = []

        total = ratings.shape[1]
        for i in range(ratings.shape[1]):
            if (ratings[str(ratings.columns[i])][user_id] != 0): #not rated already
                print("\r{0:<0.2f}% Finished".format(i*100/total), end='')
                prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))
            else:                    
                prediction.append(-1) #for books that user already rated
#         else:
#             print("Item-based not implemented yet")
        print("\r100.00% Finished")
        
        prediction = pd.Series(prediction)
        prediction = prediction.sort_values(ascending = False)
        print("...")
        print(prediction)
        
        recommended = prediction[:10]
        print("\nFor the User-based ({0} - {1}) approach, the following books are recommended: \n".format(global_algorithm, global_metric))
        for i in range(len(recommended)):
            print("{0}. Rated {1}: {2}".format(i+1, prediction.values[i], book['Book-Title'][recommended.index[i]]))
            

In [124]:
global_algorithm = 'brute'
global_metric = 'cosine'
recommendedItem(13552, ratings_matrix)

100.00% Finished
...
1802    3
926     3
1773    3
1822    3
910     3
       ..
714    -1
713    -1
712    -1
711    -1
2159   -1
Length: 2160, dtype: int64

For the User-based (brute - cosine) approach, the following books are recommended: 

1. Rated 3: The Brethren
2. Rated 3: The Diary of Ellen Rimbauer: My Life at Rose Red
3. Rated 3: The Blackwater Lightship: A Novel
4. Rated 3: Atonement
5. Rated 3: Trial by Fire
6. Rated 3: Sunwing (Aladdin Fantasy)
7. Rated 3: Welcome to Higby : A Novel
8. Rated 2: All That Glitters (Avalon, 2)
9. Rated 2: The Grass Is Always Greener over the Septic Tank
10. Rated 2: The Wooden Nickel: A Novel


In [125]:
global_algorithm = 'brute'
global_metric = 'euclidean'
recommendedItem(13552, ratings_matrix)

100.00% Finished
...
1802    3
926     3
1773    3
1822    3
910     3
       ..
714    -1
713    -1
712    -1
711    -1
2159   -1
Length: 2160, dtype: int64

For the User-based (brute - euclidean) approach, the following books are recommended: 

1. Rated 3: The Brethren
2. Rated 3: The Diary of Ellen Rimbauer: My Life at Rose Red
3. Rated 3: The Blackwater Lightship: A Novel
4. Rated 3: Atonement
5. Rated 3: Trial by Fire
6. Rated 3: Sunwing (Aladdin Fantasy)
7. Rated 3: Welcome to Higby : A Novel
8. Rated 2: All That Glitters (Avalon, 2)
9. Rated 2: The Grass Is Always Greener over the Septic Tank
10. Rated 2: The Wooden Nickel: A Novel


In [126]:
global_algorithm = 'brute'
global_metric = 'correlation'
recommendedItem(13552, ratings_matrix)

100.00% Finished
...
1802    3
926     3
1773    3
1822    3
910     3
       ..
714    -1
713    -1
712    -1
711    -1
2159   -1
Length: 2160, dtype: int64

For the User-based (brute - correlation) approach, the following books are recommended: 

1. Rated 3: The Brethren
2. Rated 3: The Diary of Ellen Rimbauer: My Life at Rose Red
3. Rated 3: The Blackwater Lightship: A Novel
4. Rated 3: Atonement
5. Rated 3: Trial by Fire
6. Rated 3: Sunwing (Aladdin Fantasy)
7. Rated 3: Welcome to Higby : A Novel
8. Rated 2: All That Glitters (Avalon, 2)
9. Rated 2: The Grass Is Always Greener over the Septic Tank
10. Rated 2: The Wooden Nickel: A Novel
