In [71]:
# Libraries for data preparation & visualization
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

# pip install scikit-surprise
# Importing libraries for model building & evaluation
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import accuracy
import random

In [72]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("../AIproject/BX-Books")
user   = loaddata("../AIproject/BX-Users")
rating = loaddata("../AIproject/BX-Book-Ratings")

In [73]:
rating_users = rating['User-ID'].value_counts().reset_index().\
               rename({'Index':'User-ID','User-ID':'Rating'}, axis=1)
rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'Index':'ISBN','ISBN':'Rating'}, axis=1)
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 250 ratings & books that have received at least 50 ratings

rating = rating[rating['User-ID'].isin(rating_users[rating_users['Rating']>=250]['index'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['Rating']>=50]['index'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1468,277427,006092988X,0
1469,277427,0060930535,0
1470,277427,0060932139,0
1471,277427,0060934417,0
...,...,...,...
1147440,275970,1400031354,0
1147441,275970,1400031362,0
1147470,275970,1558744606,0
1147517,275970,1573229725,0


In [74]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
rating    

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
80889,234828,0345333926,8,Ringworld
80890,236283,0345333926,0,Ringworld
80891,249628,0345333926,0,Ringworld
80892,261829,0345333926,0,Ringworld


# Using surprise for data with zeros

In [75]:
# creating a surprise object

reader = Reader(rating_scale=(0, 10))
# data_nonzero   = Dataset.load_from_df(ratings_explicit[['User-ID','ISBN','Book-Rating']], reader)
data  = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)


# Split the data into training & testing sets. Python's surprise documentation has the steps detailed out
# https://surprise.readthedocs.io/en/stable/FAQ.html

raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                 # shuffle dataset

threshold   = int(len(raw_ratings)*0.8)

train_raw_ratings = raw_ratings[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings[threshold:] # 20% of data is testset

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data.build_full_trainset() 
testset          = data.construct_testset(test_raw_ratings)


In [76]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [77]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.349439,3.296574,0.26517,1.346878
knns.KNNBaseline,2.358011,3.311345,0.174383,1.652564
knns.KNNWithZScore,2.328952,3.335546,0.252259,1.284944
knns.KNNBasic,2.450135,3.519639,0.195459,1.521207


In [78]:
ratings_explicit=rating[rating['Book-Rating']!=0]
ratings_implicit=rating[rating['Book-Rating']==0]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

(18651, 4)
(62243, 4)


In [79]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNWithMeans.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.328159523296675

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.2297379861789617



In [80]:
# Hyperparameter tuning - KNNBasic

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNBasic.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNBasic.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBasic.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBasic.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.2900785392890084

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.2067294995515474



In [81]:
# Hyperparameter tuning - KNNWithZScore

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNWithZScore, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.2949738774729864

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.24857409478862



In [82]:
# Hyperparameter tuning - KNNBaseLine

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNBaseline, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.2992449803906014

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.190976980282428



In [83]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use the famous SVD algorithm.
algo = KNNBaseline(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.1966  3.2084  3.2088  3.1815  3.1867  3.1964  0.0111  
MAE (testset)     2.3048  2.3030  2.3164  2.2998  2.2869  2.3022  0.0095  
Fit time          1.20    1.26    1.16    1.12    1.15    1.18    0.05    
Test time         2.69    2.31    2.53    2.44    2.5

{'test_rmse': array([3.19657797, 3.20841191, 3.20876416, 3.18147694, 3.18665873]),
 'test_mae': array([2.30483466, 2.30301612, 2.31637308, 2.2998013 , 2.28685835]),
 'fit_time': (1.2038657665252686,
  1.2563040256500244,
  1.1595892906188965,
  1.1202661991119385,
  1.1486485004425049),
 'test_time': (2.6873462200164795,
  2.3077168464660645,
  2.5339362621307373,
  2.4368560314178467,
  2.569251775741577)}

In [84]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use the famous SVD algorithm.
algo = KNNWithZScore(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2376  3.2940  3.2159  3.2537  3.2768  3.2556  0.0277  
MAE (testset)     2.3245  2.3495  2.3082  2.3387  2.3512  2.3344  0.0162  
Fit time          1.31    1.38    1.29    1.33    1.42    1.35    0.05    
Test time         2.41    2.33    2.45    2.34    2

{'test_rmse': array([3.23758681, 3.29400597, 3.21589022, 3.25367234, 3.27684298]),
 'test_mae': array([2.32454364, 2.34945436, 2.30815437, 2.33871177, 2.35122158]),
 'fit_time': (1.312831163406372,
  1.3754403591156006,
  1.291945457458496,
  1.3294925689697266,
  1.4206109046936035),
 'test_time': (2.414608955383301,
  2.329281806945801,
  2.447754383087158,
  2.342407464981079,
  2.1940677165985107)}

In [85]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
# We'll use the famous SVD algorithm.
algo = KNNWithMeans(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2387  3.2343  3.2328  3.1973  3.2797  3.2366  0.0262  
MAE (testset)     2.3609  2.3454  2.3463  2.3214  2.3860  2.3520  0.0212  
Fit time          1.32    1.23    1.31    1.21    1.28    1.27    0.04    
Test time         2.24    2.22    2.38    2.30    2.

{'test_rmse': array([3.2386903 , 3.23432256, 3.23277135, 3.19730332, 3.27970771]),
 'test_mae': array([2.3608897 , 2.34538587, 2.34627883, 2.32144497, 2.38600018]),
 'fit_time': (1.3231263160705566,
  1.233036994934082,
  1.3136212825775146,
  1.213925838470459,
  1.2805283069610596),
 'test_time': (2.242658853530884,
  2.215169906616211,
  2.3848981857299805,
  2.297032117843628,
  2.1805920600891113)}

In [86]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
# We'll use the famous SVD algorithm.
algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.1807  3.2029  3.2216  3.2506  3.2046  3.2121  0.0232  
MAE (testset)     2.2639  2.2765  2.2946  2.3322  2.2934  2.2921  0.0230  
Fit time          1.14    1.34    1.21    1.27    1.25    1.24    0.07    
Test time         2.24    2.02    2.48    2.14    2.17  

{'test_rmse': array([3.18067891, 3.20289557, 3.22162532, 3.25055438, 3.2046289 ]),
 'test_mae': array([2.26391708, 2.27645759, 2.29459285, 2.33215177, 2.29343109]),
 'fit_time': (1.1396384239196777,
  1.3381869792938232,
  1.2132863998413086,
  1.2717680931091309,
  1.2489609718322754),
 'test_time': (2.24420166015625,
  2.0170700550079346,
  2.4815609455108643,
  2.140101194381714,
  2.1732516288757324)}

In [87]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNBasic(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.2773
RMSE: 3.1630
MAE: 2.2772513765486333, RMSE: 3.1629620918937933


In [88]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNWithMeans(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.3144
RMSE: 3.1675
MAE: 2.3143961631471015, RMSE: 3.1675066501332942


In [89]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNBaseline(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.2784
RMSE: 3.1426
MAE: 2.2783603682675535, RMSE: 3.142564161093465


In [90]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNWithZScore(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.2954
RMSE: 3.1807
MAE: 2.2953583090190643, RMSE: 3.180725481807555


In [94]:
# KNNBasic

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
    similarity_matrix = KNNBasic(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [95]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0446350982',
 '0553580388',
 '0671004530',
 '0446608955',
 '0440236053',
 '0425101452',
 '0446611085',
 '0786889551',
 '0425189031',
 '1565122968',
 '140003180X']

In [96]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0446350982,Presumed Innocent
1,0553580388,The Patient
2,0671004530,On the Street Where You Live
3,0446608955,A Walk to Remember
4,0440236053,Writ of Execution
5,0425101452,Phantoms
6,0446611085,Suzanne's Diary for Nicholas
7,0786889551,The Pied Piper
8,0425189031,Portrait in Death
9,1565122968,Gap Creek: A Novel


In [97]:
# KNNBaseline

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':5,'user_based':False}
    similarity_matrix = KNNBaseline(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [98]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0446608955',
 '0553580388',
 '0671004530',
 '0786889551',
 '0446611085',
 '0425189031',
 '0449003795',
 '0425178765',
 '0553561618',
 '0440166497',
 '051513287X']

In [99]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0446608955,A Walk to Remember
1,0553580388,The Patient
2,0671004530,On the Street Where You Live
3,0786889551,The Pied Piper
4,0446611085,Suzanne's Diary for Nicholas
5,0425189031,Portrait in Death
6,0449003795,P Is for Peril
7,0425178765,Easy Prey
8,0553561618,Dark Paradise
9,0440166497,Once in a Lifetime


In [102]:
# KNNWithMeans

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [103]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0446350982',
 '0553580388',
 '0671004530',
 '0446608955',
 '0440236053',
 '0425101452',
 '0446611085',
 '0786889551',
 '0425189031',
 '1565122968',
 '140003180X']

In [104]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0446350982,Presumed Innocent
1,0553580388,The Patient
2,0671004530,On the Street Where You Live
3,0446608955,A Walk to Remember
4,0440236053,Writ of Execution
5,0425101452,Phantoms
6,0446611085,Suzanne's Diary for Nicholas
7,0786889551,The Pied Piper
8,0425189031,Portrait in Death
9,1565122968,Gap Creek: A Novel


In [105]:
unique_ids = rating['ISBN'].unique()
iids = rating.loc[rating['User-ID']==13552, 'ISBN']
book_to_predict = np.setdiff1d(unique_ids,iids)