In [6]:
# Libraries for data preparation & visualization
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

# pip install scikit-surprise
# Importing libraries for model building & evaluation
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import accuracy
import random

In [7]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("../../BX-Books")
user   = loaddata("../../BX-Users")
rating = loaddata("../../BX-Book-Ratings")

In [8]:
rating_users = rating['User-ID'].value_counts().reset_index().\
               rename({'Index':'User-ID','User-ID':'Rating'}, axis=1)
rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'Index':'ISBN','ISBN':'Rating'}, axis=1)
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 250 ratings & books that have received at least 50 ratings

rating = rating[rating['User-ID'].isin(rating_users[rating_users['Rating']>=250]['index'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['Rating']>=50]['index'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1468,277427,006092988X,0
1469,277427,0060930535,0
1470,277427,0060932139,0
1471,277427,0060934417,0
...,...,...,...
1147440,275970,1400031354,0
1147441,275970,1400031362,0
1147470,275970,1558744606,0
1147517,275970,1573229725,0


In [9]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
rating    

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
80889,234828,0345333926,8,Ringworld
80890,236283,0345333926,0,Ringworld
80891,249628,0345333926,0,Ringworld
80892,261829,0345333926,0,Ringworld


# Using surprise for data with zeros

In [10]:
# creating a surprise object

reader = Reader(rating_scale=(0, 10))
# data_nonzero   = Dataset.load_from_df(ratings_explicit[['User-ID','ISBN','Book-Rating']], reader)
data  = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)


# Split the data into training & testing sets. Python's surprise documentation has the steps detailed out
# https://surprise.readthedocs.io/en/stable/FAQ.html

raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                 # shuffle dataset

threshold   = int(len(raw_ratings)*0.8)

train_raw_ratings = raw_ratings[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings[threshold:] # 20% of data is testset

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data.build_full_trainset() 
testset          = data.construct_testset(test_raw_ratings)


In [11]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [12]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.345479,3.300367,0.116832,0.763151
knns.KNNBaseline,2.346841,3.301817,0.105052,1.070257
knns.KNNWithZScore,2.322376,3.33112,0.110006,0.856742
knns.KNNBasic,2.440255,3.515788,0.082228,0.668339


In [13]:
ratings_explicit=rating[rating['Book-Rating']!=0]
ratings_implicit=rating[rating['Book-Rating']==0]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

(18651, 4)
(62243, 4)


In [14]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNWithMeans.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.3245501488622944

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.229344416698337



In [15]:
# Hyperparameter tuning - KNNBasic

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNBasic.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNBasic.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBasic.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBasic.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.2881471075654347

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.2045208367297464



In [16]:
# Hyperparameter tuning - KNNWithZScore

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNWithZScore, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.2881449778589533

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.245862428148932



In [17]:
# Hyperparameter tuning - KNNBaseLine

param_grid = { 'sim_options' : {'name': ['msd','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNBaseline, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
MAE Best Score:       2.297946983678618

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      3.1868143548191665



In [18]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use the famous SVD algorithm.
algo = KNNBaseline(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2125  3.1877  3.1676  3.1744  3.2128  3.1910  0.0188  
MAE (testset)     2.3123  2.3016  2.2897  2.2951  2.3186  2.3035  0.0107  
Fit time          0.88    0.85    0.87    0.86    0.87    0.87    0.01    
Test time         1.63    1.62    1.73    1.62    1.6

{'test_rmse': array([3.21246893, 3.18769054, 3.16760838, 3.17442845, 3.21282754]),
 'test_mae': array([2.31230133, 2.30158412, 2.28968406, 2.29508288, 2.31860149]),
 'fit_time': (0.8783316612243652,
  0.8512139320373535,
  0.8712284564971924,
  0.8615007400512695,
  0.8696322441101074),
 'test_time': (1.6334702968597412,
  1.6168849468231201,
  1.7294089794158936,
  1.6199464797973633,
  1.6387641429901123)}

In [19]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use the famous SVD algorithm.
algo = KNNWithZScore(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2845  3.2520  3.2251  3.2650  3.2256  3.2505  0.0230  
MAE (testset)     2.3623  2.3330  2.3000  2.3477  2.3166  2.3319  0.0220  
Fit time          0.94    0.96    0.98    0.99    1.01    0.98    0.02    
Test time         1.52    1.58    1.52    1.52    1

{'test_rmse': array([3.2845232 , 3.25202416, 3.22514755, 3.2650396 , 3.22559556]),
 'test_mae': array([2.36229709, 2.33303039, 2.30004068, 2.34771501, 2.31656183]),
 'fit_time': (0.9386410713195801,
  0.9615628719329834,
  0.9820506572723389,
  0.9946200847625732,
  1.0074794292449951),
 'test_time': (1.520583152770996,
  1.5758988857269287,
  1.5187642574310303,
  1.5171992778778076,
  1.5323803424835205)}

In [20]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
# We'll use the famous SVD algorithm.
algo = KNNWithMeans(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2140  3.2518  3.2572  3.1755  3.2246  3.2246  0.0294  
MAE (testset)     2.3384  2.3719  2.3725  2.3120  2.3391  2.3468  0.0229  
Fit time          0.88    0.89    0.89    0.89    0.89    0.89    0.00    
Test time         1.51    1.44    1.48    1.48    1.

{'test_rmse': array([3.21403223, 3.25183409, 3.25722537, 3.17545563, 3.22461056]),
 'test_mae': array([2.33840747, 2.37186062, 2.37250856, 2.31202555, 2.33913234]),
 'fit_time': (0.8810224533081055,
  0.8936145305633545,
  0.8942012786865234,
  0.8901336193084717,
  0.8906292915344238),
 'test_time': (1.5141706466674805,
  1.4405834674835205,
  1.4770317077636719,
  1.477827548980713,
  1.5465514659881592)}

In [21]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
# We'll use the famous SVD algorithm.
algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.1946  3.2238  3.2219  3.1625  3.2044  3.2014  0.0223  
MAE (testset)     2.2830  2.3001  2.2971  2.2683  2.2885  2.2874  0.0113  
Fit time          1.04    0.95    0.95    0.93    0.95    0.96    0.04    
Test time         1.50    1.54    1.67    1.71    1.70  

{'test_rmse': array([3.1946216 , 3.2237609 , 3.22186159, 3.16249847, 3.20436995]),
 'test_mae': array([2.28304918, 2.30013547, 2.29714311, 2.26830413, 2.28850557]),
 'fit_time': (1.0449776649475098,
  0.9523639678955078,
  0.9488754272460938,
  0.9274604320526123,
  0.9507274627685547),
 'test_time': (1.5018727779388428,
  1.5440101623535156,
  1.6671550273895264,
  1.7094554901123047,
  1.698007345199585)}

In [22]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNBasic(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.2682
RMSE: 3.1542
MAE: 2.268191183288463, RMSE: 3.154194714423748


In [23]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNWithMeans(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.3285
RMSE: 3.1749
MAE: 2.3284858250263616, RMSE: 3.1749201958799413


In [24]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNBaseline(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.2837
RMSE: 3.1444
MAE: 2.283706795003454, RMSE: 3.144389321600955


In [25]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'pearson_baseline','min_support':1,'user_based':False}
final_model = KNNWithZScore(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  2.3103
RMSE: 3.1937
MAE: 2.3103283714622833, RMSE: 3.193656565289063


In [26]:
# KNNBasic

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
    similarity_matrix = KNNBasic(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [27]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0671693816',
 '0671024248',
 '0767905180',
 '044022473X',
 '0684195976',
 '0553096060',
 '0515120618',
 '0385492081',
 '0446603929',
 '0399501487',
 '0451124340']

In [28]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0671693816,Wifey
1,0671024248,Hearts In Atlantis
2,0767905180,Jemima J: A Novel About Ugly Ducklings and Swans
3,044022473X,Breach of Promise
4,0684195976,BODY FARM
5,0553096060,Sein Language
6,0515120618,Montana Sky
7,0385492081,Into Thin Air : A Personal Account of the Mt. ...
8,0446603929,See How They Run
9,0399501487,Lord of the Flies


In [29]:
# KNNBaseline

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':5,'user_based':False}
    similarity_matrix = KNNBaseline(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [30]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0671693816',
 '0385492081',
 '044022473X',
 '0446608955',
 '0767905180',
 '0399501487',
 '0553271636',
 '0515130966',
 '0385505833',
 '0345386132',
 '0345369947']

In [31]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0671693816,Wifey
1,0385492081,Into Thin Air : A Personal Account of the Mt. ...
2,044022473X,Breach of Promise
3,0446608955,A Walk to Remember
4,0767905180,Jemima J: A Novel About Ugly Ducklings and Swans
5,0399501487,Lord of the Flies
6,0553271636,D Is for Deadbeat (Kinsey Millhone Mysteries (...
7,0515130966,Riptide
8,0385505833,Skipping Christmas
9,0345386132,Eyes of a Child


In [32]:
# KNNWithMeans

def generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [33]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=40, get_recommend=10)
recommendationsKNN

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


['0671693816',
 '0671024248',
 '0767905180',
 '044022473X',
 '0684195976',
 '0553096060',
 '0515120618',
 '0385492081',
 '0446603929',
 '0399501487',
 '0451124340']

In [34]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red_ = red.merge(book, on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0671693816,Wifey
1,0671024248,Hearts In Atlantis
2,0767905180,Jemima J: A Novel About Ugly Ducklings and Swans
3,044022473X,Breach of Promise
4,0684195976,BODY FARM
5,0553096060,Sein Language
6,0515120618,Montana Sky
7,0385492081,Into Thin Air : A Personal Account of the Mt. ...
8,0446603929,See How They Run
9,0399501487,Lord of the Flies


In [35]:
unique_ids = rating['ISBN'].unique()
iids = rating.loc[rating['User-ID']==13552, 'ISBN']
book_to_predict = np.setdiff1d(unique_ids,iids)

In [36]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use KNNWIthMeans
algo = KNNBasic(sim_options=sim_options)

algo.fit(trainset).test(testset)
my_recs = []
for iid in book_to_predict:
    my_recs.append((iid, algo.predict(uid=13552,iid=iid).est))
# rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
# rating  
reco = pd.DataFrame(my_recs, columns=['ISBN', 'predictions']).sort_values('predictions', ascending=False).head(10)
reco = reco.merge(book, on="ISBN")[['ISBN','Book-Title']]
reco

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


Unnamed: 0,ISBN,Book-Title
0,0552143774,Horse Whisperer
1,0446671002,The Celestine Prophecy
2,042510107X,Red Storm Rising
3,0671886665,A Cry In The Night
4,067189109X,The Blessing
5,0671534742,Music in the Night (Logan)
6,0515132136,The Jury
7,0671873202,Hidden Jewel (Landry)
8,0553250531,The Valley of Horses
9,0425116840,The Cardinal of the Kremlin (Jack Ryan Novels)


In [37]:
sim_options       = {'name':'pearson_baseline','min_support':1,'user_based':False}

# # We'll use KNNWIthMeans
algo = KNNWithMeans(sim_options=sim_options)

algo.fit(trainset).test(testset)
my_recs = []
for iid in book_to_predict:
    my_recs.append((iid, algo.predict(uid=13552,iid=iid).est))
# rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
# rating  
reco = pd.DataFrame(my_recs, columns=['ISBN', 'predictions']).sort_values('predictions', ascending=False).head(10)
reco = reco.merge(book, on="ISBN")[['ISBN','Book-Title']]
reco

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


Unnamed: 0,ISBN,Book-Title
0,552143774,Horse Whisperer
1,1844262553,Free
2,553348981,Jitterbug Perfume
3,64471047,"The Lion, the Witch, and the Wardrobe (The Chr..."
4,590353403,Harry Potter and the Sorcerer's Stone (Book 1)
5,812548051,"Wizard's First Rule (Sword of Truth, Book 1)"
6,385424736,The Rainmaker
7,439136350,Harry Potter and the Prisoner of Azkaban (Book 3)
8,452283442,The Darwin Awards: Evolution in Action
9,811801802,Sabine's Notebook: In Which the Extraordinary ...
