In [157]:
# Libraries for data preparation & visualization
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

# pip install scikit-surprise
# Importing libraries for model building & evaluation
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD
from surprise import accuracy

In [158]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("../../BX-Books")
user   = loaddata("../../BX-Users")
rating = loaddata("../../BX-Book-Ratings")

In [105]:
# #Preprocessing Data
# book = book[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
# book.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
# user.rename(columns = {'User-ID':'user_id', 'Location':'location', 'Age':'age'}, inplace=True)
# rating.rename(columns = {'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)

In [159]:
rating_users = rating['User-ID'].value_counts().reset_index().\
               rename({'Index':'User-ID','User-ID':'Rating'}, axis=1)
rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'Index':'ISBN','ISBN':'Rating'}, axis=1)
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 250 ratings & books that have received at least 50 ratings

rating = rating[rating['User-ID'].isin(rating_users[rating_users['Rating']>250]['index'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['Rating']>50]['index'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1468,277427,006092988X,0
1469,277427,0060930535,0
1470,277427,0060932139,0
1471,277427,0060934417,0
...,...,...,...
1147440,275970,1400031354,0
1147441,275970,1400031362,0
1147470,275970,1558744606,0
1147517,275970,1573229725,0


In [160]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','ISBN','Book-Rating','Book-Title']] # merging with the book dataframe
rating               

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
79308,234828,0345333926,8,Ringworld
79309,236283,0345333926,0,Ringworld
79310,249628,0345333926,0,Ringworld
79311,261829,0345333926,0,Ringworld


In [161]:
ratings_explicit=rating[rating['Book-Rating']!=0]
ratings_implicit=rating[rating['Book-Rating']==0]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

(18255, 4)
(61058, 4)


In [109]:
# x = rating['user_id'].value_counts() > 50
# y = x[x].index  #user_ids
# print(y.shape)
# ratings = rating[rating['user_id'].isin(y)]

In [110]:
# rating_with_books = ratings.merge(book, on='ISBN')
# rating_with_books.head()

In [111]:
# number_rating = rating_with_books.groupby('title')['rating'].count().reset_index()
# number_rating.rename(columns= {'rating':'number_of_ratings'}, inplace=True)
# final_rating = rating_with_books.merge(number_rating, on='title')
# final_rating.shape
# final_rating = final_rating[final_rating['number_of_ratings'] > 50]
# final_rating.drop_duplicates(['user_id','title'], inplace=True)
# final_rating

In [112]:
# x = rating_with_names.groupby('user_id').count()['rating'] >50
# like_user = x[x].index

In [113]:
# filter_rating = rating_with_names[rating_with_names['user_id'].isin(like_user)]
# y = filter_rating.groupby('title').count()['rating'] >50
# like_book = y[y].index

In [114]:
# final_rating = filter_rating[filter_rating['title'].isin(like_book)]
# final_rating

In [162]:
# creating a surprise object

reader = Reader(rating_scale=(1, 10))
data   = Dataset.load_from_df(ratings_explicit[['User-ID','ISBN','Book-Rating']], reader)
data1  = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)


# Split the data into training & testing sets. Python's surprise documentation has the steps detailed out
# https://surprise.readthedocs.io/en/stable/FAQ.html

raw_ratings = data.raw_ratings
import random
random.shuffle(raw_ratings)                 # shuffle dataset

threshold   = int(len(raw_ratings)*0.8)

train_raw_ratings = raw_ratings[:threshold] # 80% of data is trainset
test_raw_ratings  = raw_ratings[threshold:] # 20% of data is testset

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data.build_full_trainset() 
testset          = data.construct_testset(test_raw_ratings)

In [164]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data1, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [165]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.534315,3.244592,0.107939,1.148718
knns.KNNBaseline,2.533466,3.249884,0.147195,1.469613
knns.KNNWithZScore,2.523053,3.276664,0.15405,1.313387
knns.KNNBasic,2.584221,3.422385,0.091954,0.979471


In [166]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),KNNBaseline()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [167]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithZScore,1.276008,1.729309,0.035031,0.048386
knns.KNNWithMeans,1.296274,1.734548,0.01643,0.045664
knns.KNNBaseline,1.327387,1.750711,0.020371,0.053514
knns.KNNBasic,1.472258,1.935368,0.009334,0.043558


In [176]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNWithMeans.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.2548607279760038

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.6856873564742845



In [175]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNNBasic.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNBasic.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNBasic.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNBasic.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNBasic.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
MAE Best Score:       1.2455467137747185

RMSE Best Parameters: {'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
RMSE Best Score:      1.6882591098492425



In [174]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNWithZScore, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
MAE Best Score:       1.2497031229042617

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': True}}
RMSE Best Score:      1.6874698180323033



In [173]:
# Hyperparameter tuning - KNNWithMeans

param_grid = { 'sim_options' : {'name': ['msd','cosine','pearson','pearson_baseline'], \
                                'min_support': [1,5], \
                                'user_based': [False, True]}
             }

gridsearchKNN = GridSearchCV(KNNBaseline, param_grid, measures=['mae', 'rmse'], \
                                      cv=5, n_jobs=-1)
                                    
gridsearchKNN.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNN.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNN.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNN.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNN.best_score["rmse"]}\n')

MAE Best Parameters:  {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}
MAE Best Score:       1.1778894947909673

RMSE Best Parameters: {'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}
RMSE Best Score:      1.5420048724186066



In [180]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'cosine','min_support':3,'user_based':False}
final_model = KNNWithMeans(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.3279
RMSE: 1.7495
MAE: 1.3278984096065414, RMSE: 1.7494852121400515


In [121]:
rating

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
79308,234828,0345333926,8,Ringworld
79309,236283,0345333926,0,Ringworld
79310,249628,0345333926,0,Ringworld
79311,261829,0345333926,0,Ringworld


In [122]:
reader = Reader(rating_scale=(1, 10))
trainset = data.build_full_trainset()

In [181]:
# KNNWithMeans

def generate_recommendationsKNN(userID=254, like_recommend=40, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'cosine','min_support':1,'user_based':False}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
                      

    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [182]:
recommendationsKNN = generate_recommendationsKNN(userID=254, like_recommend=40, get_recommend=10)
recommendationsKNN

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


['059035342X',
 '0312195516',
 '0385484518',
 '043935806X',
 '0345337662',
 '067976402X',
 '0060987103',
 '0375727345',
 '0312966970',
 '006101351X',
 '0553572997']

In [183]:
recommendationsKNN

['059035342X',
 '0312195516',
 '0385484518',
 '043935806X',
 '0345337662',
 '067976402X',
 '0060987103',
 '0375727345',
 '0312966970',
 '006101351X',
 '0553572997']

In [184]:
red = pd.DataFrame(recommendationsKNN,columns = ['ISBN'])
red

Unnamed: 0,ISBN
0,059035342X
1,0312195516
2,0385484518
3,043935806X
4,0345337662
5,067976402X
6,0060987103
7,0375727345
8,0312966970
9,006101351X


In [188]:
red_ = pd.merge(book, red , on="ISBN")[['ISBN','Book-Title']]
red_

Unnamed: 0,ISBN,Book-Title
0,0060987103,Wicked: The Life and Times of the Wicked Witch...
1,006101351X,The Perfect Storm : A True Story of Men Agains...
2,0312195516,The Red Tent (Bestselling Backlist)
3,0553572997,The Alienist
4,067976402X,Snow Falling on Cedars
5,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...
6,0385484518,"Tuesdays with Morrie: An Old Man, a Young Man,..."
7,0345337662,Interview with the Vampire
8,0312966970,Four To Score (A Stephanie Plum Novel)
9,0375727345,House of Sand and Fog


#####

In [151]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithZScore,1.274647,1.729314,0.044096,0.060864
knns.KNNWithMeans,1.299897,1.74531,0.020414,0.056113
knns.KNNBaseline,1.320924,1.747081,0.02154,0.065336
knns.KNNBasic,1.467127,1.925629,0.013116,0.058503


In [153]:
from surprise import Dataset, SVD
from surprise.model_selection import cross_validate


# # Load the movielens-100k dataset (download it if needed),
# data = Dataset.load_builtin("ml-100k")

# # We'll use the famous SVD algorithm.
algo = KNNWithZScore()

# Run 5-fold cross-validation and print results
cross_validate(algo, data1, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2782  3.2969  3.2744  3.2913  3.2445  3.2771  0.0183  
MAE (testset)     2.5253  2.5374  2.5299  2.5287  2.5022  2.5247  0.0119  
Fit time          0.13    0.16    0.14    0.14    0.14    0.14    0.01    
Test time         0.89    0.92    0.86    0.88    0.84    0.88    0.03    


{'test_rmse': array([3.27818507, 3.29693241, 3.27439862, 3.29134407, 3.24448568]),
 'test_mae': array([2.52531315, 2.53741773, 2.5298994 , 2.52869935, 2.50222298]),
 'fit_time': (0.13056445121765137,
  0.1573190689086914,
  0.13755583763122559,
  0.14490318298339844,
  0.13589739799499512),
 'test_time': (0.8915252685546875,
  0.9186336994171143,
  0.8615617752075195,
  0.8788585662841797,
  0.8445315361022949)}

In [156]:
from surprise.model_selection import train_test_split


# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data1, test_size=0.20)

# We'll use the famous SVD algorithm.
algo = KNNWithMeans()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.2245


3.2244542412103083

In [155]:
predictions = algo.fit(trainset).test(testset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [132]:
# # reducing to user rating 100 books and books have 100 ratings
# counts1 = rating['User-ID'].value_counts()
# rating = rating[rating['User-ID'].isin(counts1[counts1 >= 100].index)]
# counts = rating['Book-Rating'].value_counts()
# rating = rating[rating['Book-Rating'].isin(counts[counts >= 100].index)]

# rating

In [193]:
ratings_explicit

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
15,52584,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
18,71712,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
79301,166123,0345333926,7,Ringworld
79303,171118,0345333926,8,Ringworld
79304,184299,0345333926,8,Ringworld
79308,234828,0345333926,8,Ringworld


In [194]:
ratings_matrix = ratings_explicit.pivot(index = 'User-ID', columns = 'ISBN', values = 'Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

(656, 2085)


ISBN,000649840X,002026478X,0020442203,002542730X,0028604199,006000438X,0060008032,0060008776,006001203X,0060085444,...,1860492592,1878424319,1885171080,1931561648,3257228007,3257229534,3404148665,3423202327,3442541751,3492045170
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277427,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [195]:
from IPython.display import display
import pandas as pd
import numpy as np

# Import seaborn for statistic evaluation
import seaborn as sns
# Apply the default theme
sns.set_theme()
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
import sklearn
import re
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import os, sys

In [196]:
# KNN Function
#setting global variables
global metric,k
k=10
global_metric='cosine'
global_algorithm = 'brute'

In [197]:
def findksimilarusers(user_id, ratings, metric=global_metric, algo=global_algorithm,k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = algo)
    model_knn.fit(ratings.values)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1 - distances.flatten()
    
    return similarities, indices

In [198]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = global_metric, algorithm = global_algorithm, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, algorithm, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc, :].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings
    #which are handled here as below
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [199]:
def recommendedItem(user_id, ratings, metric=global_metric):
    if (user_id not in ratings.index.values) or type(user_id) is not int:
        print("User id should be a valid integer from this list : \n\n {}".format(re.sub('[\[\]]', '', np.array_str(ratings_matrix.index.values))))
    else:
        prediction = []
        
#         ids = ['Item-based (correlation)', 'Item-based (cosine)', 'User-based (correlation)', 'User-based (cosine)']
#         select = widgets.Dropdown(options = ids, value = ids[0], description = ' Select approach', width = '1000px')
#         def on_change(change):
#             clear_output(wait = True)
#             prediction = []
#             if change['type'] == 'change' and change['name'] == 'value':
#                 if (select.value == 'Item-based (correlation)') | (select.value == 'User-based (correlation)'):
#                     global_metric = 'correlation'
#                 else:
#                     global_metric = 'cosine'
                    
#                 with suppress_stdout():
#         if (select.value == 'User-based (correlation)') | (select.value == 'User-based (cosine)'):

        total = ratings.shape[1]
        for i in range(ratings.shape[1]):
            if (ratings[str(ratings.columns[i])][user_id] != 0): #not rated already
                print("\r{0:<0.2f}% Finished".format(i*100/total), end='')
                prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))
            else:                    
                prediction.append(-1) #for books that user already rated
#         else:
#             print("Item-based not implemented yet")
        print("\r100.00% Finished")
        
        prediction = pd.Series(prediction)
        prediction = prediction.sort_values(ascending = False)
        print("...")
        print(prediction)
        
        recommended = prediction[:10]
        print("\nFor the User-based ({0} - {1}) approach, the following books are recommended: \n".format(global_algorithm, global_metric))
        for i in range(len(recommended)):
            print("{0}. Rated {1}: {2}".format(i+1, prediction.values[i], book['Book-Title'][recommended.index[i]]))
            

In [200]:
global_algorithm = 'brute'
global_metric = 'cosine'
recommendedItem(254, ratings_matrix)

100.00% Finished
...
847     9
843     9
845     8
1601    7
844     2
       ..
691    -1
690    -1
689    -1
688    -1
2084   -1
Length: 2085, dtype: int64

For the User-based (brute - cosine) approach, the following books are recommended: 

1. Rated 9: The Second Coming of Curly Red
2. Rated 9: Leaving Pipe Shop: Memories of Kin
3. Rated 8: Skin: Talking About Sex, Class &amp; Literature
4. Rated 7: Der Mann, der's wert ist.
5. Rated 2: Love Ruins Everything: A Novel
6. Rated 2: Death: At Death's Door (Vertigo, Number 1)
7. Rated 2: Stately Pursuits
8. Rated 1: Witching Hour (Lives of the Mayfair Witches)
9. Rated 1: It Happened to Nancy : By an Anonymous Teenager, A True Story from Her Diary (Confident Collector)
10. Rated 0: Chopping Spree


In [201]:
global_algorithm = 'brute'
global_metric = 'euclidean'
recommendedItem(254, ratings_matrix)

100.00% Finished
...
847     9
843     9
845     8
1601    7
844     2
       ..
691    -1
690    -1
689    -1
688    -1
2084   -1
Length: 2085, dtype: int64

For the User-based (brute - euclidean) approach, the following books are recommended: 

1. Rated 9: The Second Coming of Curly Red
2. Rated 9: Leaving Pipe Shop: Memories of Kin
3. Rated 8: Skin: Talking About Sex, Class &amp; Literature
4. Rated 7: Der Mann, der's wert ist.
5. Rated 2: Love Ruins Everything: A Novel
6. Rated 2: Death: At Death's Door (Vertigo, Number 1)
7. Rated 2: Stately Pursuits
8. Rated 1: Witching Hour (Lives of the Mayfair Witches)
9. Rated 1: It Happened to Nancy : By an Anonymous Teenager, A True Story from Her Diary (Confident Collector)
10. Rated 0: Chopping Spree


In [202]:
global_algorithm = 'brute'
global_metric = 'correlation'
recommendedItem(254, ratings_matrix)

100.00% Finished
...
847     9
843     9
845     8
1601    7
844     2
       ..
691    -1
690    -1
689    -1
688    -1
2084   -1
Length: 2085, dtype: int64

For the User-based (brute - correlation) approach, the following books are recommended: 

1. Rated 9: The Second Coming of Curly Red
2. Rated 9: Leaving Pipe Shop: Memories of Kin
3. Rated 8: Skin: Talking About Sex, Class &amp; Literature
4. Rated 7: Der Mann, der's wert ist.
5. Rated 2: Love Ruins Everything: A Novel
6. Rated 2: Death: At Death's Door (Vertigo, Number 1)
7. Rated 2: Stately Pursuits
8. Rated 1: Witching Hour (Lives of the Mayfair Witches)
9. Rated 1: It Happened to Nancy : By an Anonymous Teenager, A True Story from Her Diary (Confident Collector)
10. Rated 0: Chopping Spree


In [143]:
#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE
#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print 
#RMSE values
def evaluateRS(ratings):
#     ids = ['User-based CF (cosine)','User-based CF (correlation)','Item-based CF (cosine)','Item-based CF (adjusted cosine)']
    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')
    n_users = ratings.shape[0]
    n_items = ratings.shape[1]
    prediction = np.zeros((n_users, n_items))
    prediction= pd.DataFrame(prediction)
    def on_change(change):
        clear_output(wait=True)
        with suppress_stdout():
            if change['type'] == 'change' and change['name'] == 'value':            
                if (approach.value == 'User-based CF (cosine)'):
                    metric = 'cosine'
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)
                elif (approach.value == 'User-based CF (correlation)')  :                       
                    metric = 'correlation'               
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)
                elif (approach.value == 'Item-based CF (cosine)'):
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings)
                else:
                    for i in range(n_users):
                        for j in range(n_items):
                            prediction[i][j] = predict_userbased(i+1, j+1, ratings)
              
        MSE = mean_squared_error(prediction, ratings)
        RMSE = round(sqrt(MSE),3)
#         print "RMSE :".format(approach.value,RMSE)
              
    approach.observe(on_change)
    display(approach)

In [203]:
#This is final function to evaluate the performance of selected recommendation approach and the metric used here is RMSE
#suppress_stdout function is used to suppress the print outputs of all the functions inside this function. It will only print 
#RMSE values
def evaluateRS(ratings):
    approach = widgets.Dropdown(options=ids, value=ids[0],description='Select Approach', width='500px')
    n_users = ratings.shape[0]
    n_items = ratings.shape[1]
    prediction = np.zeros((n_users, n_items))
    prediction= pd.DataFrame(prediction)
    def on_change(change):
        metric = 'cosine'
        for i in range(n_users):
            for j in range(n_items):
                prediction[i][j] = predict_userbased(i+1, j+1, ratings, metric)
              
        MSE = mean_squared_error(prediction, ratings)
        RMSE = round(sqrt(MSE),3)
#         print "RMSE :".format(approach.value,RMSE)
              
    approach.observe(on_change)
    display(approach)

In [144]:
reader = Reader(rating_scale=(1, 10))
data   = Dataset.load_from_df(rating[['User-ID','ISBN','Book-Rating']], reader)


In [145]:
rating

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
79308,234828,0345333926,8,Ringworld
79309,236283,0345333926,0,Ringworld
79310,249628,0345333926,0,Ringworld
79311,261829,0345333926,0,Ringworld


In [146]:
# Model fit & prediction - KNNWithMeans

sim_options = {'name':'cosine','min_support':1,'user_based':False}
final_model = KNNWithMeans(sim_options=sim_options)

# Fitting the model on trainset & predicting on testset, printing test accuracy
pred = final_model.fit(trainset).test(testset)

print(f'\nUnbiased Testing Performance:')
print(f'MAE: {accuracy.mae(pred)}, RMSE: {accuracy.rmse(pred)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.

Unbiased Testing Performance:
MAE:  1.3329
RMSE: 1.7578
MAE: 1.3328607215548554, RMSE: 1.7578052391815986
