In [37]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

import os, sys
import re
import json

import ipywidgets as widgets
from IPython.display import display, clear_output

In [2]:
df_books = pd.read_csv('Books.csv')

In [3]:
df_books = df_books[['ISBN', 'Book-Title', 'Book-Author']]
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [4]:
df_ratings = pd.read_csv('Ratings.csv')
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
df_books.dropna(inplace=True)

In [6]:
ratings = df_ratings['User-ID'].value_counts()
ratings.sort_values(ascending=False).head()

User-ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
Name: count, dtype: int64

In [7]:
len(ratings[ratings < 200])

104378

In [8]:
df_ratings['User-ID'].isin(ratings[ratings < 200].index).sum()

622224

In [9]:
df_ratings_rm = df_ratings[
  ~df_ratings['User-ID'].isin(ratings[ratings < 200].index)
]
df_ratings_rm.shape

(527556, 3)

In [10]:
ratings = df_ratings['ISBN'].value_counts() 
ratings.sort_values(ascending=False).head()

ISBN
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: count, dtype: int64

In [11]:
len(ratings[ratings < 100])

339825

In [12]:
df_books['ISBN'].isin(ratings[ratings < 100].index).sum()

269422

In [13]:
df_ratings_rm = df_ratings_rm[
  ~df_ratings_rm['ISBN'].isin(ratings[ratings < 100].index)
]
df_ratings_rm.shape

(49781, 3)

In [14]:
books = ["Ender's Game (Ender Wiggins Saga (Paperback))",
        "To Kill a Mockingbird",
        "The Hitchhiker's Guide to the Galaxy"]

for book in books:
    print(df_ratings_rm['ISBN'].isin(df_books[df_books['Book-Title'] == book]['ISBN']).sum())

50
139
42


In [15]:
df_ratings_rm.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1469,277427,0060930535,0
1471,277427,0060934417,0
1474,277427,0061009059,9
1484,277427,0140067477,0


In [16]:
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [17]:
df = df_ratings_rm.pivot_table(index=['User-ID'],columns=['ISBN'],values='Book-Rating').fillna(0).T
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df.index = df.join(df_books.set_index('ISBN'))['Book-Title']

In [19]:
df = df[~df.index.isna()]
df.index = df.index.astype(str)
df = df.sort_index()
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df.loc["The Hitchhiker's Guide to the Galaxy"][:5]

User-ID
254     0.0
2276    0.0
2766    0.0
2977    0.0
3363    0.0
Name: The Hitchhiker's Guide to the Galaxy, dtype: float64

In [21]:
model = NearestNeighbors(metric='cosine')
model.fit(df.values)

In [22]:
df.iloc[0].shape

(888,)

In [23]:
title = "The Hitchhiker's Guide to the Galaxy"
df.loc[title].shape

(888,)

In [24]:
distance, indice = model.kneighbors([df.loc[title].values], n_neighbors=6)

print(distance)
print(indice)

[[0.         0.70952891 0.77445555 0.78556039 0.78874027 0.80565717]]
[[550 555   0  49 541  32]]


In [25]:
df.iloc[indice[0]].index.values

array(["The Hitchhiker's Guide to the Galaxy", 'The Hours : A Novel',
       '1984', 'American Psycho (Vintage Contemporaries)',
       'The Giver (21st Century Reference)', 'A Wrinkle In Time'],
      dtype=object)

In [26]:
pd.DataFrame({
    'title'   : df.iloc[indice[0]].index.values,
    'distance': distance[0]
}) \
.sort_values(by='distance', ascending=False)

Unnamed: 0,title,distance
5,A Wrinkle In Time,0.805657
4,The Giver (21st Century Reference),0.78874
3,American Psycho (Vintage Contemporaries),0.78556
2,1984,0.774456
1,The Hours : A Novel,0.709529
0,The Hitchhiker's Guide to the Galaxy,0.0


In [27]:
def get_recommends(title = ""):
  try:
    book = df.loc[title]
  except KeyError as e:
    print('The given book', e, 'does not exist')
    return

  distance, indice = model.kneighbors([book.values], n_neighbors=6)

  recommended_books = pd.DataFrame({
      'title'   : df.iloc[indice[0]].index.values,
      'distance': distance[0]
    }) \
    .sort_values(by='distance', ascending=False) \
    .head(5).values

  return [title, recommended_books]

In [28]:
books = get_recommends("Ender's Game (Ender Wiggins Saga (Paperback))")
print(books)

["Ender's Game (Ender Wiggins Saga (Paperback))", array([['Flesh and Blood', 0.7496617045236182],
       ['Dune (Remembering Tomorrow)', 0.7494563214570386],
       ['Hearts in Atlantis', 0.7309808197945691],
       ['Absolute Power', 0.7182511453318849],
       ['Waiting (Vintage International)', 0.6579142144881253]],
      dtype=object)]


In [29]:
books = get_recommends("To Kill a Mockingbird")
print(books)

['To Kill a Mockingbird', array([['STONES FROM THE RIVER', 0.8072619947688987],
       ['Animal Farm', 0.8012895971566558],
       ['Lord of the Flies', 0.7742660384765063],
       ['Drowning Ruth', 0.773584503027546],
       ['The Catcher in the Rye', 0.7657838212105741]], dtype=object)]


In [30]:
books = get_recommends("The Hitchhiker's Guide to the Galaxy")
print(books)

["The Hitchhiker's Guide to the Galaxy", array([['A Wrinkle In Time', 0.8056571681536644],
       ['The Giver (21st Century Reference)', 0.7887402731342112],
       ['American Psycho (Vintage Contemporaries)', 0.7855603933352981],
       ['1984', 0.7744555526517893],
       ['The Hours : A Novel', 0.7095289095872036]], dtype=object)]


In [31]:
with open('book_recommender_model.pkl', 'wb') as f:
    pickle.dump({'model': model, 'df': df, 'df_books': df_books, 'df_ratings_rm': df_ratings_rm}, f)

In [35]:
def evaluate_knn_performance(df_ratings, test_size=0.2):
    unique_users = df_ratings['User-ID'].unique()
    train_users, test_users = train_test_split(unique_users, test_size=test_size, random_state=42)
    
    train_ratings = df_ratings[df_ratings['User-ID'].isin(train_users)]
    test_ratings = df_ratings[df_ratings['User-ID'].isin(test_users)]
    
    train_matrix = train_ratings.pivot_table(
        index=['User-ID'], columns=['ISBN'], values='Book-Rating'
    ).fillna(0).T
    
    model = NearestNeighbors(metric='cosine', n_neighbors=10)
    model.fit(train_matrix.values)
    
    predictions = []
    actuals = []
    
    for user in test_users[:100]:
        user_ratings = test_ratings[test_ratings['User-ID'] == user]
        
        for _, row in user_ratings.iterrows():
            isbn, actual_rating = row['ISBN'], row['Book-Rating']
            
            if isbn in train_matrix.index:
                book_vector = train_matrix.loc[isbn].values.reshape(1, -1)
                
                distances, indices = model.kneighbors(book_vector, n_neighbors=5)
                
                similar_books = train_matrix.iloc[indices[0][1:]].index
                similar_ratings = []
                
                for similar_isbn in similar_books:
                    if user in train_matrix.columns:
                        rating = train_matrix.loc[similar_isbn, user]
                        if rating > 0:
                            similar_ratings.append(rating)
                
                if similar_ratings:
                    predicted = np.mean(similar_ratings)
                else:
                    predicted = np.mean(train_matrix.loc[isbn])
                
                predictions.append(predicted)
                actuals.append(actual_rating)
    
    if predictions:
        mae = mean_absolute_error(actuals, predictions)
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        
        return {
            'MAE': mae,
            'RMSE': rmse,
            'n_predictions': len(predictions),
            'coverage': len(predictions) / len(test_ratings) * 100
        }
    return None

performance = evaluate_knn_performance(df_ratings_rm)

In [39]:
print(json.dumps(performance, indent=1))

{
 "MAE": 1.9094909526336974,
 "RMSE": 3.7960232993116763,
 "n_predictions": 5493,
 "coverage": 57.76632663792197
}
