Dataset: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset

In [64]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [65]:
# Load the three CSV files, restricted to the columns used in this notebook.
# Each dtype map doubles as the column selection: ISBNs and text fields are
# read as strings, user ids and ratings as 32-bit integers.
book_dtypes = {'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str'}
rating_dtypes = {'ISBN': 'str', 'Book-Rating': 'int32', 'User-ID': 'int32'}
user_dtypes = {'User-ID': 'int32'}

books_df = pd.read_csv('Books.csv', usecols=list(book_dtypes), dtype=book_dtypes)
ratings_df = pd.read_csv('Ratings.csv', usecols=list(rating_dtypes), dtype=rating_dtypes)
users_df = pd.read_csv('Users.csv', usecols=list(user_dtypes), dtype=user_dtypes)

In [66]:
# Preview the first rows. display() is required here: a cell renders only its
# last expression automatically, so a bare head() call would be discarded.
display(books_df.head())
# Count the missing values in each column
books_df.isnull().sum()

ISBN           0
Book-Title     0
Book-Author    2
dtype: int64

In [67]:
# Preview the first rows. display() is required here: a cell renders only its
# last expression automatically, so a bare head() call would be discarded.
display(ratings_df.head())
# Count the missing values in each column
ratings_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [68]:
# Preview the first rows. display() is required here: a cell renders only its
# last expression automatically, so a bare head() call would be discarded.
display(users_df.head())
# Count the missing values in each column
users_df.isnull().sum()

User-ID    0
dtype: int64

# Item-based collaborative filtering with KNN

In [69]:
# Join every rating to its book title through the shared ISBN key.
# The author column is not needed, so it is removed from the books frame
# before the (inner) merge rather than from the merged result.
combined_df = books_df.drop(columns=['Book-Author']).merge(ratings_df, on='ISBN')
combined_df.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating
0,195153448,Classical Mythology,2,0
1,2005018,Clara Callan,8,5
2,2005018,Clara Callan,11400,0
3,2005018,Clara Callan,11676,8
4,2005018,Clara Callan,41385,0


In [70]:
# Count how many rating rows each book title has
ratings_per_title = combined_df.groupby('Book-Title')['Book-Rating'].count()

# Turn the counts into a two-column frame: Book-Title, TotalRatingCount
book_rating_count = ratings_per_title.rename('TotalRatingCount').reset_index()
book_rating_count

Unnamed: 0,Book-Title,TotalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [71]:
# Attach each title's TotalRatingCount to every rating row of that title.
# A left join keeps all rows of combined_df.
merged_df = pd.merge(combined_df, book_rating_count, how='left', on='Book-Title')
merged_df.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating,TotalRatingCount
0,195153448,Classical Mythology,2,0,2
1,2005018,Clara Callan,8,5,14
2,2005018,Clara Callan,11400,0,14
3,2005018,Clara Callan,11676,8,14
4,2005018,Clara Callan,41385,0,14


In [72]:
# Keep only the ratings of books whose title was rated at least
# `popularity_threshold` times; less popular books are removed.
popularity_threshold = 50
is_popular = merged_df['TotalRatingCount'] >= popularity_threshold
rating_popular_book = merged_df[is_popular]
rating_popular_book.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating,TotalRatingCount
30,399135782,The Kitchen God's Wife,8,0,311
31,399135782,The Kitchen God's Wife,11676,9,311
32,399135782,The Kitchen God's Wife,29526,9,311
33,399135782,The Kitchen God's Wife,36836,0,311
34,399135782,The Kitchen God's Wife,46398,9,311


In [73]:
# (rows, columns) of the ratings that passed the popularity filter
rating_popular_book.shape

(288740, 5)

In [74]:
# Using KNN: build the book-by-user rating matrix the model is fitted on.

# Remove duplicates: pivot() needs every (Book-Title, User-ID) pair to be
# unique, so only the first row of each pair is kept.
rating_popular_book = rating_popular_book.drop_duplicates(['User-ID', 'Book-Title'])

# Pivoting: one row per book title, one column per user id, cells hold the
# rating; pairs without a rating are filled with 0.
pivot_df = rating_popular_book.pivot(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)

# Sparse (CSR) copy of the same values, used to fit the neighbour search.
matrix_df = csr_matrix(pivot_df.values)

pivot_df.head()


User-ID,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Exhaustive (brute-force) neighbour search over the rows of matrix_df,
# comparing books by the cosine distance between their rating vectors.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(matrix_df)

In [76]:
# Randomly choosing a book and looking up its nearest neighbours.

# Fixed seed so that Restart & Run All reproduces the same pick;
# set SEED = None to draw a different book on every run of this cell.
SEED = 42
# Number of recommendations wanted for the chosen book.
N_RECOMMENDATIONS = 5

rng = np.random.default_rng(SEED)
query_index = rng.choice(pivot_df.shape[0])
print(query_index)

# One extra neighbour is requested because the query book is itself a row of
# the fitted matrix and is normally returned as its own closest match.
# The count is capped at the number of books, since kneighbors() raises when
# asked for more neighbours than there are fitted rows.
n_neighbors = min(N_RECOMMENDATIONS + 1, pivot_df.shape[0])
distances, indices = knn_model.kneighbors(
    pivot_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=n_neighbors,
)

1421


In [77]:
# Title of the randomly chosen query book (row label at position query_index)
pivot_df.index[query_index]

"River's End"

In [78]:
# Finding recommendations: print the neighbours of the query book, ranked by
# increasing distance.
print('Recommendations for {0}:\n'.format(pivot_df.index[query_index]))

neighbour_rows = indices.flatten()
neighbour_distances = distances.flatten()

# The query book is excluded by its row position instead of assuming it is
# always the first hit: when several books are at the same distance (for
# example identical or all-zero rating rows) it may be returned later, or
# not at all, and skipping position 0 would then drop a real recommendation.
candidates = [
    (row, dist)
    for row, dist in zip(neighbour_rows, neighbour_distances)
    if row != query_index
]

# One slot of the neighbour list is reserved for the query book, so at most
# len(neighbour_rows) - 1 recommendations are printed.
for rank, (row, dist) in enumerate(candidates[:len(neighbour_rows) - 1], start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, pivot_df.index[row], dist))

Recommendations for River's End:

1: Genuine Lies, with distance of 0.7096796892228145:
2: Considering Kate (The Stanislaskis) (Silhouette Special Edition), with distance of 0.7421524999011877:
3: Born in Ice, with distance of 0.7519153790515136:
4: Private Scandals, with distance of 0.7586325137125823:
5: Montana Sky, with distance of 0.760181556140312:
