## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Load Dataset

In [2]:
# Read the book metadata and the user ratings from CSV files in the local
# `book/` folder (paths are relative to the notebook's working directory).
book_df = pd.read_csv(r'book/Books.csv')
rating_df = pd.read_csv(r'book/Ratings.csv')

## Book Data Preprocessing

In [3]:
book_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
book_df.shape

(271360, 8)

In [None]:
book_df.isna().sum()

In [None]:
# Discard every book row that has a missing value in any column.
book_df = book_df.dropna(axis=0, how='any')

In [None]:
book_df.isna().sum()

## Rating Data Preprocessing

In [None]:
rating_df.head()

In [None]:
rating_df.shape

In [None]:
rating_df.isna().sum()

## Exploratory Data Analysis

In [None]:
user_rating = rating_df['User-ID'].value_counts()

In [None]:
user_rating

In [None]:
# Bar chart: how many times each rating value appears in the ratings table.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=rating_df, x='Book-Rating', ax=ax)
ax.set_title('Distribution of Book Ratings')
ax.set_xlabel('Book Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of rating rows recorded for each ISBN.
num_ratings_per_book = rating_df['ISBN'].value_counts()

# Histogram (with KDE overlay) of those per-book counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(num_ratings_per_book, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Number of Ratings per Book')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Books')
plt.show()

In [None]:
# Mean rating given by each user.
user_ratings_dist = rating_df.groupby('User-ID')['Book-Rating'].mean()

# Histogram (with KDE overlay) of the per-user mean ratings.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(user_ratings_dist, bins=10, kde=True, ax=ax)
ax.set_title('Distribution of Average User Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Number of Users')
plt.show()

In [None]:
rating_df.info()

In [None]:
rating_df.describe()

## Preprocess for Machine Learning

In [None]:
# Drop the publication and image-URL columns; the cells below only use the
# ISBN and title of each book.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed.
book_df = book_df.drop(
    columns=['Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
    errors='ignore',
)

In [None]:
book_df

In [None]:
ratings = rating_df['User-ID'].value_counts()

In [None]:
ratings.sort_values(ascending=False).head()

### Filter users with fewer than 200 ratings

In [None]:
len(ratings[ratings < 200])

In [None]:
rating_df['User-ID'].isin(ratings[ratings < 200].index).sum()

In [None]:
# Users below the 200-rating threshold are the ones to discard.
filtered_users = ratings.loc[ratings.lt(200)].index

# Keep every rating row whose user is NOT in the discard list.
df_ratings_rm = rating_df.loc[~rating_df['User-ID'].isin(filtered_users)]

df_ratings_rm.shape

### Filter books with fewer than 100 ratings

In [None]:
# Count how many rating rows each ISBN has in the full ratings table.
# NOTE: `ratings` previously held the per-user counts and is rebound here to
# per-ISBN counts; re-running the user-filter cells after this cell would
# therefore compare User-IDs against ISBN counts.
ratings = rating_df['ISBN'].value_counts() 
# Show the most-rated ISBNs.
ratings.sort_values(ascending=False).head()

In [None]:
len(ratings[ratings < 100])

In [None]:
book_df['ISBN'].isin(ratings[ratings < 100].index).sum()

In [None]:
# ISBNs below the 100-rating threshold are the ones to discard.
filtered_isbns = ratings.loc[ratings.lt(100)].index

# From the user-filtered ratings, keep only rows whose ISBN is NOT discarded.
df_ratings_rm = df_ratings_rm.loc[~df_ratings_rm['ISBN'].isin(filtered_isbns)]

df_ratings_rm.shape

In [None]:
df_ratings_rm.head()

In [None]:
book_df.head()

In [None]:
# Build the rating matrix: pivot to one row per User-ID and one column per ISBN,
# fill user/book pairs that have no rating with 0, then transpose so that each
# row is a book (ISBN) and each column is a user.
# NOTE(review): after fillna(0) a missing rating cannot be told apart from a
# stored rating of 0.
df = df_ratings_rm.pivot_table(index=['User-ID'],columns=['ISBN'],values='Book-Rating').fillna(0).T
df.head()

In [None]:
# Replace the ISBN row labels of the rating matrix with book titles.
# Build a one-title-per-ISBN lookup first: if `book_df` lists an ISBN more than
# once, a plain join would emit extra rows and the new index would no longer
# have the same length as `df`, making the assignment fail.
isbn_to_title = book_df.drop_duplicates(subset='ISBN').set_index('ISBN')['Book-Title']

# reindex keeps the row order of `df`; an ISBN with no entry in `book_df`
# gets NaN as its label, exactly as the left join did.
df.index = isbn_to_title.reindex(df.index)

In [None]:
df = df.sort_index()
df.head()

In [None]:
df.loc["I'll Be Seeing You"][:5]

In [None]:
df.to_csv('preprocessed_book_data.csv', index=True)

### Create KNN Model

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# Nearest-neighbour search using the cosine metric between rows of `df`.
# Each row of `df` is one book and each column one user, so the neighbours of a
# book are the books that the same users rated in a similar way.
model = NearestNeighbors(metric='cosine')
model.fit(df.values)

### Check Prediction

In [None]:
title = "I'll Be Seeing You"
df.loc[title].shape

In [None]:
# Query the fitted model with this book's row of ratings. n_neighbors=6 asks for
# six rows; the result is a pair of parallel arrays (distances, row positions
# into `df`), each with one inner row for the single query vector.
distance, indice = model.kneighbors([df.loc[title].values], n_neighbors=6)

print(distance)
print(indice)

In [None]:
# Tabulate the neighbours returned by the query, largest distance first.
(
    pd.DataFrame({
        'title': df.index[indice[0]].values,
        'distance': distance[0],
    })
    .sort_values(by='distance', ascending=False)
)

In [None]:
def get_recommends(title="", n_recommendations=5):
    """Return the books whose rating vectors are closest to ``title``.

    Looks ``title`` up in the notebook-level book-by-user matrix ``df`` and
    queries the notebook-level fitted ``model`` for its nearest neighbours.

    Parameters
    ----------
    title : str
        Book title to look up in ``df.index``.
    n_recommendations : int, default 5
        Maximum number of books to return. Fewer are returned when ``df`` does
        not contain enough other books.

    Returns
    -------
    list
        ``[]`` (after printing a message) when ``title`` is not in
        ``df.index``; otherwise ``[title, [[other_title, distance], ...]]``
        ordered from the smallest distance to the largest.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is smaller than 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")

    if title not in df.index:
        print(f"The book '{title}' does not exist in the dataset.")
        return []

    # Several rows can carry the same title (the rows were keyed by ISBN before
    # being relabelled). `df.loc[title]` would then return all of them and the
    # reshaped vector would have the wrong number of features, so query with
    # the first matching row only.
    same_title = np.asarray(df.index == title)
    query_pos = np.flatnonzero(same_title)[0]
    book_vector = df.iloc[[query_pos]].values

    # Ask for enough neighbours to still have `n_recommendations` left after
    # the rows that share the query title are removed, without exceeding the
    # number of rows available.
    n_neighbors = min(n_recommendations + int(same_title.sum()), len(df))
    distances, indices = model.kneighbors(book_vector, n_neighbors=n_neighbors)
    distances, indices = distances[0], indices[0]

    # Order by distance explicitly (stable, so ties keep the model's order).
    order = np.argsort(distances, kind='stable')
    distances, indices = distances[order], indices[order]

    # Drop the query book by identity rather than by assuming it is the first
    # result; tied distances can put another row first.
    keep = ~same_title[indices]
    top_indices = indices[keep][:n_recommendations]
    top_distances = distances[keep][:n_recommendations]

    recommended_books = [
        [df.index[row_pos], float(dist)]
        for row_pos, dist in zip(top_indices, top_distances)
    ]

    return [title, recommended_books]

In [None]:
books = get_recommends("I'll Be Seeing You")
print(books)

## Save Model

In [None]:
import joblib

joblib.dump(model, 'book_recommender_model.joblib')

In [None]:
loaded_model = joblib.load('book_recommender_model.joblib')

In [None]:
# Sanity check: the model reloaded from disk should produce recommendations
# for a known title.
title = "I'll Be Seeing You"

if title in df.index:
    # Query with a single row even if several rows share this title;
    # `df.loc[title]` would return all of them and the reshaped vector would
    # have the wrong number of features for the model.
    query_pos = np.flatnonzero(np.asarray(df.index == title))[0]
    book_vector = df.iloc[[query_pos]].values
    distances, indices = loaded_model.kneighbors(book_vector, n_neighbors=6)

    # Print the recommended books
    recommended_books = df.index[indices.flatten()][1:]  # Exclude the book itself
    print(f"Recommendations for '{title}':")
    # The loop variable is deliberately not called `distance`: that name holds
    # the array returned by the earlier kneighbors check, and overwriting it
    # with a scalar breaks that cell on re-run.
    for book, book_distance in zip(recommended_books, distances.flatten()[1:]):
        print(f"{book} (similarity score: {1 - book_distance:.2f})")
else:
    print(f"Book '{title}' not found in the dataset.")
