In [1]:
# Import Libraries

import html
import os
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

## Loading and Preprocessing the Dataset

This section of the code **loads, inspects, and preprocesses** the book recommendation dataset. It consists of three CSV files:

- **Books.csv** → Contains book details (title, author, ISBN, etc.).
- **Ratings.csv** → Stores user ratings for books.
- **Users.csv** → Includes user demographic information.

### **Steps in the Process:**

1. **Load the Datasets:**
   - Uses `pd.read_csv()` to load the three datasets into Pandas DataFrames.
   - `low_memory=False` is set for `Books.csv` so that pandas reads the whole file before inferring column types, which avoids mixed-type columns at the cost of higher memory use.

2. **Inspect the Data:**
   - `head()` displays the first few rows of each dataset for an overview.
   - `shape` shows the number of rows and columns.

3. **Handle Missing Data:**
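   - `isnull().sum()` counts the missing values in each column of the `books` dataset; in this notebook it runs after the drop described below, so it confirms that none remain.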
   - `dropna(inplace=True)` removes any rows containing missing values.

4. **Remove Duplicate Book Titles:**
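   - `drop_duplicates('Book-Title')` displays the books table with repeated titles removed. The result is not assigned, so `books` itself still contains every row; later steps de-duplicate by title where they need to.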

5. **Fix HTML-Encoded Titles:**
   - `html.unescape()` is applied to `Book-Title` to correct improperly encoded characters.

6. **Merge Ratings with Book Titles:**
   - `ratings.merge(books, on='ISBN')` joins the `ratings` and `books` datasets so that ratings include book details instead of just ISBNs.

This **cleaned and structured dataset** serves as the foundation for building the recommendation system. 


In [2]:
# Read the three raw CSV tables into DataFrames.
# low_memory=False lets pandas infer the Books.csv column types from the whole
# file instead of chunk by chunk.
books = pd.read_csv(filepath_or_buffer='data/input/Books.csv', low_memory=False)
ratings = pd.read_csv(filepath_or_buffer='data/input/Ratings.csv')
users = pd.read_csv(filepath_or_buffer='data/input/Users.csv')

In [3]:
# Preview the first five rows of the books table.
books.head(n=5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
# (rows, columns) of the books table as loaded
books.shape

(271360, 8)

In [7]:
# (rows, columns) of the ratings table as loaded
ratings.shape

(1149780, 3)

In [8]:
# (rows, columns) of the users table as loaded
users.shape

(278858, 3)

In [9]:
# Keep only the book rows that have a value in every column.
books = books.dropna()

In [10]:
# Per-column count of missing values still present in `books`.
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64

In [11]:
# Shows the books table with repeated 'Book-Title' values removed.
# NOTE(review): the de-duplicated frame is only displayed, never assigned, so
# `books` still contains every row (including repeated titles) after this cell.
# Confirm whether that is intended before assigning the result, because the
# ISBN merge further down would then lose ratings for the dropped rows.
books.drop_duplicates('Book-Title')

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271354,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [12]:
# Decode HTML entities (e.g. '&amp;') in every book title.
books['Book-Title'] = books['Book-Title'].map(html.unescape)

In [13]:
# Attach book details to each rating; the default inner join keeps only
# ratings whose ISBN is present in `books`.
books_with_ratings = pd.merge(ratings, books, on='ISBN')

## Collaborative Filtering-Based Book Recommendation System

This section of the code **implements a book recommender system** using **collaborative filtering** based on user-book interactions. The goal is to suggest books similar to a given book based on user ratings.

### **Steps in the Process:**

1. **Merge Ratings and Books Data:**
   - Combines the `ratings` dataset with `books` using the `ISBN` as the key to include book details.

2. **Filter Active Users:**
   - Identifies users who have rated more than **200 books** to focus on engaged readers.
   - Stores these **active users** for further filtering.

3. **Filter Popular Books:**
   - Selects books that have at least **50 ratings** to ensure recommendations are based on meaningful interactions.

4. **Create a User-Book Pivot Table (`pivot_table`):**
   - Converts the filtered data into a **matrix** where:
     - **Rows** represent book titles.
     - **Columns** represent user IDs.
     - **Values** are the ratings users gave to books.
   - Missing values are filled with `0`.

5. **Compute Similarity Using Cosine Similarity:**
   - Uses `cosine_similarity(pivot_table)` to measure how similar books are based on user ratings.
   - This results in a **similarity matrix** where each book is compared to others.

6. **Implement the `recommend()` Function:**
   - Finds the **index** of the given book in the pivot table.
   - Retrieves the **top 4 most similar books** based on cosine similarity scores.
   - Extracts book details (title, author, and image) from the `books` dataset.
   - Returns the **recommended books** as a list.

### **Usage Example:**
Running `recommend('The Da Vinci Code')` returns a list of similar books based on user preferences.

This **collaborative filtering approach** enables the system to make recommendations **without requiring explicit book features**, relying instead on **patterns in user behavior**.


In [14]:
# Users with more than 200 rated books in the merged table count as "active".
ratings_per_user = books_with_ratings.groupby('User-ID')['Book-Rating'].count()
active_users = ratings_per_user[ratings_per_user > 200].index

In [15]:
# Restrict the merged ratings to rows written by the active users.
is_active_user = books_with_ratings['User-ID'].isin(active_users)
active_user_ratings = books_with_ratings.loc[is_active_user]

In [16]:
# Titles with at least 50 ratings from active users. Despite the variable name,
# the threshold is on the number of ratings, not on their value.
ratings_per_title = active_user_ratings.groupby('Book-Title')['Book-Rating'].count()
highly_rated_books = ratings_per_title[ratings_per_title >= 50].index

In [17]:
# Keep only the active-user ratings that belong to the frequently rated titles.
is_frequent_title = active_user_ratings['Book-Title'].isin(highly_rated_books)
filtered_popular_ratings = active_user_ratings.loc[is_frequent_title]

In [18]:
# Book x user rating matrix: one row per title, one column per user.
# Repeated (title, user) pairs are combined by pivot_table's default aggregation.
pivot_table = pd.pivot_table(
    filtered_popular_ratings,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
)

In [19]:
# A user who never rated a title gets 0 in that cell.
pivot_table = pivot_table.fillna(0)

In [20]:
# Preview the first ten titles of the rating matrix.
pivot_table.head(n=10)

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Case of Need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
"A Child Called \It\"": One Child's Courage to Survive""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Civil Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Day Late and a Dollar Short,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Fine Balance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Pairwise cosine similarity between the book rows of the rating matrix.
similarity = cosine_similarity(pivot_table.to_numpy())

In [22]:
# Square matrix: one row and one column per title in `pivot_table`
similarity.shape

(706, 706)

In [23]:
def recommend(book_name, top_n=4):
    """Return the books most similar to ``book_name`` by cosine similarity.

    Looks ``book_name`` up in the module-level ``pivot_table`` index, ranks
    every *other* row of the precomputed ``similarity`` matrix against it, and
    collects display details for the best matches from ``books``.

    Args:
        book_name: Title exactly as it appears in ``pivot_table.index``.
        top_n: Maximum number of recommendations to return (default 4).

    Returns:
        A list with one ``[title, author, image_url_m]`` list per recommended
        book, ordered from most to least similar. An empty list is returned
        (after printing an error message) when ``book_name`` is not in
        ``pivot_table.index``.
    """
    # Same not-found handling as recommend_books(), instead of a bare IndexError.
    if book_name not in pivot_table.index:
        print(f"Error: '{book_name}' not found in the dataset.")
        return []

    # index fetch
    index = np.where(pivot_table.index == book_name)[0][0]

    # Exclude the query row explicitly rather than assuming it sorts first:
    # another title with an equal top score could otherwise push the query
    # book itself into the results.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[index])
        if position != index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    similar_items = candidates[:max(top_n, 0)]

    data = []
    for position, _score in similar_items:
        title = pivot_table.index[position]
        # `books` may hold several rows with the same title; keep the first.
        details = books[books['Book-Title'] == title].drop_duplicates('Book-Title')

        item = []
        item.extend(details['Book-Title'].to_list())
        item.extend(details['Book-Author'].to_list())
        item.extend(details['Image-URL-M'].to_list())
        data.append(item)

    return data

In [24]:
# Demo: titles most similar to 'The Da Vinci Code' in the similarity matrix
recommend('The Da Vinci Code')

[['Angels & Demons',
  'Dan Brown',
  'http://images.amazon.com/images/P/0671027360.01.MZZZZZZZ.jpg'],
 ['Touching Evil',
  'Kay Hooper',
  'http://images.amazon.com/images/P/0553583441.01.MZZZZZZZ.jpg'],
 ['Saving Faith',
  'David Baldacci',
  'http://images.amazon.com/images/P/0446608890.01.MZZZZZZZ.jpg'],
 ["The Sweet Potato Queens' Book of Love",
  'JILL CONNER BROWNE',
  'http://images.amazon.com/images/P/0609804138.01.MZZZZZZZ.jpg']]

## Identifying Popular Books

This section processes the dataset to **identify and rank popular books** based on user ratings. It follows these steps:

### **1. Count the Number of Ratings per Book**
- Groups the dataset by **Book-Title** and counts how many times each book has been rated.
- Stores this information in `num_ratings`, with the count column renamed to `num_ratings`.

### **2. Calculate the Average Rating per Book**
- Computes the **mean rating** for each book and stores it in `avg_ratings`, renaming the column to `avg_rating`.

### **3. Merge the DataFrames**
- Combines `num_ratings` and `avg_ratings` into a new dataset called `popular_books`.
- This dataset now includes both the number of ratings and the average rating for each book.

### **4. Filter for Popular Books**
- Selects books that have received **at least 250 ratings** to ensure only widely reviewed books are considered.
- Sorts the books in **descending order** based on their average rating, keeping only the top **50 books**.

### **5. Merge with Book Details**
- Joins `popular_books` with the `books` dataset to include **author names and large-sized images (`Image-URL-L`)**.
- Removes duplicate book titles to ensure uniqueness.

### **6. Ensure Secure Image URLs**
- Converts **HTTP** image links to **HTTPS** for security and proper rendering.

The resulting `popular_books` contains the **top 50 most popular books** based on user ratings, ready to be displayed in the recommendation system. 🚀


In [25]:
# Number of ratings per title. Selecting 'Book-Rating' before aggregating
# avoids counting every other column of the merged frame only to discard it;
# the resulting table is the same.
num_ratings = (
    books_with_ratings
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num_ratings')
)
num_ratings.head()

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [26]:
# Mean rating per title.
# NOTE(review): ratings of 0 are included in the mean — confirm that 0 is a
# genuine score rather than a placeholder for "no explicit rating".
avg_ratings = (
    books_with_ratings
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_rating')
)
avg_ratings.head()

Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


In [27]:
# One row per title carrying both its rating count and its mean rating.
popular_books = pd.merge(num_ratings, avg_ratings, on='Book-Title')

In [28]:
# Titles with at least 250 ratings, best average first, limited to 50 rows.
popular_books = (
    popular_books
    .loc[popular_books['num_ratings'] >= 250]
    .sort_values(by='avg_rating', ascending=False)
    .head(50)
)

In [29]:
# Add author and large cover URL. `books` can match a title more than once, so
# only the first matching row per title is kept.
# Re-running this cell without re-running the ones before it merges `books` a
# second time.
popular_books = (
    popular_books
    .merge(books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    [['Book-Title', 'Book-Author', 'Image-URL-L', 'num_ratings', 'avg_rating']]
)

In [30]:
# Rewrite the cover links from http:// to https:// (plain substring replace).
popular_books = popular_books.assign(**{
    'Image-URL-L': popular_books['Image-URL-L'].str.replace('http://', 'https://', regex=False),
})

## K-Nearest Neighbors (KNN) for Book Recommendation

This section applies the **K-Nearest Neighbors (KNN)** algorithm to recommend books based on user preferences. Using the user-book matrix created earlier, the model identifies books that are most similar to a given book using cosine similarity.

### **Steps in the Process:**

1. **Convert to Sparse Matrix:**
    - The pivot table is converted into a **sparse matrix** using `csr_matrix` to reduce memory usage and computational cost.

2. **Build and Fit the KNN Model:**
    - The model is created using `NearestNeighbors` with the **cosine** distance metric (1 − cosine similarity) and **brute-force** search for neighbor identification.
    - The model is then fitted to the sparse matrix.

3. **Recommendation Function:**
    - The `recommend_books()` function takes a book title as input.
    - It checks whether the book exists in the dataset.
    - Using the KNN model, it requests **11 nearest neighbors** so that, once the input book itself is removed, **10 recommendations** remain.
    - For each recommended book, it retrieves details such as **title**, **author**, and **cover image**.

4. **Example Usage:**
To get book recommendations similar to *Harry Potter and the Chamber of Secrets (Book 2)*, you can use the following command: `recommend_books('Harry Potter and the Chamber of Secrets (Book 2)')`


In [31]:
# Compressed sparse row copy of the rating matrix (most cells are 0).
sparse_matrix = csr_matrix(pivot_table.to_numpy())

In [32]:
# Brute-force nearest-neighbour search over the book vectors, cosine metric.
model = NearestNeighbors(algorithm='brute', metric='cosine')
# Fit on the sparse book x user matrix; the fitted estimator is the cell output.
model.fit(sparse_matrix)

In [33]:
# Function to Recommend Books using KNN
def recommend_books(book_name, n_neighbors=10):
    """Return the nearest neighbours of ``book_name`` from the fitted KNN model.

    Looks ``book_name`` up in the module-level ``pivot_table`` index, queries
    ``model`` with that book's row of ``sparse_matrix``, and collects display
    details for the neighbours from ``books``.

    Args:
        book_name: Title exactly as it appears in ``pivot_table.index``.
        n_neighbors: Maximum number of recommendations to return (default 10).

    Returns:
        A list with one ``[title, author, image_url_m]`` list per recommended
        book, in the order returned by ``model.kneighbors``. An empty list is
        returned (after printing an error message) when ``book_name`` is not
        in ``pivot_table.index``.
    """
    if book_name not in pivot_table.index:
        print(f"Error: '{book_name}' not found in the dataset.")
        return []

    index = np.where(pivot_table.index == book_name)[0][0]

    book_vector = sparse_matrix[index].toarray().reshape(1, -1)
    # Ask for one extra neighbour to leave room for the query book itself, but
    # never for more rows than the matrix has.
    k = min(max(n_neighbors, 0) + 1, sparse_matrix.shape[0])
    _, indices = model.kneighbors(book_vector, n_neighbors=k)

    # Drop the query row by position rather than assuming it comes back first:
    # a book with an identical rating vector can tie with it at distance 0.
    neighbour_positions = [i for i in indices[0] if i != index][:max(n_neighbors, 0)]

    data = []
    for i in neighbour_positions:
        title = pivot_table.index[i]
        # `books` may hold several rows with the same title; keep the first.
        details = books[books['Book-Title'] == title].drop_duplicates('Book-Title')

        item = []
        item.extend(details['Book-Title'].to_list())
        item.extend(details['Book-Author'].to_list())
        item.extend(details['Image-URL-M'].to_list())
        data.append(item)

    return data

In [34]:
# Demo: KNN recommendations for one Harry Potter title
recommend_books('Harry Potter and the Chamber of Secrets (Book 2)')

[['Harry Potter and the Prisoner of Azkaban (Book 3)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Book 1)",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg'],
 ["Charlotte's Web (Trophy Newbery)",
  'E. B. White',
  'http://images.amazon.com/images/P/0064400557.01.MZZZZZZZ.jpg'],
 ['The Fellowship of the Ring (The Lord of the Rings, Part 1)',
  'J.R.R. TOLKIEN',
  'http://images.amazon.com/images/P/0345339703.01.MZZZZZZZ.jpg'],
 ['The Witness

## Serializing and Saving Objects with Pickle

This section of the code **saves important data structures** using Python's `pickle` module. The purpose is to **store precomputed objects** so they can be reloaded later without recomputing them.

### Code Breakdown:
- **`collaborative_filtering.pkl`** → Saves the similarity matrix for recommendations.
- **`books.pkl`** → Stores the book dataset.
- **`pivot_table.pkl`** → Saves the pivot table of books and user interactions.
- **`popular_books.pkl`** → Stores the precomputed dataset of popular books.

### How It Works:
- The `pickle.dump()` function serializes Python objects into `.pkl` files.
- The `with open(..., 'wb')` statement opens files in **write-binary (`wb`)** mode to store the data.
- These files can later be **reloaded using `pickle.load()`** to avoid recomputation.

This approach **improves efficiency** when working with large datasets, as you don't need to generate these objects every time the script runs.

In [35]:
# Directory that receives the pickled artefacts written by the cells below.
file_path = "data/models/"
# Create it up front so the dumps do not fail with FileNotFoundError after all
# the computation above has already run.
os.makedirs(file_path, exist_ok=True)

In [36]:
# Persist the cleaned books table.
with open(file_path + 'books.pkl', mode='wb') as books_file:
    pickle.dump(books, books_file)

In [37]:
# Persist the book x user rating matrix.
with open(file_path + 'pivot_table.pkl', mode='wb') as pivot_file:
    pickle.dump(pivot_table, pivot_file)

In [38]:
# Persist the precomputed book-to-book cosine similarity matrix.
with open(file_path + 'collaborative_filtering.pkl', mode='wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [39]:
# Persist the top-50 popular books table.
with open(file_path + 'popular_books.pkl', mode='wb') as popular_file:
    pickle.dump(popular_books, popular_file)