### **Project 2: Book Recommendation System**

I've built both a popularity-based and a collaborative-filtering book recommendation system.

#### **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### **Loading Data from Google Drive and Reading the Datasets**

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV files stored there are readable from the Colab VM.
drive.mount('/content/drive')

# Single place to edit when the data moves; the folder used to be repeated in every read.
DATA_DIR = '/content/drive/My Drive/Infotact Internship/Project 2'

books = pd.read_csv(f'{DATA_DIR}/Books.csv')
users = pd.read_csv(f'{DATA_DIR}/Users.csv')
ratings = pd.read_csv(f'{DATA_DIR}/Ratings.csv')

ValueError: mount failed

#### **Previewing and Checking the Data**

In [None]:
# Preview the first rows of the books table.
books.head()

In [None]:
# Preview the first rows of the users table.
users.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# (rows, columns) of each table, printed in the order books, ratings, users.
for frame in (books, ratings, users):
    print(frame.shape)

### **Clean & Preprocess**

##### **Check missing values summary**

In [None]:
# Count of missing values per column in the books table.
books.isnull().sum()

In [None]:
# Count of missing values per column in the users table.
users.isnull().sum()

In [None]:
# Count of missing values per column in the ratings table.
ratings.isnull().sum()

In [None]:
# Number of fully duplicated rows in the books table.
print(books.duplicated().sum())

In [None]:
# Number of fully duplicated rows in the ratings table.
print(ratings.duplicated().sum())

In [None]:
# Number of fully duplicated rows in the users table.
print(users.duplicated().sum())

#### **Normalize column names for consistency**

In [None]:
# Lower-case every column name and replace hyphens with underscores so all
# three tables follow one naming convention.
for frame in (books, users, ratings):
    frame.columns = frame.columns.str.lower().str.replace('-', '_')

####  **Merge data for EDA**

In [None]:
# One wide table for EDA: each rating joined to its book (by ISBN) and its user (by user id).
merged_df = (
    ratings
    .merge(books, on='isbn')
    .merge(users, on='user_id')
)

### **EDA**

##### **Most rated books**

In [None]:
# Count rating rows per title and keep the ten titles with the most rows.
top_books = merged_df['book_title'].value_counts().head(10)
print("\nTop 10 Most Rated Books:\n", top_books)

##### **Distribution of ratings**

In [None]:
plt.figure(figsize=(6, 4))
# seaborn 0.13 deprecates `palette` without `hue`; mapping `hue` to the same column
# (and hiding the redundant legend) keeps the per-rating colours without the warning.
sns.countplot(data=merged_df, x='book_rating', hue='book_rating', palette='viridis', legend=False)
plt.title("Distribution of Book Ratings")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

##### **Top users with most ratings**

In [None]:
# Count rating rows per user id and keep the ten users with the most rows.
top_users = merged_df['user_id'].value_counts().head(10)
print("\nTop 10 Active Users (by rating count):\n", top_users)

##### **Age distribution of users**

In [None]:
# Histogram (with KDE overlay) of the known user ages; missing ages are dropped first.
fig, ax = plt.subplots(figsize=(6, 4))
known_ages = users['age'].dropna()
sns.histplot(known_ages, bins=30, kde=True, color='teal', ax=ax)
ax.set_title("User Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

### **Popularity Based Recommender System**

#### **Merge Ratings + Books**

In [None]:
# Attach book metadata to every rating; ratings whose ISBN is absent from `books`
# are dropped because merge defaults to an inner join.
ratings_with_name = ratings.merge(books,on='isbn')

In [None]:
# Shape of the merged table; reuse the frame built above instead of repeating the merge.
ratings_with_name.shape

In [None]:
# Display the merged ratings + books table.
ratings_with_name

##### **We are calculating the number of ratings for each book**

In [None]:
# Number of (non-null) ratings each title received, as a two-column frame.
number_of_rating_df = (
    ratings_with_name
    .groupby('book_title')['book_rating']
    .count()
    .reset_index()
    .rename(columns={'book_rating': 'number_of_ratings'})
)
number_of_rating_df

##### **We are calculating the average rating of each book**

In [None]:
# Mean rating of each title, as a two-column frame.
avg_rating_df = (
    ratings_with_name
    .groupby('book_title')['book_rating']
    .mean()
    .reset_index()
    .rename(columns={'book_rating': 'avg_rating'})
)
avg_rating_df

##### **We are basically merging both "number_of_rating_df" and "avg_rating_df" dataframes and named it as "popular_df"**

In [None]:
# One row per title carrying both its rating count and its average rating.
popular_df = number_of_rating_df.merge(avg_rating_df,on='book_title')
popular_df

##### **Now, we will consider only those books whose number_of_ratings is at least 250**

In [None]:
# Preview only (nothing is assigned): titles with 250 or more ratings.
popular_df[popular_df['number_of_ratings']>=250]

##### **Now, we are sorting the avg_rating in descending order and displaying the top 50 books only**

In [None]:
# Keep titles with at least 250 ratings, rank them by average rating, take the best 50.
popular_df = (
    popular_df
    .loc[lambda frame: frame['number_of_ratings'] >= 250]
    .sort_values('avg_rating', ascending=False)
    .head(50)
)
popular_df

In [None]:
# Attach the book metadata (author, cover URLs, ...) to the top-50 table.
# NOTE(review): this rebinds `popular_df`, so the later cells that merge with `books`
# again start from an already-merged frame, and re-running this cell on its own output
# would produce *_x / *_y suffixed columns — confirm this is intended.
popular_df = popular_df.merge(books,on = 'book_title')
popular_df

##### **Now, as we can see, the number of rows increases to 196 because some books (for example, the Harry Potter books) appear under several ISBNs, so we will drop the duplicates**

In [None]:
# Collapse the per-ISBN duplicates so each title appears once (the first row per title is kept).
# NOTE(review): `popular_df` was already merged with `books` in the previous cell, so this
# second merge presumably yields *_x / *_y suffixed metadata columns — verify the columns.
popular_df = popular_df.merge(books,on='book_title').drop_duplicates('book_title')
popular_df

In [None]:
# Preview only (nothing is assigned): shape after one more merge + de-duplication by title.
popular_df.merge(books,on='book_title').drop_duplicates('book_title').shape

##### **Now, we keep only the columns we need: 'book_title', 'book_author', 'image_url_m', 'number_of_ratings', 'avg_rating'**

In [None]:
# Build the final table from the three stable columns plus one metadata row per title.
# Merging an already-merged frame with `books` again creates *_x / *_y duplicates, so the
# plain 'book_author' / 'image_url_m' selection used to depend on how many merges had run
# before and raised KeyError when this cell was executed twice. This form is idempotent.
book_details = books.drop_duplicates('book_title')[['book_title', 'book_author', 'image_url_m']]
popular_df = (
    popular_df[['book_title', 'number_of_ratings', 'avg_rating']]
    .drop_duplicates('book_title')
    .merge(book_details, on='book_title')
    [['book_title', 'book_author', 'image_url_m', 'number_of_ratings', 'avg_rating']]
)


In [None]:
# Fix broken image links: title -> working cover URL.
replacement_covers = {
    "The Hitchhiker's Guide to the Galaxy": "https://m.media-amazon.com/images/I/71i2fm1QJQL.jpg",
    "Outlander": "https://m.media-amazon.com/images/I/81RU5F03oqL.jpg",
    "The Color Purple": "https://m.media-amazon.com/images/I/71f6DRbcrsL._UF1000,1000_QL80_.jpg",
}
for title, cover_url in replacement_covers.items():
    popular_df.loc[popular_df['book_title'] == title, 'image_url_m'] = cover_url

In [None]:
# Display the final popularity table.
popular_df

In [None]:
# Disabled sanity check: uncomment to display the cover URLs of the three patched titles.
# popular_df[popular_df['book_title'].isin([
#     "The Hitchhiker's Guide to the Galaxy",
#     "Outlander",
#     "The Color Purple"
# ])][['book_title', 'image_url_m']]


In [None]:
# Cover URL of the first row. `.iloc[0]` selects by position, whereas the former `[0]`
# was a label lookup that only works while the index still contains the label 0.
popular_df['image_url_m'].iloc[0]

### **Collaborative Filtering Based Recommender System**

In [None]:
# Collaborative filtering starts from the "ratings_with_name" dataframe
# (ratings merged with book metadata) built in the popularity section.
ratings_with_name

##### **Let's find out how many books each user has rated**

In [None]:
# Number of rating rows per user id.
ratings_with_name.groupby('user_id').count()['book_rating']

#### **User based filtering**

**Criteria:** We are basically doing the filtering i.e. we only consider users who rated more than 200 books

In [None]:
# Boolean Series indexed by user id: True where the user has more than 200 ratings.
ratings_with_name.groupby('user_id').count()['book_rating'] > 200

##### **We keep only the users who rated more than 200 books (users with 200 or fewer ratings are dropped)**

In [None]:
# True for every user id whose rating count exceeds 200.
x = ratings_with_name.groupby('user_id')['book_rating'].count() > 200
# Boolean indexing: keep only the entries that are True.
x.loc[x]

##### **We store the IDs of the users who rated more than 200 books in the variable 'appreciable_users'**

In [None]:
# Index (user ids) of the entries of `x` that are True.
appreciable_users = x[x].index

##### **From all rows, we display only those whose user is one of the 'appreciable_users'**

In [None]:
# Preview only (nothing is assigned): ratings made by the users in `appreciable_users`.
ratings_with_name[ratings_with_name['user_id'].isin(appreciable_users)]

In [None]:
# Ratings restricted to the users in `appreciable_users`.
filtered_rating = ratings_with_name[ratings_with_name['user_id'].isin(appreciable_users)]

#### **Books based filtering**

**Criteria:** Consider books that got at least 50 ratings

In [None]:
# Boolean Series indexed by title: True where the title has 50 or more ratings
# among the already-filtered users.
filtered_rating.groupby('book_title').count()['book_rating']>=50

#### **We are removing the books that received fewer than 50 ratings**

In [None]:
# True for every title with at least 50 ratings from the retained users.
y = filtered_rating.groupby('book_title')['book_rating'].count() >= 50
# Boolean indexing: keep only the entries that are True.
y.loc[y]

##### **We store the books that received at least 50 ratings in the variable 'famous_books'**

In [None]:
# Index (titles) of the entries of `y` that are True.
famous_books = y[y].index

##### **From all rows, we display only those whose book is one of the 'famous_books'**

In [None]:
# Preview only (nothing is assigned): ratings whose title is in `famous_books`.
filtered_rating[filtered_rating['book_title'].isin(famous_books)]

In [None]:
# Ratings restricted to both the retained users and the titles in `famous_books`.
final_ratings = filtered_rating[filtered_rating['book_title'].isin(famous_books)]

#### **Creating a pivot table containing users who rated more than 200 books and books with at least 50 ratings**

In [None]:
# Rows are book titles, columns are user ids, cells hold the rating for that pair.
# NOTE(review): pivot_table combines repeated (title, user) pairs with its default
# aggregation (mean) — presumably this occurs when a user rated several editions; verify.
pt = final_ratings.pivot_table(index='book_title',columns='user_id',values='book_rating')

In [None]:
# Display the pivot table before missing values are filled.
pt

In [None]:
# Replace every NaN (no rating for that title/user pair) with 0.
pt = pt.fillna(0)

In [None]:
# Display the pivot table after NaN values were replaced with 0.
pt

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

##### **we have to calculate the cosine similarity of each row(book) with every other row(book)**

In [None]:
# Preview only (nothing is assigned): pairwise cosine similarity between the rows of `pt`.
cosine_similarity(pt)

In [None]:
# Row i holds the similarity of book i (row i of `pt`) to every book in `pt`.
similarity_scores = cosine_similarity(pt)

In [None]:
# Shape of the similarity matrix.
similarity_scores.shape

#### **Defining a function that takes a book name as input and suggests 5 similar books**

**It works like this:**


**def** recommend(book_name):

  **return** suggestion




In [None]:
def recommend(book_name, n=5):
    """Return the `n` books most similar to `book_name`.

    Reads three notebook-level objects: `pt` (pivot table whose index holds the
    book titles), `similarity_scores` (square matrix aligned with `pt.index`)
    and `books` (metadata table with 'book_title', 'book_author', 'image_url_m').

    Parameters
    ----------
    book_name : str
        Title to look up; must be present in `pt.index`.
    n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list[list]
        One `[title, author, image_url_m]` entry per recommendation, most
        similar first. An entry is empty if the title has no row in `books`.

    Raises
    ------
    IndexError
        If `book_name` is not in `pt.index`.
    """
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        # Same exception type as the bare `[0][0]` lookup, but with a usable message.
        raise IndexError(f"{book_name!r} is not one of the titles in pt.index")
    index = matches[0]

    ranked = sorted(enumerate(similarity_scores[index]), key=lambda pair: pair[1], reverse=True)
    # Drop the query book explicitly: with tied scores or an all-zero row it is not
    # guaranteed to sort first, so slicing off position 0 could drop the wrong book.
    similar_items = [pair for pair in ranked if pair[0] != index][:n]

    data = []
    for position, _score in similar_items:
        # One metadata row per title (the first row wins when a title has several ISBNs).
        match = books[books['book_title'] == pt.index[position]].drop_duplicates('book_title')
        item = []
        item.extend(list(match['book_title'].values))
        item.extend(list(match['book_author'].values))
        item.extend(list(match['image_url_m'].values))
        data.append(item)

    return data

In [None]:
# Example call: recommendations for the title '1984'.
recommend('1984')

In [None]:
# Title stored at position 545 of the pivot-table index.
# NOTE(review): 545 is a hard-coded position; this raises IndexError if `pt` has fewer than 546 rows.
pt.index[545]

In [None]:
import pickle

# Persist the popularity table. The context manager closes the file deterministically;
# the former bare open(...) left the handle open until garbage collection.
with open('popular.pkl', 'wb') as fh:
    pickle.dump(popular_df, fh)

In [None]:
# Colab helper: send the pickled popularity table to the browser as a download.
from google.colab import files
files.download("popular.pkl")

In [None]:
# Preview only (nothing is assigned): `books` with one row per title.
books.drop_duplicates('book_title')

In [None]:
# Persist the objects `recommend` reads. Each file is written inside a context manager
# so its handle is closed as soon as the dump finishes.
artifacts = {
    'pt.pkl': pt,
    'books.pkl': books,
    'similarity_scores.pkl': similarity_scores,
}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

In [None]:
# Download the pickled pivot table.
files.download("pt.pkl")

In [None]:
# Download the pickled books table.
files.download("books.pkl")

In [None]:
# Download the pickled similarity matrix.
files.download("similarity_scores.pkl")