# Content-based recommender system using book description and book categories

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### use code from other notebooks to load data

In [5]:
ratings = pd.read_csv('book_ratings.csv')
history = pd.read_csv('book_history.csv')
items = pd.read_csv('items_info.csv')
users = pd.read_csv('users_info.csv')

# 🔍 Step 1: Clean Ratings
ratings.drop_duplicates(inplace=True)
ratings = ratings.dropna(subset=['user', 'item', 'rating'])

# 🔍 Step 2: Clean History
history.drop_duplicates(inplace=True)
history = history.dropna(subset=['user', 'item', 'accessed'])

# 🔍 Step 3: Clean Items
items.drop_duplicates(subset=['Book_ID', 'ISBN'], inplace=True)

# Fix types
items['Year-Of-Publication'] = pd.to_numeric(items['Year-Of-Publication'], errors='coerce')
items['Year-Of-Publication'] = items['Year-Of-Publication'].fillna(0).astype(int)
items['Publisher'] = items['Publisher'].fillna('Unknown')
items['Book-Title'] = items['Book-Title'].fillna('Unknown')
items['Book-Author'] = items['Book-Author'].fillna('Unknown')

# 🔍 Step 4: Clean Users
users.drop_duplicates(inplace=True)
users['Age'] = pd.to_numeric(users['Age'], errors='coerce')
users.loc[(users['Age'] < 10) | (users['Age'] > 90), 'Age'] = np.nan

In [38]:
# Create user-item rating matrix (users as rows, items as columns)
user_item_matrix = ratings.pivot(index='user', columns='item', values='rating')

In [40]:
# fill missing ratings with 0
filled_matrix = user_item_matrix.fillna(0)

In [8]:
book_info = pd.read_csv("books_info_updated.csv")

In [10]:
book_info.head()

Unnamed: 0,ISBN,Title,Description,Category
0,60973129,Decision in Normandy,No description available,"['Normandy (France)', 'France', 'Campaigns', '..."
1,393045218,The mummies of Ürümchi,"In the museums of Urumchi, the wind-swept regi...","['China', 'Antiquities', 'Mummies', 'Bronze ag..."
2,425176428,What if?,No description available,"['Imaginary wars and battles.', 'Imaginary his..."
3,452264464,Beloved,No description available,"['African American History', 'Ohio', 'History'..."
4,609804618,Our dumb century,No description available,"['American wit and humor', 'Headlines', 'Humor..."


In [12]:
items.head()

Unnamed: 0,Book_ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
1,3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...
2,4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...
3,5,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion,1999,Three Rivers Press,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...
4,7,1879384493,If I'd Known Then What I Know Now: Why Not Lea...,J. R. Parrish,2003,Cypress House,http://images.amazon.com/images/P/1879384493.0...,http://images.amazon.com/images/P/1879384493.0...,http://images.amazon.com/images/P/1879384493.0...


In [16]:
book_info.shape

(17347, 4)

In [18]:
items.shape

(16411, 9)

#### book info and items don't have the same shape, merge two df based on ISBN

In [20]:
merged_book_info = pd.merge(items, book_info, on = "ISBN")

In [22]:
merged_book_info.shape

(16378, 12)

In [24]:
merged_book_info.head()

Unnamed: 0,Book_ID,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Title,Description,Category
0,1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,Decision in Normandy,No description available,"['Normandy (France)', 'France', 'Campaigns', '..."
1,3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,What if?,No description available,"['Imaginary wars and battles.', 'Imaginary his..."
2,4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,Beloved,No description available,"['African American History', 'Ohio', 'History'..."
3,5,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion,1999,Three Rivers Press,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,http://images.amazon.com/images/P/0609804618.0...,Our dumb century,No description available,"['American wit and humor', 'Headlines', 'Humor..."
4,7,1879384493,If I'd Known Then What I Know Now: Why Not Lea...,J. R. Parrish,2003,Cypress House,http://images.amazon.com/images/P/1879384493.0...,http://images.amazon.com/images/P/1879384493.0...,http://images.amazon.com/images/P/1879384493.0...,If I'd Known Then What I Know Now: Why Not Lea...,No description available,"['Personal Growth - General', 'Personal & Prac..."


In [28]:
# drop title since book-title already exists
merged_book_info = merged_book_info.drop("Title", axis = 1)

In [34]:
# create new df with just book id, book-title, book-author, year
book_info_base = merged_book_info[["Book_ID", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"]]
# create a separate df with book id and description
book_description = merged_book_info[["Book_ID", "Description"]]
# create a separate df with book id and category
book_category = merged_book_info[["Book_ID", "Category"]]

In [48]:
book_category.head()

Unnamed: 0,Book_ID,Category
0,1,"['Normandy (France)', 'France', 'Campaigns', '..."
1,3,"['Imaginary wars and battles.', 'Imaginary his..."
2,4,"['African American History', 'Ohio', 'History'..."
3,5,"['American wit and humor', 'Headlines', 'Humor..."
4,7,"['Personal Growth - General', 'Personal & Prac..."


In [50]:
all_cat = book_category["Category"].explode()

In [54]:
all_cat.unique().shape

(13341,)

In [56]:
book_category.shape

(16378, 2)

#### Process base df

#### Process the category df. There are 16378 books with 13341 unique category, doing MultiLabelBinarizer might not be the best. See if we can cluster to narrow down unique categories. 

#### Vectorize the description df with CountVectorizer, then apply TfidfTransformer

In [64]:
# remove books without description
books_with_desc = book_description[book_description["Description"] != "No description available"]

In [66]:
books_with_desc.shape

(2748, 2)

In [100]:
books_with_desc.head()

Unnamed: 0,Book_ID,Description
5,8,Three passengers are dead. Fifty-six are injur...
6,9,"In an Arizona desert, a man wanders in a daze,..."
24,28,"When eleven-year old Shabanu, the daughter of ..."
25,29,Having relented to the ways of her people in P...
31,35,The how of Pooh? The Tao of who? The Tao of Po...


In [478]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(stop_words = "english")
vec_words = vectorizer.fit_transform(books_with_desc["Description"])

tfidf = TfidfTransformer()
vec_tfidf = tfidf.fit_transform(vec_words)

In [480]:
vec_tfidf.shape

(2748, 22106)

In [482]:
tfidf_desc = pd.DataFrame(vec_tfidf.toarray(), columns=vectorizer.get_feature_names_out(), index = books_with_desc["Book_ID"])
tfidf_desc.head()

Unnamed: 0_level_0,00,000,05,0l,0s,10,100,101,1023,1030l,...,étonnante,éxito,última,últimas,último,últimos,única,über,überlebens,übertrifft
Book_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Implement content based user recommendation
We can look at user ratings, find their top n books then recommend k similar books to users.

## Create a function that does the following
1. look at the top rated books by a user (top n percentile, to account for user rating preference, remove 0 when ranking)
3. for each book find top similar books (make sure to normalize when doing this step)
4. after finding similar books, sort then recommend top k books
5. use book id to reference merged_book_info to get title

In [525]:
def content_similarity(user_scores, data_matrix, book_db, k = 10, n = 0.75):
    # user_scores is pd series of scores 
    # data matrix is the feature matrix
    # book_db is the df that maps book_id and title
    # n is the percentile, default is 75th percentile
    # k is how many books to recommend

    from sklearn.metrics.pairwise import cosine_similarity

    # Compute cosine similarity
    cosine_sim = cosine_similarity(data_matrix)
    
    # Create df for cosine sims (item x item)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=data_matrix.index, columns=data_matrix.index)
    
    filtered_ratings = user_scores[user_scores != 0]
    percentile_n = filtered_ratings[filtered_ratings >= filtered_ratings.quantile(n)]
    top_index = percentile_n.index

    book_sim = cosine_sim_df.loc[top_index]

    # initialize an empty series
    combined_sims = pd.Series(dtype = "float64")

    for ind in top_index:
        current_sims = book_sim.loc[ind].drop(ind)
        combined_sims = pd.concat([combined_sims, current_sims])

    # after combining sims, there will be duplicate entries for the sims, just keep the max sims for each book then sort
    final_sims = combined_sims.groupby(combined_sims.index).max().sort_values(ascending = False)

    book_rec = final_sims.iloc[:k]
    book_rec_index = book_rec.index

    rec_titles = book_db[book_db["Book_ID"].isin(book_rec_index)]["Book-Title"]

    top_rated_titles = book_db[book_db["Book_ID"].isin(top_index)]["Book-Title"]
    print("Top rated books by the user are the following:\n")
    for title in top_rated_titles.to_list():
        print(title)
    
    print("------------------------------------------------------------")
    print(f"Top {k} recommendations for the user are the following:\n")
    for title in rec_titles.to_list():
        print(title)
    return

#### Implement using book description

In [130]:
# create user-item matrix with only books that have description available. 
desc_matrix = filled_matrix.loc[:, filled_matrix.columns.isin(book_desc_tf["Book_ID"])]
desc_matrix.shape

(1295, 2371)

In [128]:
desc_matrix.head()

item,8,9,28,29,35,39,45,54,58,68,...,17277,17278,17287,17294,17303,17305,17319,17320,17344,17351
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [522]:
tfidf_desc.head()

Unnamed: 0_level_0,00,000,05,0l,0s,10,100,101,1023,1030l,...,étonnante,éxito,última,últimas,último,últimos,única,über,überlebens,übertrifft
Book_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Test the recommender on some random users

In [527]:
# desc_matrix is where you can get user rating for user_scores
# tfidf_desc is the data_matrix for applying recommendations based on description

content_similarity(desc_matrix.iloc[11,:], tfidf_desc, merged_book_info, 5, 0.8)

Top rated books by the user are the following:

Lena (50 State Quarters)
Strange but True
The Protocol
------------------------------------------------------------
Top 5 recommendations for the user are the following:

The Diary of Ellen Rimbauer: My Life at Rose Red
Love Is the Key
Chasing Redbird
Grab Hands and Run
The Moonlight Man


In [528]:
content_similarity(desc_matrix.iloc[278,:], tf_df, merged_book_info, 5, 0.8)

Top rated books by the user are the following:

Tis : A Memoir
The Reader
Jurassic Park
A Thousand Words for Stranger (Daw Book Collectors)
The Lord of the Rings (Movie Art Cover)
The Egg and I
------------------------------------------------------------
Top 5 recommendations for the user are the following:

The Fellowship of the Ring
The Two Towers (The Lord of the Rings, Part 2)
The Fellowship of the Ring (The Lord of the Rings, Part 1)
The Fellowship of the Ring (The Lord of the Rings, Part 1)
The Lord of the Rings


#### Implement using categories