#### Load Dataset

In [169]:
import pandas as pd

books = pd.read_csv("dataset/Books.csv", dtype={'Book-Author': str}, low_memory=False)

In [170]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

Remove all duplicated entries and Null Values

In [171]:
# Remove duplicates based on Title and Author
books_unique = books.drop_duplicates(subset=['Book-Title', 'Book-Author'], keep='first')

# Reset the index for remaining books
books_unique = books_unique.reset_index(drop=True)

books = books_unique.copy()

In [172]:
books = books.dropna(subset=['Book-Author', 'Publisher', 'Image-URL-L'])

Slice the data to take only important data

In [173]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Image-URL-L']]

In [174]:
# Limit the dataset to the first 10,000 rows to reduce memory usage and improve processing speed
books = books.head(10000)


In [175]:
books["tags"] = (
    books["Book-Title"].astype(str).str.lower() + " " +
    books["Book-Author"].str.replace(" ", "").str.lower() + " " +
    books["Publisher"].astype(str).str.replace(" ", "").str.lower()
)

In [176]:
books_df = books[['ISBN', 'Book-Title', 'Book-Author', 'tags', 'Image-URL-L']]
books_df = books_df.rename(columns={
    'Book-Title': 'book_title',
    'Book-Author': 'book_author',
    'Image-URL-L': 'cover_url'
})

In [177]:
books_df.head(1)

Unnamed: 0,ISBN,book_title,book_author,tags,cover_url
0,195153448,Classical Mythology,Mark P. O. Morford,classical mythology markp.o.morford oxforduniv...,http://images.amazon.com/images/P/0195153448.0...


In [178]:
# get only root words
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def get_root_words(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [179]:
books_df['tags'] = books_df['tags'].apply(get_root_words)

In [180]:
# Convert the 'tags' column into a numerical feature matrix using Bag of Words.
# Limiting to the 2000 most frequent words and removing common English stopwords.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, stop_words='english')
vectors = cv.fit_transform(books_df['tags']).toarray()


In [181]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'â¼bbe', 'â¼ber', 'â¼r'],
      shape=(2000,), dtype=object)

In [182]:
# Compute pairwise cosine similarity between all items in 'vectors' to measure how similar they are to each other
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)


In [183]:
def recommend_books(book):
    book_index = books_df[books_df['book_title'] == book].index[0]       
    distances = similarity[book_index]
        
    books_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in books_list:
        print(books_df.iloc[i[0]].book_title)


In [184]:
recommend_books('Classical Mythology')

The Selfish Gene
Classic Whodunits
Metaphysical Lyrics Poems 17 Cen
Mythology
What a Wonderful World: A Lifetime of Recordings


In [None]:
# Save the computed similarity matrix and book dataset as a pickle file for later use in the app
import pickle

data_to_save = {
    "similarity": similarity,
    "books": books_df
}

file_path = "app/models/"
with open(f"{file_path}model.pkl", "wb") as f:
    pickle.dump(data_to_save, f)
    print(f"File Created on path \"{file_path}\" Successfully. ")


File Created on path "app/models/" Successfully. 
