In [12]:
# import libraries
import re
import numpy as np
import pandas as pd
from difflib import get_close_matches
from IPython.display import Image, display
from IPython.core.display import HTML
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Load the pre-cleaned book catalogue; the path is resolved relative to the working directory.
df_books = pd.read_csv(filepath_or_buffer="cleaned_books.csv")

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the catalogue (seeded for reproducibility). The split is over
# books only: the recommender is non-personalized, so no users are involved.
# Each partition gets a fresh 0..n-1 index.
train_books, test_books = (
    partition.reset_index(drop=True)
    for partition in train_test_split(df_books, test_size=0.2, random_state=42)
)

In [15]:
# Creates TF-IDF matrix from book titles and authors.
def create_tfidf_matrix(book_df):
    """Vectorize each book's original title plus authors with TF-IDF.

    Parameters
    ----------
    book_df : pandas.DataFrame
        Must contain the columns 'book_id', 'original_title', 'title',
        'authors', 'average_rating' and 'image_url'.

    Returns
    -------
    tuple
        ``(tfidf_matrix, authors_title_df)`` where ``tfidf_matrix`` is the
        fitted-and-transformed TF-IDF matrix (one row per book, in the row
        order of ``book_df``) and ``authors_title_df`` is a copy of the
        selected columns with an extra 'content' column holding the text that
        was vectorized.
    """
    selected_columns = ['book_id', 'original_title', 'title', 'authors', 'average_rating', 'image_url']
    authors_title_df = book_df[selected_columns].copy()

    # Missing values become empty strings so the concatenation never yields NaN.
    # NOTE(review): a row with no 'original_title' is described by its authors only.
    title_text = authors_title_df['original_title'].fillna('')
    author_text = authors_title_df['authors'].fillna('')
    authors_title_df['content'] = title_text + ' ' + author_text

    # English stop words are dropped before weighting.
    vectorizer = TfidfVectorizer(stop_words='english')
    return vectorizer.fit_transform(authors_title_df['content']), authors_title_df

In [16]:
# Build a TF-IDF representation for each split; every call fits its own
# vectorizer, so the train and test matrices have independent vocabularies.
(tfidf_matrix_train, authors_title_df_train), (tfidf_matrix_test, authors_title_df_test) = (
    create_tfidf_matrix(split_df) for split_df in (train_books, test_books)
)

In [24]:
def rcmd_content_based(ori_book_id, books_df, k=5):
    """Recommend the ``k`` books whose title/author text is most similar to a given book.

    Similarity is the cosine similarity between the TF-IDF vectors that
    ``create_tfidf_matrix`` builds from ``books_df``.

    Parameters
    ----------
    ori_book_id
        Value looked up in ``books_df['book_id']``; the first matching row is
        used as the query book.
    books_df : pandas.DataFrame
        Candidate catalogue. Must contain the columns that
        ``create_tfidf_matrix`` selects. Any index is accepted.
    k : int, default 5
        Maximum number of recommendations. Values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows with columns 'book_id', 'title', 'authors',
        'average_rating', 'image_url' and 'similarity_score', ordered by
        descending similarity, with a fresh 0..n-1 index. The query book itself
        is never included.

    Raises
    ------
    IndexError
        If ``ori_book_id`` does not occur in ``books_df['book_id']``.
    """
    # Locate the query by *position* inside books_df itself. The TF-IDF matrix
    # rows follow books_df's row order, so a position (not an index label, and
    # not a lookup against some other global frame) is what is needed here.
    matches = np.flatnonzero((books_df['book_id'] == ori_book_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"book_id {ori_book_id!r} not found in books_df")
    idx = int(matches[0])

    tfidf_matrix, authors_title_df = create_tfidf_matrix(books_df)

    # Only the query row's similarities are used, so compare that single row
    # against the catalogue instead of materialising the full n x n matrix.
    # The slice keeps the query 2-D, as cosine_similarity requires.
    sim_to_query = cosine_similarity(tfidf_matrix[idx:idx + 1], tfidf_matrix).ravel()

    # Descending similarity; the stable sort keeps ties in ascending row order.
    ranked = np.argsort(-sim_to_query, kind="stable")

    # Drop the query book explicitly. Relying on it being ranked first breaks
    # when another row has identical content or the query vector is all zeros.
    top = ranked[ranked != idx][:max(k, 0)]

    rcmd = authors_title_df.iloc[top][["book_id", "title", "authors", "average_rating", "image_url"]].copy()
    rcmd['similarity_score'] = sim_to_query[top]
    rcmd = rcmd.reset_index(drop=True)

    return rcmd

In [27]:
# Example query: look up the book's original title in the full catalogue,
# then fetch content-based recommendations from the training split.
book_id = 827
title = df_books.loc[df_books['book_id'].eq(book_id), 'original_title'].iloc[0]
print("Book Title: ", title)
rcmd_c_b = rcmd_content_based(ori_book_id=book_id, books_df=train_books)

Book Title:  To All the Boys I've Loved Before


In [28]:
# Render a DataFrame as an HTML table with one column shown as images.
def display_images(df, image_column):
    """Display ``df`` as an HTML table, rendering ``image_column`` values as images.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to render.
    image_column : str
        Name of the column whose values are used as ``<img>`` sources
        (100 px wide).

    Returns
    -------
    None
        The table is shown through IPython's ``display``.
    """
    def _as_img_tag(url):
        return f'<img src="{url}" width="100">'

    # escape=False is required so the <img> tags are emitted as markup.
    # NOTE(review): it also leaves every other cell's text unescaped.
    table_markup = df.to_html(formatters={image_column: _as_img_tag}, escape=False)
    display(HTML(table_markup))

# Show the recommendations with their cover images rendered inline.
display_images(df=rcmd_c_b, image_column='image_url')

Unnamed: 0,book_id,title,authors,average_rating,image_url,similarity_score
0,2112,"P.S. I Still Love You (To All the Boys I've Loved Before, #2)",Jenny Han,4.17,,0.543969
1,1284,"It's Not Summer Without You (Summer, #2)",Jenny Han,4.14,,0.51427
2,1614,"We'll Always Have Summer (Summer, #3)",Jenny Han,4.17,,0.435687
3,9769,The Summer I Turned Pretty Trilogy: The Summer I Turned Pretty; It's Not Summer Without You; We'll Always Have Summer,Jenny Han,4.46,,0.368802
4,6470,"Burn for Burn (Burn for Burn, #1)","Jenny Han, Siobhan Vivian",3.87,,0.271112
