In [None]:
import kagglehub

# Fetch the Kaggle dataset; `path` is used below as the directory that
# contains books.csv.
dataset_handle = "dylanjcastillo/7k-books-with-metadata"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

In [None]:
import pandas as pd

# Read the raw book metadata from the downloaded dataset directory.
books_csv_path = f"{path}/books.csv"
books_df = pd.read_csv(books_csv_path)
books_df

In [None]:
pip install seaborn matplotlib numpy

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Missing-value map: one cell per (column, row) pair, highlighted where the
# value is NaN.
fig, ax = plt.subplots()
sns.heatmap(books_df.isna().transpose(), cbar=False, ax=ax)

# The frame is transposed before plotting, so the dataframe's columns run
# along the y-axis and its rows along the x-axis; label the axes accordingly.
ax.set_xlabel("Row index")
ax.set_ylabel("Column")
ax.set_title("Missing values")

plt.show()

In [None]:
import numpy as np

# Year that book ages are measured against.
REFERENCE_YEAR = 2025

# 1 where the description is absent, 0 otherwise.
description_is_missing = books_df['description'].isna()
books_df['missing_description'] = np.where(description_is_missing, 1, 0)

# Age in years relative to REFERENCE_YEAR (NaN when published_year is missing).
books_df['age_of_book'] = REFERENCE_YEAR - books_df['published_year']

books_df

In [None]:
# Rank correlation between the numeric features and the missing-description flag.
columns_of_interest = ['num_pages', 'age_of_book', 'missing_description', 'average_rating']
numeric_view = books_df[columns_of_interest]
correlation_matrix = numeric_view.corr(method='spearman')

sns.set_theme(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar_kws={"label": "Spearman Correlation"},
    ax=ax,
)
heatmap.set_title('Correlation heatmap')

plt.show()

In [None]:
# Keep only rows where every field needed downstream is present.
# The filter must test `description` itself: `missing_description` is a 0/1
# flag that is never NaN, so testing it would let rows without a description
# through.
required_columns = ['description', 'num_pages', 'average_rating', 'published_year']

# .copy() gives an independent frame, so columns can be added to it later
# without pandas warning about writing to a slice of books_df.
# Note: despite its name, books_missing holds the rows WITHOUT missing values.
books_missing = books_df.dropna(subset=required_columns).copy()

books_missing

In [None]:
# Number of books per category, largest first.
# NOTE(review): sorting on "count" presumably relies on the column name that
# recent pandas versions give the value_counts() result — confirm against the
# pinned pandas version.
books_missing['categories'].value_counts().reset_index().sort_values("count", ascending=False)

In [None]:
# Whitespace-tokenise each description and record how many tokens it has.
description_tokens = books_missing['description'].str.split()
books_missing['words_in_description'] = description_tokens.str.len()

books_missing

In [None]:
# Inspect the descriptions that are only one to four words long.
books_missing.loc[books_missing['words_in_description'].between(1,4), 'description']


In [None]:
# Keep only books whose description has at least 25 words.
# The result gets new columns assigned in later cells, so take an explicit
# copy instead of a filtered slice (avoids SettingWithCopyWarning).
has_long_description = books_missing['words_in_description'] >= 25
books_missing_25_words = books_missing[has_long_description].copy()
books_missing_25_words

In [None]:
# Build "title: subtitle" when a subtitle exists; otherwise keep the bare title.
subtitle_is_missing = books_missing_25_words['subtitle'].isna()
title_with_subtitle = (
    books_missing_25_words['title'].astype(str)
    + ': '
    + books_missing_25_words['subtitle'].astype(str)
)
books_missing_25_words['title_and_subtitle'] = np.where(
    subtitle_is_missing,
    books_missing_25_words['title'],
    title_with_subtitle,
)

In [None]:
# Display the frame to check the new title_and_subtitle column.
books_missing_25_words

In [None]:
# Prefix each description with its ISBN-13 so a retrieved text chunk can be
# traced back to its book row.
isbn_text = books_missing_25_words['isbn13'].astype(str)
description_text = books_missing_25_words['description'].astype(str)
books_missing_25_words['tagged_description'] = isbn_text + ' ' + description_text

In [None]:
# Display the frame to check the new tagged_description column.
books_missing_25_words

In [16]:
# Drop the helper columns that were only needed during cleaning, then save
# the cleaned dataset.
helper_columns = ['subtitle', 'missing_description', 'age_of_book', 'words_in_description']
books_for_export = books_missing_25_words.drop(columns=helper_columns)
books_for_export.to_csv("books_cleaned.csv", index=False)

In [None]:
pip install dotenv langchain

In [None]:
from dotenv import load_dotenv

# Load settings from a local .env file, if one exists.
# NOTE(review): the visible cells build embeddings with a HuggingFace model and
# never use OpenAIEmbeddings, so no environment variable appears to be read
# afterwards — confirm whether this is still needed.
load_dotenv()

In [None]:
import pandas as pd

# Reload the cleaned dataset written to books_cleaned.csv by the export cell.
books = pd.read_csv("books_cleaned.csv")
books

In [20]:
# Write one "<isbn13> <description>" line per book.
# This is written as plain text rather than via to_csv(sep='\n'): the CSV
# writer wraps lines containing quotes or line breaks in double quotes, and a
# newline is not a valid field delimiter for newer Python csv implementations.
# Collapsing internal whitespace guarantees each book occupies exactly one line.
with open("books_descriptions.txt", "w", encoding="utf-8") as descriptions_file:
    for tagged_description in books['tagged_description']:
        descriptions_file.write(' '.join(str(tagged_description).split()) + '\n')

In [None]:
pip install langchain_community langchain_openai langchain_chroma

In [22]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
# Load the descriptions file; the name must match the file this notebook
# writes ("books_descriptions.txt"), otherwise a fresh run fails with a
# missing-file error.
raw_documents = TextLoader("books_descriptions.txt", encoding='utf-8').load()

# Split on newlines so every "<isbn13> <description>" line becomes its own
# document. chunk_size=1 keeps the splitter from merging lines (every line is
# longer than one character) while staying valid for splitter versions that
# reject chunk_size <= 0. "Created a chunk of size N" warnings are expected.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")

documents = text_splitter.split_documents(raw_documents)


In [None]:
# Inspect the first document produced by the splitter.
documents[0]

In [None]:
pip install sentence-transformers


In [None]:
# Import from langchain_community (already a dependency of this notebook):
# the `langchain.embeddings` re-export of HuggingFaceEmbeddings is deprecated
# and absent from newer langchain releases.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed both the book descriptions and
# the search queries.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [27]:
# Embed every description document with embedding_model and store the vectors
# in a Chroma collection kept under ./chroma_books.
# NOTE(review): presumably re-running this cell adds the same documents again
# to the collection already on disk — confirm, and clear the directory or load
# the existing store on re-runs if so.
db_books = Chroma.from_documents(
    documents,
    embedding=embedding_model,
    persist_directory="chroma_books"
)

In [None]:
# Smoke-test the vector store: fetch the 10 documents closest to a sample query.
query = "A book to teach children about nature"

docs = db_books.similarity_search(query, k=10)
docs

In [None]:
# Map the best match back to its row in `books`. Each document starts with the
# book's ISBN-13; strip a possible leading CSV quote first (as
# retrieve_semantic_recommendations does) so int() cannot fail on '"978...'.
top_match_isbn = int(docs[0].page_content.strip('"').split()[0])
books[books['isbn13'] == top_match_isbn]

In [39]:
def retrieve_semantic_recommendations(query, top_k=10):
    """Return the books whose descriptions are most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text description of the kind of book wanted.
    top_k : int, default 10
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows of ``books``, ordered from most to least similar.
    """
    # Over-fetch so a later filtering step has candidates to choose from, but
    # never fetch fewer than the caller asked for.
    rec_docs = db_books.similarity_search(query, k=max(50, top_k))

    # Every document is expected to start with the book's ISBN-13. Skip chunks
    # that do not (e.g. a stray continuation line) instead of crashing on int().
    ranked_isbns = []
    for rec in rec_docs:
        tokens = rec.page_content.strip('"').split()
        if tokens and tokens[0].isdigit():
            ranked_isbns.append(int(tokens[0]))

    # De-duplicate while keeping the similarity order returned by the store.
    rank_by_isbn = {isbn: rank for rank, isbn in enumerate(dict.fromkeys(ranked_isbns))}

    matches = books[books['isbn13'].isin(list(rank_by_isbn))]

    # isin() yields rows in dataframe order; reorder them by similarity rank
    # so that head(top_k) really returns the best matches.
    similarity_order = matches['isbn13'].map(rank_by_isbn).to_numpy().argsort(kind='stable')
    return matches.iloc[similarity_order].head(top_k)

In [None]:
# Example call: up to 10 (the default top_k) recommendations for a sample query.
retrieve_semantic_recommendations("A book to teach children about nature")