In [1]:
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from rich import print as print_rich


In [2]:
import logging
import os

import pandas as pd

In [3]:
# Pull API credentials from the local .env file into the process environment.
load_dotenv()

# Read both keys up front; os.getenv yields None when a variable is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")

# Report every key that is missing or empty. This only prints; it does not
# stop the notebook, so later cells may still fail if a key is required.
for key_value, missing_message in (
    (openai_api_key, "Error: OPENAI_API_KEY not found."),
    (openrouter_api_key, "Error: OPENROUTER_API_KEY not found."),
):
    if not key_value:
        print(missing_message)

In [4]:
# Settings for the Qwen-3 8B embedding model. The OpenAI client class is
# pointed at OpenRouter's API base and authenticated with the OpenRouter key.
qwen_embedding_settings = dict(
    model="qwen/qwen3-embedding-8b",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    # The built-in context-length check targets OpenAI's own limits, which
    # differ from this model's, so it is switched off.
    check_embedding_ctx_length=False,
)
qwen_3_embedding_model = OpenAIEmbeddings(**qwen_embedding_settings)

In [5]:
# Load the cleaned version of the books dataset.
# isbn10 is an identifier, not a number: read it as text so type inference
# cannot turn values such as 0060775858 into 60775858 (the trailing "X" check
# character also requires text).
# NOTE(review): if the CSV itself already lacks the zeros, fix the cleaning step.
# isbn13 is left numeric on purpose: later cells compare it against int values.
books = pd.read_csv("data/books_cleaned.csv", dtype={"isbn10": str})
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,clean_categories,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Fiction,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Others,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Fiction,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,Others,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,Others,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Others,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Others,Journey to the East,9788173031014 This book tells the tale of a ma...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,Health & Fitness,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,Philosophy,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [6]:
# Save the tagged descriptions as a plain .txt file, one book per line.
# The lines are written directly rather than through to_csv, because CSV
# quoting would wrap any description containing a double quote in extra quotes
# and double the inner ones. That corrupts the text that gets embedded and
# puts a '"' in front of the leading ISBN.
tagged_lines = (
    books["tagged_description"]
    .dropna()  # a missing description has no ISBN tag to look up later
    .astype(str)
    # The file is split on "\n" downstream, so a line break inside a
    # description would cut one book into several chunks; flatten it to a space.
    .str.replace(r"[\r\n]+", " ", regex=True)
)
with open("data/tagged_description.txt", "w", encoding="utf-8") as tagged_file:
    tagged_file.writelines(f"{line}\n" for line in tagged_lines)

In [7]:
# Load the one-description-per-line text file as raw document(s).
# The file is written as UTF-8 (the pandas to_csv default), so decode it
# explicitly. Without `encoding`, the loader falls back to the platform
# default, which can fail or garble characters such as ’ on non-UTF-8 systems.
raw_documents = TextLoader("data/tagged_description.txt", encoding="utf-8").load()

# Silence the splitter's logger below ERROR level.
# presumably it warns for every chunk longer than chunk_size — here, each line.
logging.getLogger("langchain_text_splitters").setLevel(logging.ERROR)

# Split on newlines with chunk_size=1 and no overlap. The intent is that each
# line (one book) becomes its own chunk and lines are never merged together.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)
print("Done Splitting.")

Done Splitting.


In [8]:
# Sanity check: eyeball the first chunk to confirm it holds one complete
# tagged description and does not run into the next book.
first_chunk = documents[0]
display(first_chunk)

Document(metadata={'source': 'data/tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s detail

In [9]:
# Build (or reuse) the Chroma vector store for the book chunks, embedded with
# the Qwen-3 model configured above (not OpenAI's own embedding models).
#
# Calling Chroma.from_documents on every run would re-embed all chunks (paid
# API calls) and append a second copy of each one to the persisted collection.
# So open the persisted collection first and only build it when it is empty.
# To force a rebuild (e.g. after changing the source text), delete the
# "data/chroma_data" directory and re-run this cell.
db_books = Chroma(
    collection_name="books_qwen_8b",
    embedding_function=qwen_3_embedding_model,
    persist_directory="data/chroma_data",
)
if not db_books.get(limit=1)["ids"]:
    db_books = Chroma.from_documents(
        documents=documents,
        embedding=qwen_3_embedding_model,
        collection_name="books_qwen_8b",
        persist_directory="data/chroma_data",
    )

```python
"""THIS IS A MARKDOWN CELL"""
# Example: Reloading the Persisted Chroma Vector Database (Qwen-3 Embedding Model)
# Load from disk (no embedding cost, near-instant) when re-running the notebook later
db_books = Chroma(
    persist_directory="data/chroma_data",      # Must match the folder above
    embedding_function=qwen_3_embedding_model, # Must use the SAME model
    collection_name="books_qwen_8b"            # Must match the collection name
)

print("Database loaded from disk.")
```

In [10]:
# Smoke-test the vector store: fetch the five chunks closest to a sample query
# and pretty-print them (print_rich is imported in the notebook's import cell).
query = "A book to teach children about nature"
docs = db_books.similarity_search(query=query, k=5)
print_rich(docs)

In [11]:
# Each chunk starts with the book's ISBN-13 followed by its description.
# Take that first token of the best match and look the book up in `books`.
# A chunk whose description contains double quotes may be stored with CSV
# quoting, i.e. with a leading '"'. Strip it before converting, as
# retrieve_recommendations does, otherwise int() raises ValueError.
top_hit_isbn13 = int(docs[0].page_content.strip('"').split()[0])
books[books["isbn13"] == top_hit_isbn13]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,clean_categories,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Juvenile Fiction,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [12]:
def retrieve_recommendations(
    query: str,
    top_k: int = 50,
    db: "Chroma | None" = None,
    embedding: "OpenAIEmbeddings | None" = None,
) -> pd.DataFrame:
    """
    Retrieve book recommendations based on semantic similarity to a query.

    Parameters
    ----------
    query : str
        The search query describing the desired book.
    top_k : int, optional
        Number of top recommendations to return. Default is 50.
    db : Chroma, optional
        Chroma vector database containing book embeddings. When omitted, the
        notebook-level ``db_books`` is looked up at call time, so a store that
        was rebuilt or reloaded after this function was defined is picked up.
    embedding : OpenAIEmbeddings, optional
        Kept for backward compatibility only; this function never reads it.
        The query is handed to ``db.similarity_search`` as-is. The default is
        ``None`` so that no ``OpenAIEmbeddings`` client (which needs
        ``OPENAI_API_KEY``) is constructed when the function is defined.

    Returns
    -------
    pd.DataFrame
        One row per retrieved chunk, in the order returned by the vector
        store (most relevant first), joined with the matching columns of the
        notebook-level ``books`` frame. A hit whose ISBN is not in ``books``
        keeps its row with missing values (left join).

    Raises
    ------
    ValueError
        If a retrieved chunk does not start with an integer ISBN-13.
    """
    if db is None:
        # Late binding: use whatever db_books refers to right now.
        db = db_books

    # Get raw recommendations from the vector store, asking for exactly top_k.
    recs = db.similarity_search(query=query, k=top_k)

    # Each chunk starts with the ISBN-13. Strip a possible CSV quote character
    # before taking the first token. List order encodes the ranking.
    ordered_isbns = [
        int(rec.page_content.strip('"').split()[0]) for rec in recs
    ]

    # Ranking frame that fixes the output order. The explicit int64 dtype keeps
    # the join key numeric even when there are no hits; an empty list would
    # otherwise become an object column and the merge would fail.
    ranking_df = pd.DataFrame(
        {"isbn13": pd.Series(ordered_isbns, dtype="int64")}
    )

    # Left join keeps the ranking order while adding the book details.
    recommendations = ranking_df.merge(books, on="isbn13", how="left")

    return recommendations

In [13]:
# Run the helper on the sample query and preview the five best-ranked matches.
query_children_nature = retrieve_recommendations(
    query="A book to teach children about nature"
)
display(query_children_nature.head(5))

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,clean_categories,title_and_subtitle,tagged_description
0,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Juvenile Fiction,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...
1,9780553112443,553112449,Switch on the Night,Ray Bradbury;Leo Dillon;Diane Dillon,Juvenile Fiction,http://books.google.com/books/content?id=1eUMA...,When a magical little girl named Dark shows a ...,2004.0,4.09,40.0,186.0,Juvenile Fiction,Switch on the Night,9780553112443 When a magical little girl named...
2,9781561381487,1561381489,The Zen Gardening Kit,Running Press,Gardening,http://books.google.com/books/content?id=jSzoT...,"The illustrated Zen Rock Gardening Book, an in...",1992.0,3.21,96.0,30.0,Others,The Zen Gardening Kit,9781561381487 The illustrated Zen Rock Gardeni...
3,9780060775858,60775858,Goodnight Moon 60th Anniversary Edition,Margaret Wise Brown,Juvenile Fiction,http://books.google.com/books/content?id=lLYOr...,"In a great green room, tucked away in bed, is ...",2005.0,4.27,32.0,264013.0,Juvenile Fiction,Goodnight Moon 60th Anniversary Edition,"9780060775858 In a great green room, tucked aw..."
4,9780689861130,689861133,"Moo, Baa, la la La!",Sandra Boynton,Animal sounds,http://books.google.com/books/content?id=Gz40A...,Children will love joining in and imitating th...,2004.0,4.2,14.0,28261.0,Others,"Moo, Baa, la la La!",9780689861130 Children will love joining in an...
