In [4]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv

In [140]:
load_dotenv()

True

In [13]:
import pandas as pd
books = pd.read_csv('books_cleaned.csv')

category_mapping = {'Fiction':"Fiction",
                    'Juvenile Fiction':"Children's Fiction",
                    'Biography & Autobiography':"Non-fiction",
                    'History':"Non-fiction",
                    'Literary Criticism' : "Non-fiction",
                    'Philosophy':"Non-fiction",
                    'Religion':"Non-fiction",
                    'Comics & Graphic Novels':"Fiction",
                    'Drama':"Fiction",
                    'Poetry':"Fiction",
                    'Science':"Non-fiction",
                    'Juvenile Nonfiction':"Children's Non-fiction",
                    }

books['simple_categories'] = books['categories'].map(category_mapping)

In [17]:
books['simple_categories'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5299 entries, 0 to 5298
Series name: simple_categories
Non-Null Count  Dtype 
--------------  ----- 
3826 non-null   object
dtypes: object(1)
memory usage: 41.5+ KB


In [18]:
from transformers import pipeline
fiction_categories = ["Fiction","Non-fiction"]
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device='mps'
                )

Device set to use mps


In [20]:
def generate_predictions(sequence,categories):
    predictions = pipe(sequence,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]
    return max_label

In [28]:
from tqdm import tqdm
import numpy as np

isbns = []
predicted_categories = []

missing_categories = books.loc[books['simple_categories'].isna(),["isbn13","description"]].reset_index(drop=True)

for i in tqdm(range(0,len(missing_categories))):
    sequence = missing_categories['description']
    predicted_categories += [generate_predictions(sequence,fiction_categories)]
    isbns += [missing_categories['isbn13'][i]]

100%|██████████| 1473/1473 [08:24<00:00,  2.92it/s]


In [32]:
missing_predicted_df = pd.DataFrame({'isbn13':isbns,'predicted_categories':predicted_categories})

In [34]:
missing_predicted_df

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Fiction
2,9780006280934,Fiction
3,9780006380832,Fiction
4,9780006470229,Fiction
...,...,...
1468,9788125026600,Fiction
1469,9788171565641,Fiction
1470,9788172235222,Fiction
1471,9788173031014,Fiction


In [37]:
books = pd.merge(books,missing_predicted_df,on='isbn13',how='left')
books["simple_categories"] = np.where(books["simple_categories"].isnull(),books["predicted_categories"],books["simple_categories"])
books = books.drop(columns=["predicted_categories"])

In [41]:
books = books.drop(columns=['predicted_categories_x', 'predicted_categories_y'])
books.sample(2)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
1674,9780374525866,374525862,The Magic Barrel,Bernard Malamud,Fiction,http://books.google.com/books/content?id=Wmcxu...,Winner of the National Book Award for Fiction ...,2003.0,4.0,240.0,1898.0,The Magic Barrel: Stories,9780374525866 Winner of the National Book Awar...,Fiction
1413,9780330363686,330363689,Just Disgusting!,Andy Griffiths,Australian fiction,http://books.google.com/books/content?id=cUeXr...,"In the tradition of Just Tricking!, Just Annoy...",2002.0,3.81,180.0,67.0,Just Disgusting!,9780330363686 In the tradition of Just Trickin...,Fiction


In [43]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5299 entries, 0 to 5298
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   isbn13              5299 non-null   int64  
 1   isbn10              5299 non-null   object 
 2   title               5299 non-null   object 
 3   authors             5265 non-null   object 
 4   categories          5269 non-null   object 
 5   thumbnail           5131 non-null   object 
 6   description         5299 non-null   object 
 7   published_year      5299 non-null   float64
 8   average_rating      5299 non-null   float64
 9   num_pages           5299 non-null   float64
 10  ratings_count       5299 non-null   float64
 11  title_and_subtitle  5299 non-null   object 
 12  tagged_description  5299 non-null   object 
 13  simple_categories   5299 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 579.7+ KB


In [45]:
books.to_csv('books_cleaned_with_categories.csv',index=False)

In [54]:
books["tagged_description"] = books["tagged_description"] + " [Category: " + books["simple_categories"].astype(str) + "]"

In [55]:
books['tagged_description'][2]

"9780006178736 A memorable, mesmerizing heroine Jennifer -- brilliant, beautiful, an attorney on the way up until the Mafia's schemes win her the hatred of an implacable enemy -- and a love more destructive than hate. A dangerous, dramatic world The Dark Arena of organized crime and flashbulb lit courtrooms where ambitious prosecutors begin their climb to political power. [Category: Fiction]"

In [56]:
books.to_csv('books_cleaned_with_categories.csv',index=False)

In [57]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5299 entries, 0 to 5298
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   isbn13              5299 non-null   int64  
 1   isbn10              5299 non-null   object 
 2   title               5299 non-null   object 
 3   authors             5265 non-null   object 
 4   categories          5269 non-null   object 
 5   thumbnail           5131 non-null   object 
 6   description         5299 non-null   object 
 7   published_year      5299 non-null   float64
 8   average_rating      5299 non-null   float64
 9   num_pages           5299 non-null   float64
 10  ratings_count       5299 non-null   float64
 11  title_and_subtitle  5299 non-null   object 
 12  tagged_description  5299 non-null   object 
 13  simple_categories   5299 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 579.7+ KB


In [58]:
books['tagged_description'].to_csv('tagged_description.txt',
                                  sep="\n",
                                  index = False)

In [59]:
raw_documents = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0,chunk_overlap=0,separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 18, which is longer than the specified 0
Created a chunk of size 1188, which is longer than the specified 0
Created a chunk of size 1234, which is longer than the specified 0
Created a chunk of size 393, which is longer than the specified 0
Created a chunk of size 329, which is longer than the specified 0
Created a chunk of size 503, which is longer than the specified 0
Created a chunk of size 502, which is longer than the specified 0
Created a chunk of size 980, which is longer than the specified 0
Created a chunk of size 208, which is longer than the specified 0
Created a chunk of size 863, which is longer than the specified 0
Created a chunk of size 316, which is longer than the specified 0
Created a chunk of size 217, which is longer than the specified 0
Created a chunk of size 901, which is longer than the specified 0
Created a chunk of size 1108, which is longer than the specified 0
Created a chunk of size 1209, which is longer than the specified 0
Created

In [188]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

load_dotenv()
api_key = os.getenv("QDRANT_API_KEY")
client = QdrantClient(
    url="https://1899bf2c-b912-493b-b2e4-0e0f50051fb1.europe-west3-0.gcp.cloud.qdrant.io", 
    api_key=api_key
)
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='star_charts')])

In [189]:
from qdrant_client.models import VectorParams, Distance

client.recreate_collection(
    collection_name="books",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE)  # Adjust size to match your embeddings
)


  client.recreate_collection(


True

In [169]:
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [193]:
from tqdm import tqdm
from qdrant_client.models import PointStruct

book_embeddings = []
for idx, desc in tqdm(enumerate(books["description"]), desc="Embedding Progress"):
    emb = embed_model.embed_query(desc)  # Generate embedding
    book_embeddings.append((idx, emb))  # Store (id, embedding) pairs
# Convert to Qdrant format
points = [
    PointStruct(
        id=idx, 
        vector=emb, 
        payload={"description": books["description"][idx]}  # Store metadata if needed
    )
    for idx, emb in book_embeddings
]

client.upsert(collection_name=collection_name, points=points)
print("Embeddings stored successfully in Qdrant!")

Embedding Progress: 1947it [13:30,  2.40it/s]


KeyboardInterrupt: 

In [None]:
def retrieve_books_for_llm(query:str,top_k:int=5):
    a = retrieve_sementic_recommendation(query,top_k)
    books_list = []
    for i in range(len(a)):
        books_list.append("Title:" + a.iloc[i]["title"] + ", " + a.iloc[i]["tagged_description"])
    
    return books_list

retrieve_books_for_llm("A book similar to good habits",top_k=5)

['Title:Authentic Happiness, 9780743222983 Argues that happiness can be a learned and cultivated behavior, explaining how every person possesses at least five of twenty-four profiled strengths that can be built on in order to improve life. [Category: Fiction]',
 'Title:The 7 Habits of Highly Effective People Personal Workbook, 9780743250979 Outlining seven key organizational rules for improving effectiveness and increasing productivity at work and at home, a companion volume to The 7 Habits of Highly Effective People presents a step-by-step guide that includes in-depth exercises and solutions that teach the fundamentals of fairness, integrity, honesty, and dignity and help readers set goals, enhance relationships, and promote success. Original. 75,000 first printing. [Category: Fiction]',
 'Title:First Things First, 9780743468596 In the spirit of THE 7 HABITS OF HIGHLY EFFECTIVE PEOPLE, the international bestseller, FIRST THINGS FIRST is a revolutionary guide to managing your time by l

In [146]:
documents[1].page_content

'9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the wors

In [62]:
query = "A book related to discipline"
docs = db_books.similarity_search(query,k=5)
for doc in docs:
    isbn = int(doc.page_content.split()[0].strip())
    books_detail = books[books["isbn13"]==isbn]
    print(books_detail.to_string(index=False))

       isbn13     isbn10               title      authors             categories                                                                                             thumbnail                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              description  published_year  average_rating  num_pages  ratings_count               

In [64]:
def retrieve_sementic_recommendation(query:str,top_k:int=10)->pd.DataFrame:
    recs = db_books.similarity_search(query,k=top_k)
    books_list = []
    for i in range(len(recs)):
        books_list.append(int(recs[i].page_content.strip('"').split()[0]))
    
    return books[books["isbn13"].isin(books_list)].head(top_k) 

In [65]:
retrieve_sementic_recommendation("A book similar to Atomic habits",top_k=5)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
614,9780140259100,0140259104,The Nuclear Age,Tim O'Brien,Fiction,http://books.google.com/books/content?id=p0EMA...,From Library Journal : Brilliant nuclear deton...,1996.0,3.44,320.0,928.0,The Nuclear Age,9780140259100 From Library Journal : Brilliant...,Fiction
826,9780142402931,0142402931,The Far Side of Evil,Sylvia Engdahl,Juvenile Fiction,http://books.google.com/books/content?id=7nijj...,A young girl from an advanced civilization is ...,2005.0,3.98,324.0,57.0,The Far Side of Evil,9780142402931 A young girl from an advanced ci...,Children's Fiction
2270,9780441012374,044101237X,Rocket Ship Galileo,Robert Anson Heinlein,Fiction,http://books.google.com/books/content?id=1421n...,Three teenagers and an older scientist develop...,2004.0,3.71,211.0,6199.0,Rocket Ship Galileo,9780441012374 Three teenagers and an older sci...,Fiction
4359,9780960989867,0960989862,The Twelve Steps and Twelve Traditions of Over...,"Overeaters Anonymous, Inc. (U.S.)",Self-Help,http://books.google.com/books/content?id=pGeWA...,Explains the twelve steps of Overeaters Anonym...,1993.0,4.37,221.0,200.0,The Twelve Steps and Twelve Traditions of Over...,9780960989867 Explains the twelve steps of Ove...,Fiction
4479,9781400078394,1400078393,Learned Optimism,Martin E. P. Seligman,Psychology,http://books.google.com/books/content?id=JYxID...,An authority on cognitive psychology and motiv...,2006.0,4.0,319.0,13231.0,Learned Optimism: How to Change Your Mind and ...,9781400078394 An authority on cognitive psycho...,Fiction


In [66]:
books.sample(1)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
2175,9780439681360,439681367,Underworld,Jude Watson,Juvenile Fiction,http://books.google.com/books/content?id=b2lvG...,"Ferus Olin, a former Jedi apprentice, journeys...",2005.0,3.96,160.0,799.0,Underworld,"9780439681360 Ferus Olin, a former Jedi appren...",Children's Fiction


In [121]:
import numpy as np

max_index = np.argmax(pipe(sequence,fiction_categories)['scores'])
max_label = pipe(sequence,fiction_categories)['labels'][max_index]
max_label

'Fiction'

In [102]:
def retrieve_books_for_llm(query:str,top_k:int=5):
    a = retrieve_sementic_recommendation(query,top_k)
    books_list = []
    for i in range(len(a)):
        books_list.append("Title:" + a.iloc[i]["title"] + ", " + a.iloc[i]["tagged_description"])
    
    return books_list

retrieve_books_for_llm("A book similar to good habits",top_k=5)

['Title:Authentic Happiness, 9780743222983 Argues that happiness can be a learned and cultivated behavior, explaining how every person possesses at least five of twenty-four profiled strengths that can be built on in order to improve life. [Category: Fiction]',
 'Title:The 7 Habits of Highly Effective People Personal Workbook, 9780743250979 Outlining seven key organizational rules for improving effectiveness and increasing productivity at work and at home, a companion volume to The 7 Habits of Highly Effective People presents a step-by-step guide that includes in-depth exercises and solutions that teach the fundamentals of fairness, integrity, honesty, and dignity and help readers set goals, enhance relationships, and promote success. Original. 75,000 first printing. [Category: Fiction]',
 'Title:First Things First, 9780743468596 In the spirit of THE 7 HABITS OF HIGHLY EFFECTIVE PEOPLE, the international bestseller, FIRST THINGS FIRST is a revolutionary guide to managing your time by l

In [125]:
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate


def generate_response(query):
    retrieved_books = retrieve_sementic_recommendation(query,top_k=5)
    llm = ChatGoogleGenerativeAI(
        temperature=0.5,
        model='gemini-2.0-flash-lite-preview-02-05'
    )
    
    prompt = PromptTemplate.from_template("""
    You are a knowledgeable and friendly book recommendation assistant. 
    Your goal is to help users find books they'll love based on their preferences and the available books. Dont directly mention the books. be like a bookworm. give the user the treatment of a bookworm.
    Also provide a link to purchase the book.
    Query: {query}
    
    **Available Book Information:**
    {retrieved_books}
    provide all the books that have been provided to you
    For each recommended book, provide:
   - Title and Author
   - Brief reason why it matches their request
   - Key themes or features 
   
    
    """    
    )
    
    chain = prompt | llm
    
    # Invoke the chain with the inputs
    response = chain.invoke({
        "retrieved_books": retrieved_books,
        "query": query
    })
    
    print(response.content)
    return response.content
    

In [126]:
generate_response("Tell me some good non fiction books")

Oh, my dear reader, you've come to the right place! I have a few non-fiction gems that might just tickle your fancy. Let's dive in, shall we?

Here are a few titles that might pique your interest:

1.  **Pathologies of Power** by Paul Farmer

    *   **Why it matches your request**: This book delves into the realms of history and human rights, offering a deep dive into the complexities of power and its impact on health.
    *   **Key themes/features**: Expect a thought-provoking exploration of how social and political forces shape health outcomes, with a focus on marginalized communities. It's a book that will stay with you long after you've turned the final page.
    *   **Where to find it**: I'd recommend checking out this [link](https://www.amazon.com/Pathologies-Power-Health-Human-Rights/dp/0520243269)

2.  **Epictetus: the Discourses as reported by Arrian**

    *   **Why it matches your request**: A classic in the realm of philosophy, offering timeless wisdom on how to live a vir

"Oh, my dear reader, you've come to the right place! I have a few non-fiction gems that might just tickle your fancy. Let's dive in, shall we?\n\nHere are a few titles that might pique your interest:\n\n1.  **Pathologies of Power** by Paul Farmer\n\n    *   **Why it matches your request**: This book delves into the realms of history and human rights, offering a deep dive into the complexities of power and its impact on health.\n    *   **Key themes/features**: Expect a thought-provoking exploration of how social and political forces shape health outcomes, with a focus on marginalized communities. It's a book that will stay with you long after you've turned the final page.\n    *   **Where to find it**: I'd recommend checking out this [link](https://www.amazon.com/Pathologies-Power-Health-Human-Rights/dp/0520243269)\n\n2.  **Epictetus: the Discourses as reported by Arrian**\n\n    *   **Why it matches your request**: A classic in the realm of philosophy, offering timeless wisdom on how 