In [104]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv

In [105]:
load_dotenv()

True

In [106]:
import pandas as pd
books = pd.read_csv('books_cleaned.csv')
books["tagged_description"] = books["tagged_description"] + " [Category: " + books["categories"].astype(str) + "]"

In [107]:
books['tagged_description'][0]

'9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the wors

In [108]:
books['tagged_description'].to_csv('tagged_description.txt',
                                  sep="\n",
                                  index = False)

In [109]:
raw_documents = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0,chunk_overlap=0,separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 18, which is longer than the specified 0
Created a chunk of size 1188, which is longer than the specified 0
Created a chunk of size 1256, which is longer than the specified 0
Created a chunk of size 393, which is longer than the specified 0
Created a chunk of size 336, which is longer than the specified 0
Created a chunk of size 510, which is longer than the specified 0
Created a chunk of size 507, which is longer than the specified 0
Created a chunk of size 1008, which is longer than the specified 0
Created a chunk of size 218, which is longer than the specified 0
Created a chunk of size 863, which is longer than the specified 0
Created a chunk of size 327, which is longer than the specified 0
Created a chunk of size 227, which is longer than the specified 0
Created a chunk of size 909, which is longer than the specified 0
Created a chunk of size 1104, which is longer than the specified 0
Created a chunk of size 1215, which is longer than the specified 0
Create

In [110]:
documents[1].page_content

'9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the wors

In [111]:
db_books = Chroma.from_documents(
    documents,
    embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
)

In [112]:
query = "A book related to discipline"
docs = db_books.similarity_search(query,k=5)
for doc in docs:
    isbn = int(doc.page_content.split()[0].strip())
    books_detail = books[books["isbn13"]==isbn]
    print(books_detail.to_string(index=False))

       isbn13     isbn10               title      authors             categories                                                                                             thumbnail                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              description  published_year  average_rating  num_pages  ratings_count               

In [113]:
def retrieve_sementic_recommendation(query:str,top_k:int=10)->pd.DataFrame:
    recs = db_books.similarity_search(query,k=top_k)
    books_list = []
    for i in range(len(recs)):
        books_list.append(int(recs[i].page_content.strip('"').split()[0]))
    
    return books[books["isbn13"].isin(books_list)].head(top_k) 

In [114]:
retrieve_sementic_recommendation("A book similar to Atomic habits",top_k=5)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
826,9780142402931,0142402931,The Far Side of Evil,Sylvia Engdahl,Juvenile Fiction,http://books.google.com/books/content?id=7nijj...,A young girl from an advanced civilization is ...,2005.0,3.98,324.0,57.0,The Far Side of Evil,9780142402931 A young girl from an advanced ci...
2270,9780441012374,044101237X,Rocket Ship Galileo,Robert Anson Heinlein,Fiction,http://books.google.com/books/content?id=1421n...,Three teenagers and an older scientist develop...,2004.0,3.71,211.0,6199.0,Rocket Ship Galileo,9780441012374 Three teenagers and an older sci...


In [115]:
from transformers import pipeline
fiction_categories = ["Fiction","Non-fiction"]
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device='mps'
                )

Device set to use mps


In [116]:
category_mapping = {'Fiction':"Fiction",
                    'Juvenile Fiction':"Children's Fiction",
                    'Biography & Autobiography':"Non-fiction",
                    'History':"Non-fiction",
                    'Literary Criticism' : "Non-fiction",
                    'Philosophy':"Non-fiction",
                    'Religion':"Non-fiction",
                    'Comics & Graphic Novels':"Fiction",
                    'Drama':"Fiction",
                    'Poetry':"Fiction",
                    'Science':"Non-fiction",
                    'Juvenile Nonfiction':"Children's Non-fiction",
                    }

In [117]:
books['simple_categories'] = books['categories'].map(category_mapping)

In [118]:
books.sample(2)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories
4090,9780812971897,812971892,No God But God,Reza Aslan,Religion,http://books.google.com/books/content?id=ERvuB...,An authoritative study of the Islamic faith in...,2006.0,4.13,310.0,16907.0,"No God But God: The Origins, Evolution, and Fu...",9780812971897 An authoritative study of the Is...,Non-fiction
3499,9780743487696,743487699,My Antonia,Willa Cather;Alyssa Harad,Fiction,http://books.google.com/books/content?id=P0XNP...,A successful lawyer remembers his boyhood in N...,2004.0,3.78,314.0,5974.0,My Antonia,9780743487696 A successful lawyer remembers hi...,Fiction


In [119]:
sequence = books.loc[books['simple_categories'] == "Fiction","description"].reset_index(drop=True)

In [120]:
pipe(sequence,fiction_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [121]:
import numpy as np

max_index = np.argmax(pipe(sequence,fiction_categories)['scores'])
max_label = pipe(sequence,fiction_categories)['labels'][max_index]
max_label

'Fiction'

In [122]:
def generate_predictions(sequence,categories):
    predictions = pipe(sequence,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]
    return max_label

In [123]:
from tqdm import tqdm

actual_categories = []
predicted_categories = []

for i in tqdm(range(0,300)):
    sequence = books.loc[books['simple_categories'] == 'Fiction','description'].reset_index(drop=True)[i]
    predicted_categories += [generate_predictions(sequence,fiction_categories)]
    actual_categories += ['Fiction']

100%|██████████| 300/300 [01:21<00:00,  3.67it/s]


In [124]:
for i in tqdm(range(0,300)):
    sequence = books.loc[books['simple_categories'] == 'Non-fiction','description'].reset_index(drop=True)[i]
    predicted_categories += [generate_predictions(sequence,fiction_categories)]
    actual_categories += ['Non-fiction']

100%|██████████| 300/300 [01:20<00:00,  3.74it/s]


In [125]:
import pandas as pd

predictions_df = pd.DataFrame({"actual_categories":actual_categories,"predicted_categories":predicted_categories})
predictions_df['correct_predictions'] = (
    np.where(predictions_df['actual_categories'] == predictions_df['predicted_categories'],1,0)
)

In [126]:
predictions_df['correct_predictions'].sum()/len(predictions_df)

0.7483333333333333

In [127]:
isbns = []
predicted_categories = []

missing_categories = books.loc[books['simple_categories'].isna(),["isbn13","description"]].reset_index(drop=True)

In [128]:
for i in tqdm(range(0,len(missing_categories))):
    sequence = missing_categories['description']
    predicted_categories = [generate_predictions(sequence,fiction_categories)]
    isbns += [missing_categories['isbn13'][i]]

100%|██████████| 1473/1473 [11:51<00:00,  2.07it/s] 


In [87]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5299 entries, 0 to 5298
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   isbn13              5299 non-null   int64  
 1   isbn10              5299 non-null   object 
 2   title               5299 non-null   object 
 3   authors             5265 non-null   object 
 4   categories          5269 non-null   object 
 5   thumbnail           5131 non-null   object 
 6   description         5299 non-null   object 
 7   published_year      5299 non-null   float64
 8   average_rating      5299 non-null   float64
 9   num_pages           5299 non-null   float64
 10  ratings_count       5299 non-null   float64
 11  title_and_subtitle  5299 non-null   object 
 12  tagged_description  5299 non-null   object 
 13  simple_categories   3826 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 579.7+ KB


In [None]:
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

def generate_response(query):
    retrieved_books = retrieve_sementic_recommendation(query,top_k=5)
    llm = ChatGoogleGenerativeAI(
        temperature=0.5
        model='gemini-2.0-flash-lite-preview-02-05'
    )
    
    prompt = PromptTemplate.from_template("""
    Query: {query}
    
    """
    
        
    )
    