In [9]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import pandas as pd
from sentence_transformers import SentenceTransformer

In [10]:
df = pd.read_csv("books_cleaned.csv")

In [47]:
df['tagged_description']

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [11]:
df['tagged_description'].to_csv("tagged_description.txt", sep = "\n", index = False, header = False)

In [12]:
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

In [13]:
text_splitter = CharacterTextSplitter(chunk_size = 0, chunk_overlap = 0, separator="\n")

In [14]:
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 1168, which is longer than the specified 0
Created a chunk of size 1214, which is longer than the specified 0
Created a chunk of size 373, which is longer than the specified 0
Created a chunk of size 309, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a chunk of size 482, which is longer than the specified 0
Created a chunk of size 960, which is longer than the specified 0
Created a chunk of size 188, which is longer than the specified 0
Created a chunk of size 843, which is longer than the specified 0
Created a chunk of size 296, which is longer than the specified 0
Created a chunk of size 197, which is longer than the specified 0
Created a chunk of size 881, which is longer than the specified 0
Created a chunk of size 1088, which is longer than the specified 0
Created a chunk of size 1189, which is longer than the specified 0
Created a chunk of size 304, which is longer than the specified 0
Create

In [15]:
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gi

In [16]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name="sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [18]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [None]:
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams


In [55]:
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME")

In [56]:
# Initialize Qdrant client
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Create collection if not exists
client.recreate_collection(
    collection_name=QDRANT_COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)  # Adjust size to match embedding model
)

  client.recreate_collection(


True

In [57]:
# Extract text and metadata from documents
texts = [doc.page_content for doc in documents]

# ✅ Ensure metadata contains `isbn13`
metadatas = []
for doc in documents:
    metadata = doc.metadata.copy()  # Avoid modifying original metadata
    metadata["isbn13"] = str(metadata.get("isbn13", "unknown"))  # Convert to string, ensure it's present
    metadatas.append(metadata)

# Generate embeddings
qdrant = Qdrant(
    client=client,
    collection_name=QDRANT_COLLECTION_NAME,
    embeddings=hf  # Correctly pass the embedding model
)

# ✅ Store data in Qdrant with proper metadata
qdrant.add_texts(
    texts=texts,
    metadatas=metadatas  # Metadata now correctly includes `isbn13`
)


['d867be27485c47b3ba0c4bf4cbe98313',
 '82a1cf6125254a72885843fbcd2dc17c',
 'af30505a16734a4a92f9adff64eb99b1',
 '46ce7ec0f522451e8e894c978b185dc0',
 '79ceff22df8b413c952b0a3762f5c8af',
 '1694c359e19241be9b06aeb3376734ba',
 'db2416413395469194e233f1fdfc5162',
 '4f9b1823ea1e4d29909069d4fe31c0f2',
 '64112d81611f46e99983adb71f96e589',
 '9a3c79954ad34e7e97e059210463359c',
 '43f172eb7d9249708cb01690b1520318',
 'b543dcf7a099487e8246bca854ba2a9e',
 'bde4958ab76e4702bb647c00b511b96c',
 '054aadbddaf6427fa055a9b1d4cd8eda',
 'ff4e3a90daab495294610a55c64c2b6f',
 'ad010ef40e634a6fb3bcd6f8a3b5a55c',
 '28065d98f70b435d8f6eebaf8a2df01d',
 '452fd97400154dd2b136fcad0e437165',
 '0f5cc9889c7247528b61dbda8fb1dd16',
 'b739906f7b9c4e4ba31c57e61ee05cfb',
 'e012031bd6c446beb686d52da83b3948',
 'd815bca3a1cf44eaa77ee60e37fd8797',
 '35636167d2384f30b8c410afba135d96',
 '3eadfc55151e402d8e29f0c1b5eeb8fa',
 '02b0e62a335a41699abfea064f8f5e18',
 '57f126b7050d4b1da190b3011bca1dfb',
 '8292f7590f004184abefd8940b66e032',
 

In [46]:
test_query = "A story about adventure"
query_embedding = hf.embed_query(test_query)
test_results = qdrant.similarity_search_by_vector(query_embedding, k=5)

for result in test_results:
    print(result.metadata)  # See what metadata exists

{'source': 'tagged_description.txt', '_id': '6e9614c2-fcd2-4e76-af44-8a946cbd18fa', '_collection_name': 'Book-recommender'}
{'source': 'tagged_description.txt', '_id': '51f5d586-ea16-46a9-974b-c94844bc0285', '_collection_name': 'Book-recommender'}
{'source': 'tagged_description.txt', '_id': '37e4863e-eea9-4619-9e99-59f9c8a86c78', '_collection_name': 'Book-recommender'}
{'source': 'tagged_description.txt', '_id': '1aa8d854-c9b8-4e6c-9fd1-774c6f14fb74', '_collection_name': 'Book-recommender'}
{'source': 'tagged_description.txt', '_id': '2ecbfd1d-b479-4029-b18c-a8abc093643b', '_collection_name': 'Book-recommender'}


In [49]:
query = "A book to teach children about nature"
results = qdrant.similarity_search(query, k=10)  # Retrieves top 5 similar books

for result in results:
    print(result.page_content)  # This should print the most relevant book summaries


9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.
9780786808380 Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects.
9780786808397 Introduce your baby to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to exopse little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies eac

In [37]:
int(results[0].page_content.split()[0].strip())

9780786808069

In [38]:
df[df["isbn13"] == int(results[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [43]:
def retrieve_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    recs = qdrant.similarity_search(query, k = top_k)

    books_list = []

    for index in range(len(recs)):
        books_list.append(int(recs[index].page_content.strip('"').split()[0]))

    return df[df["isbn13"].isin(books_list)]

In [45]:
temp = retrieve_semantic_recommendation("A book about world war 1", 3)
temp

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
1429,9780340774779,340774770,"The Road to War, 1933-1939",Andrew Hunt,Germany,http://books.google.com/books/content?id=YYLPG...,This text uses a source-based approach to stud...,2000.0,0.0,128.0,0.0,"The Road to War, 1933-1939",9780340774779 This text uses a source-based ap...
2843,9780575073357,575073357,The Man in the High Castle,Philip K. Dick,Alternative histories (Fiction),http://books.google.com/books/content?id=8HBAP...,It is 1962 and the Second World War has been o...,2001.0,3.63,256.0,437.0,The Man in the High Castle,9780575073357 It is 1962 and the Second World ...
3180,9780688085872,688085873,A Short History of World War II,James L. Stokesbury,History,http://books.google.com/books/content?id=uDBhl...,"Despite the numerous books on World War II, un...",1980.0,3.93,416.0,454.0,A Short History of World War II,9780688085872 Despite the numerous books on Wo...


In [51]:
retrieve_semantic_recommendation("A book about billionares", 3)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
1286,9780312878603,312878605,The Collected Stories of Arthur C. Clarke,Arthur C. Clarke,Fiction,http://books.google.com/books/content?id=HLz1g...,Introduces readers to the author's shorter wor...,2002.0,4.3,966.0,3964.0,The Collected Stories of Arthur C. Clarke,9780312878603 Introduces readers to the author...
4480,9781416500339,1416500332,Oedipus the King,Sophocles;Frederic Will;Bernard Knox,Drama,http://books.google.com/books/content?id=jn1bN...,"Each volume in a collection of affordable, rea...",2005.0,3.69,144.0,706.0,Oedipus the King,9781416500339 Each volume in a collection of a...
4481,9781416500391,1416500391,Gulliver's Travels and A Modest Proposal,Jonathan Swift;Jesse Gale,Fiction,http://books.google.com/books/content?id=WWBcc...,"Each volume in a collection of affordable, rea...",2005.0,3.79,416.0,2001.0,Gulliver's Travels and A Modest Proposal,9781416500391 Each volume in a collection of a...
