In [None]:
from langchain_community.document_loaders import TextLoader
from  langchain_text_splitters import CharacterTextSplitter


In [None]:
# Load environment variables from the `.env` file in the working directory.
# NOTE(review): nothing in the visible cells reads an environment variable;
# confirm this is still needed.
from dotenv import load_dotenv

load_dotenv(dotenv_path='.env')

In [None]:
import pandas as pd

# Book table used throughout the notebook; later cells read its
# "tagged_description" and "isbn13" columns.
books = pd.read_csv('books_cleaned.csv')

In [None]:
# Display the loaded book table for a quick visual check.
books

In [None]:
# Preview the text that will be embedded; later cells parse the first
# whitespace-separated token of each entry as the book's ISBN-13.
books["tagged_description"].head()

In [None]:
# Write one tagged description per line as plain UTF-8 text.
#
# `Series.to_csv(sep="\n")` is deliberately not used: `sep` is a *field*
# delimiter, and CSV quoting wraps every description that contains a double
# quote in an extra pair of quotes, so the line no longer starts with the
# ISBN and the later `int(...)` parse fails.  (Newer Python versions may also
# reject a newline delimiter outright.)
with open("tagged_description.txt", "w", encoding="utf-8") as description_file:
    for description in books["tagged_description"].dropna():
        # Collapse internal whitespace/newlines so each record occupies
        # exactly one line; the splitter downstream cuts on "\n".
        description_file.write(" ".join(str(description).split()) + "\n")

In [None]:
# Read the exported descriptions explicitly as UTF-8.  Without an encoding
# the loader falls back to the platform default, which raises a decode error
# on systems whose locale is not UTF-8 (e.g. Windows).
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# chunk_size=0 with a newline separator is meant to cut at every line so each
# description becomes its own document (the splitter may warn that chunks
# exceed the requested size — that is expected here).
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Inspect the first split document to verify the chunking result.
documents[0]

In [None]:
# Import from `langchain_community` (already a dependency of this notebook)
# rather than the deprecated `langchain.embeddings` / `langchain.vectorstores`
# shim paths.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load Hugging Face sentence-embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the Chroma vector store from the split documents.
# NOTE(review): re-running this cell in the same kernel may add the documents
# to the store a second time — confirm, and restart the kernel if results
# start to contain duplicates.
db_books = Chroma.from_documents(documents, embedding=embeddings)


In [None]:
# Example query: ask the vector store for the 10 nearest documents; each
# result is unpacked below as a (document, score) pair.
query = "A book to teach children about nature"
docs = db_books.similarity_search_with_score(query, k=10)

In [None]:
# Show the raw search results.
docs

In [None]:
# Take the best match and look up its row in the book table.
doc, score = docs[0]

# The first whitespace-separated token of a document is the ISBN-13.  Strip a
# surrounding double quote before parsing: CSV-quoted records start with `"`,
# which would otherwise make `int()` raise ValueError.
books[books["isbn13"] == int(doc.page_content.split()[0].strip('"'))]

In [None]:
def retrieve_semantic_representation(
        query: str,
        top_k: int = 10,
        initial_top_k: int = 50,
) -> pd.DataFrame:
    """Return the books most semantically similar to ``query``.

    Relies on two notebook-level globals: ``db_books`` (the vector store) and
    ``books`` (the DataFrame with an ``isbn13`` column).

    Args:
        query: Free-text description of the book being looked for.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of candidates requested from the vector store.
            At least ``top_k`` candidates are always requested.

    Returns:
        Up to ``top_k`` rows of ``books``, ordered from most to least similar
        according to the order returned by the vector store.  Documents whose
        first token is not an integer ISBN are reported and skipped.
    """
    recs = db_books.similarity_search(query, k=max(initial_top_k, top_k))

    # ISBNs in similarity order, without duplicates.
    ranked_isbns = []
    seen = set()
    for rec in recs:
        tokens = rec.page_content.split()
        # Strip a surrounding double quote left by CSV quoting; an empty
        # document yields "" and is rejected by int() below.
        isbn_str = tokens[0].strip('"') if tokens else ""
        try:
            isbn = int(isbn_str)
        except ValueError:
            print(f"Skipping invalid entry: {rec.page_content}")
            continue
        if isbn not in seen:
            seen.add(isbn)
            ranked_isbns.append(isbn)

    # Map each ISBN to its similarity rank so the result can be ordered by
    # relevance instead of by the row order of `books`.
    rank = {isbn: position for position, isbn in enumerate(ranked_isbns)}

    matches = books[books["isbn13"].isin(ranked_isbns)]
    ordered = matches.sort_values(
        "isbn13",
        key=lambda column: column.map(rank),
        kind="stable",
    )
    return ordered.head(top_k)


In [None]:
# Run the retrieval helper on an example query (default top_k) and print the
# resulting rows as plain text.
query = "A children's book about nature and wildlife"
top_books = retrieve_semantic_representation(query)

print(top_books)


In [None]:
# Rich display of the first few recommended books.
top_books.head()