In [1]:
import importlib.metadata
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

In [2]:
# Step 1 - Read the hotels Markdown export into a single string
with open("../2_JSON To Markdown/hotels.md", encoding="utf-8") as file:
    markdown_text = file.read()

# Step 2 - Cut the text at its headings. Each tuple pairs a Markdown heading
# marker with the metadata key under which that heading's text is recorded
# on the resulting chunks.
header_levels = [
    ("#", "Hotel Name"),
    ("##", "Section"),
    ("###", "Review"),
]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)
documents: list[Document] = splitter.split_text(markdown_text)

In [3]:
# Step 3 - Create the HuggingFace embedding model (original note: compatible with Groq).
# The same `embedding` object is handed to Chroma both when the store is
# built and when it is reloaded further down.
embedding_model_name = "all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)




In [4]:
# Step 4 - Store chunks in Chroma
# Give every chunk a stable, position-based id. Without explicit ids, each run
# of this cell adds the same chunks again under new random ids, so the
# collection saved in "chroma_store" keeps growing with duplicates and
# similarity searches return repeated hits. With fixed ids a re-run
# overwrites the existing entries instead.
# Caveats: a store already polluted by earlier runs has to be deleted once,
# and if hotels.md later yields fewer chunks the surplus old ids remain.
chunk_ids = [f"hotels-md-chunk-{position}" for position in range(len(documents))]
vectorstore = Chroma.from_documents(
    documents,
    embedding,
    ids=chunk_ids,
    persist_directory="chroma_store",
)

In [5]:
# Step 5 - Persist vector store
# Chroma 0.4+ writes to `persist_directory` automatically, and LangChain marks
# `persist()` as deprecated (calling it emits a deprecation warning). Only
# older Chroma releases still need the explicit flush, so it is called just
# for those; on newer releases nothing further is required.
chroma_major, chroma_minor = (
    int(part) for part in importlib.metadata.version("chromadb").split(".")[:2]
)
if (chroma_major, chroma_minor) < (0, 4):
    vectorstore.persist()

print("✅ Vector storage created and persisted.")

✅ Vector storage created and persisted.


  vectorstore.persist()


# Example

In [7]:
# Reopen the collection saved under "chroma_store", rebinding `vectorstore`
# to a fresh handle that uses the same `embedding` object as its embedding
# function.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory="chroma_store",
)

  vectorstore = Chroma(persist_directory="chroma_store", embedding_function=embedding)


In [8]:
# Example: fetch the five chunks most similar to the hotel name, each paired
# with its relevance score. Raise `k` if more candidates are needed.
retrieved_docs = vectorstore.similarity_search_with_relevance_scores(
    "The Residence Tunis", k=5
)

In [9]:
# Keep only the hits whose "Hotel Name" metadata mentions the target hotel;
# the relevance score in each (document, score) pair is not needed here.
filtered = [
    hit
    for hit, _score in retrieved_docs
    if "The Residence Tunis" in hit.metadata.get("Hotel Name", "")
]

# Show each surviving chunk, followed by its metadata and a separator rule
for doc in filtered:
    print("🧩 Chunk:\n", doc.page_content)
    print("📌 Metadata:", doc.metadata)
    print("=" * 50)

🧩 Chunk:
 **Rating**: 5.0
**Date**: Apr 2025
**Title**: Excellent family holiday
**Text**: The Residence is an institution in Tunis with a high quality service. Everyone is taking care of you to please you.  Large rooms with very comfortable beds, very good Spa with sauna/hammam and small gym.  Breakfast is very complete and excellent, the Tunisian restaurant is also delicious.  For the kids the kids club leaders are adorable they had a memorable stay A big thank you to the entire Residence team!  
---
📌 Metadata: {'Hotel Name': 'Hotel Name: The Residence Tunis', 'Review': '🔸 Review 8', 'Section': '💬 Reviews'}
