# In [None]:
import json
import os
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv

# Load keys from .env file if available
load_dotenv()

# --------------------------
# CONFIGURATION
# --------------------------
# Every setting can be overridden through an environment variable (or an entry
# in the .env file loaded above). The fallbacks are the original hard-coded
# values, so behaviour is unchanged when no override is set.
JSON_PATH = os.getenv(
    "SCHEMES_JSON_PATH",
    "D:/gov-scheme-assistant-updated/threetry/schemes.json",
)
DB_DIR = os.getenv("RAG_DB_DIR", "rag_db")
EMBED_MODEL = os.getenv(
    "RAG_EMBED_MODEL",
    "sentence-transformers/all-MiniLM-L6-v2",  # lightweight & fast
)

# --------------------------
# STEP 1: LOAD AND PARSE JSON
# --------------------------
def _section_lines(value):
    """Return the text lines contributed by a single section value.

    A string contributes itself. A list contributes each of its items, with
    ``None`` items dropped and non-string items converted via ``str`` so the
    final ``"\\n".join`` can never fail on a stray number or nested object.
    Any other value type contributes nothing.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        return [
            item if isinstance(item, str) else str(item)
            for item in value
            if item is not None
        ]
    return []


with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

docs = []
skipped = 0
for entry in data:
    # Tolerate malformed entries instead of aborting the whole load.
    kb = entry.get("knowledge_base_entry") if isinstance(entry, dict) else None
    if not isinstance(kb, dict):
        skipped += 1
        continue

    # `or ""` also covers an explicit JSON null, which would otherwise be
    # rendered as the literal text "None".
    scheme = kb.get("scheme") or ""
    summary = kb.get("summary") or ""

    # Flatten nested fields (key_information, all_extracted_sections)
    section_lines = []
    for section in ("key_information", "all_extracted_sections"):
        section_data = kb.get(section, {})
        if isinstance(section_data, dict):
            for value in section_data.values():
                section_lines.extend(_section_lines(value))

    # The "Scheme:"/"Summary:" prefixes make the combined text non-empty even
    # for a blank entry, so emptiness is checked on the actual content.
    has_content = bool(
        scheme or summary or any(line.strip() for line in section_lines)
    )
    if not has_content:
        skipped += 1
        continue

    # Combine all text
    full_text = "\n".join(
        [f"Scheme: {scheme}", f"Summary: {summary}", *section_lines]
    ).strip()

    # Create Document for embedding
    docs.append(
        Document(page_content=full_text, metadata={"scheme": scheme or "Unknown"})
    )

print(f"Loaded {len(docs)} documents.")
if skipped:
    print(f"Skipped {skipped} malformed or empty entries.")