In [1]:
import json
import os
import sys
from pathlib import Path
from typing import List, Tuple, Dict, Any

from tqdm import tqdm

# LangChain / Chroma imports
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Put the parent of the current working directory on sys.path.
# NOTE(review): Path() resolves against the process working directory, so this
# assumes the notebook is launched from a direct child of the project root
# (e.g. notebooks/) — confirm before running from elsewhere.
ROOT_DIR = Path().resolve().parent  # parent of the current working directory
sys.path.append(str(ROOT_DIR))

# Import project paths. This must stay below the sys.path.append above;
# presumably the `config` package lives in ROOT_DIR — verify.
from config.paths import DATA_DIR, VECTORSTORE1_DIR , VECTORSTORE2_DIR

# -----------------------
# CONFIG
# -----------------------
# Directory scanned for input *.json files.
JSON_DIR = DATA_DIR / "processed"
# Root directory under which one "<label>_store" sub-directory is created per
# embedding model. Created eagerly here (import-time side effect).
VECTORSTORE_DIR = Path(VECTORSTORE1_DIR)
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

# Display label -> model name passed to HuggingFaceEmbeddings(model_name=...).
# The label is also used to derive the store directory name.
EMBED_MODELS = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-Base": "BAAI/bge-base-en-v1.5",
    "BGE-Large": "BAAI/bge-large-en-v1.5",
}

# -----------------------
# HELPERS
# -----------------------
def list_json_files(json_dir: Path) -> List[Path]:
    """Return the ``*.json`` files directly inside ``json_dir`` in sorted order.

    The number of matches and each matching path are printed as a side effect.
    Sub-directories are not searched.
    """
    json_paths = list(json_dir.glob("*.json"))
    json_paths.sort()
    print(f"Found {len(json_paths)} JSON files in {json_dir}")
    for json_path in json_paths:
        print(" -", json_path)
    return json_paths


def load_json_file(path: Path) -> Any:
    """Read the file at ``path`` as UTF-8 text and return the decoded JSON value.

    Errors from opening, decoding or parsing the file propagate to the caller.
    """
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def load_all_jsons(json_dir: Path) -> List[Any]:
    """Load every JSON file found by ``list_json_files(json_dir)``.

    Returns one dict per successfully loaded file, with the file name under
    ``"__source_file"`` and the decoded content under ``"__raw"``. A file that
    fails to load is reported on stdout and skipped (best effort), so one bad
    file does not abort the whole run.
    """
    loaded = []
    for json_path in list_json_files(json_dir):
        try:
            payload = load_json_file(json_path)
        except Exception as e:
            print(f"‚ùå Failed to read {json_path.name}: {e}")
        else:
            loaded.append({"__source_file": json_path.name, "__raw": payload})
    return loaded


def _clean_text(value: Any) -> str:
    """Return ``value`` stripped of surrounding whitespace, or ``""`` if it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def normalize_and_extract(raw_documents: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Flatten loaded JSON documents into parallel lists of texts and metadata.

    Each entry of ``raw_documents`` is a dict with ``"__source_file"`` (file
    name) and ``"__raw"`` (decoded JSON). ``"__raw"`` is expected to follow
    this schema:
    {
      "document_title": str,
      "preamble": str,
      "parts": [
        {
          "part_number": str,
          "part_title": str,
          "articles": [
            {
              "article_number": str,
              "article_title": str,
              "clauses": [str]
            }
          ]
        }
      ],
      "schedules": [
        {
          "schedule_number": str,
          "schedule_title": str
        }
      ]
    }

    One text unit is emitted per non-empty preamble, per article (its non-empty
    clauses joined by blank lines) and per schedule with a non-empty title.

    Malformed pieces are skipped rather than raising: a non-dict ``"__raw"``,
    part, article or schedule; a non-list ``parts``/``articles``/``clauses``/
    ``schedules``; and any non-string preamble, clause or schedule title.
    A malformed ``parts`` value does not prevent the document's schedules from
    being extracted.

    Returns:
        ``(texts, metadatas)`` where ``metadatas[i]`` describes ``texts[i]``.
    """
    texts: List[str] = []
    metadatas: List[Dict[str, Any]] = []

    for raw_doc in raw_documents:
        source_name = raw_doc.get("__source_file", "unknown")
        data = raw_doc.get("__raw")

        if not isinstance(data, dict):
            continue

        document_title = data.get("document_title", "")

        # Preamble (only when it is a non-blank string)
        preamble = _clean_text(data.get("preamble", ""))
        if preamble:
            texts.append(preamble)
            metadatas.append({
                "source": source_name,
                "document_title": document_title,
                "section_type": "preamble",
                "chapter": "Preamble",
                "section": "Preamble",
            })

        # Parts and articles. A malformed `parts` is treated as empty instead
        # of skipping the document, so its schedules below are still processed.
        parts = data.get("parts", [])
        if not isinstance(parts, list):
            parts = []

        for part in parts:
            if not isinstance(part, dict):
                continue

            part_num = part.get("part_number", "")
            part_title = part.get("part_title", "")
            chapter_label = f"Part {part_num}: {part_title}".strip()

            articles = part.get("articles", [])
            if not isinstance(articles, list):
                continue

            for article in articles:
                if not isinstance(article, dict):
                    continue

                art_num = article.get("article_number", "")
                art_title = article.get("article_title", "")
                clauses = article.get("clauses", [])

                if not isinstance(clauses, list):
                    continue

                # Join all usable clauses into a single text block for this
                # article; non-string and blank clauses are dropped.
                clause_texts = [text for text in map(_clean_text, clauses) if text]
                combined_text = "\n\n".join(clause_texts)

                if not combined_text:
                    continue

                section_label = f"Article {art_num}: {art_title}".strip()
                texts.append(combined_text)
                metadatas.append({
                    "source": source_name,
                    "document_title": document_title,
                    "section_type": "article",
                    "chapter": chapter_label,
                    "section": section_label,
                    "part_number": part_num,
                    "article_number": art_num,
                    # Raw length of the clauses list (blank entries included).
                    "num_clauses": len(clauses),
                })

        # Schedules: the schema carries no body, so the title is the text.
        schedules = data.get("schedules", [])
        if not isinstance(schedules, list):
            continue

        for schedule in schedules:
            if not isinstance(schedule, dict):
                continue

            schedule_num = schedule.get("schedule_number", "")
            schedule_title = _clean_text(schedule.get("schedule_title", ""))
            if not schedule_title:
                continue

            texts.append(schedule_title)
            metadatas.append({
                "source": source_name,
                "document_title": document_title,
                "section_type": "schedule",
                "chapter": "Schedules",
                "section": f"Schedule {schedule_num}".strip(),
                "schedule_number": schedule_num,
            })

    print(f"Normalized and extracted {len(texts)} textual units.")
    return texts, metadatas


def build_and_persist_chroma(label: str, model_name: str, texts: List[str], metadatas: List[Dict[str, Any]]):
    """Embed ``texts`` with one model and persist them as a Chroma store.

    The store is written to ``VECTORSTORE_DIR / "<label>_store"``, where the
    label is lower-cased and ``-`` is replaced by ``_``.

    Args:
        label: Display name of the model; also used for the store directory.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        texts: Text units to embed. Blank entries are dropped.
        metadatas: Metadata dicts; ``metadatas[i]`` must describe ``texts[i]``.

    Returns:
        ``True`` when the store was built, ``False`` when the input was
        empty/inconsistent or any error occurred (a message is printed; no
        exception escapes for embedding/store failures).

    NOTE(review): ``Chroma.from_texts`` appears to add to whatever collection
    already exists in ``persist_directory``; re-running this against an
    existing store may duplicate documents — confirm and clear the directory
    first if so.
    """
    print(f"\n=== Building Chroma store for {label} ({model_name}) ===")
    if not texts:
        print(f"‚ö†Ô∏è No texts to embed for {label}. Skipping.")
        return False

    # zip() would silently truncate on a length mismatch and attach the wrong
    # (or no) metadata to some texts, so reject inconsistent input up front.
    if metadatas is None or len(texts) != len(metadatas):
        got = "no" if metadatas is None else len(metadatas)
        print(f"‚ùå Error while building {label} store: {len(texts)} texts but {got} metadatas.")
        return False

    # Filter texts and metadata together, before any expensive work, so an
    # all-blank input never triggers a model download or creates a directory.
    kept = [(text, meta) for text, meta in zip(texts, metadatas) if text.strip()]
    if not kept:
        print("‚ö†Ô∏è No non-empty texts to embed after filtering. Skipping.")
        return False
    final_texts = [text for text, _ in kept]
    final_metas = [meta for _, meta in kept]

    persist_dir = VECTORSTORE_DIR / f"{label.lower().replace('-', '_')}_store"
    persist_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading embedding model (this may download weights on first run)...")
        embedder = HuggingFaceEmbeddings(model_name=model_name)

        # Embedding sanity check on texts that will actually be stored.
        sample = final_texts[:2]
        emb_sample = embedder.embed_documents(sample)
        if not emb_sample or len(emb_sample) != len(sample):
            print("‚ùå Embedding sanity check failed.")
            return False

        Chroma.from_texts(
            texts=final_texts,
            embedding=embedder,
            metadatas=final_metas,
            persist_directory=str(persist_dir),
        )
        print(f"‚úÖ Successfully persisted {label} store at: {persist_dir}")
        print(f"   Total documents: {len(final_texts)}")
        return True

    except Exception as e:
        # Top-level boundary for one model: report and let the caller move on
        # to the next model instead of aborting the whole pipeline.
        print(f"‚ùå Error while building {label} store: {e}")
        return False


# -----------------------
# PIPELINE ENTRYPOINT
# -----------------------
def main():
    """Run the pipeline: load JSON files, extract text units, build one store per model.

    Stops early (after printing the reason) when the JSON directory is missing,
    when no file could be loaded, or when nothing was extracted. Otherwise a
    store is built for every entry of ``EMBED_MODELS`` and a per-model summary
    is printed.
    """
    print("üöÄ Starting full ingestion + embedding pipeline")
    print(f"üìÇ JSON Directory: {JSON_DIR}")
    print(f"üíæ Vector Store Directory: {VECTORSTORE_DIR}")

    if not JSON_DIR.exists():
        print(f"‚ùå JSON directory does not exist: {JSON_DIR}")
        return

    raw_docs = load_all_jsons(JSON_DIR)
    if not raw_docs:
        print("‚ùå No JSON documents loaded. Please check JSON_DIR path and files.")
        return

    texts, metadatas = normalize_and_extract(raw_docs)
    if not texts:
        print("‚ùå No texts extracted from documents. Please check the JSON structure.")
        return

    # One build per configured model, in declaration order.
    results = {
        name: build_and_persist_chroma(name, repo_id, texts, metadatas)
        for name, repo_id in EMBED_MODELS.items()
    }

    print("\nüîö Pipeline finished. Summary:")
    for name, succeeded in results.items():
        if succeeded:
            status = "‚úÖ OK"
        else:
            status = "‚ùå Skipped/Failed"
        print(f" - {name}: {status}")


if __name__ == "__main__":
    main()

üöÄ Starting full ingestion + embedding pipeline
üìÇ JSON Directory: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed
üíæ Vector Store Directory: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\chroma_db1
Found 10 JSON files in D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Bank and Financial Institution Act 2073.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Banking Offence and Punishment Act 2064.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Constitution of Nepal 2072.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Electronic Commerce Act 2081.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\International Financial Transactions Act 2054.json
 - D:\Fusemachine\MyPocketLawyer-AI

  embedder = HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Successfully persisted MiniLM store at: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\chroma_db1\minilm_store
   Total documents: 1930

=== Building Chroma store for BGE-Base (BAAI/bge-base-en-v1.5) ===
Loading embedding model (this may download weights on first run)...
‚úÖ Successfully persisted BGE-Base store at: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\chroma_db1\bge_base_store
   Total documents: 1930

=== Building Chroma store for BGE-Large (BAAI/bge-large-en-v1.5) ===
Loading embedding model (this may download weights on first run)...
‚úÖ Successfully persisted BGE-Large store at: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\chroma_db1\bge_large_store
   Total documents: 1930

üîö Pipeline finished. Summary:
 - MiniLM: ‚úÖ OK
 - BGE-Base: ‚úÖ OK
 - BGE-Large: ‚úÖ OK


In [2]:
import json
import os
import sys
from pathlib import Path
from typing import List, Tuple, Dict, Any

from tqdm import tqdm

# LangChain / Chroma imports
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Put the parent of the current working directory on sys.path.
# NOTE(review): Path() resolves against the process working directory, so this
# assumes the notebook is launched from a direct child of the project root
# (e.g. notebooks/) — confirm before running from elsewhere.
ROOT_DIR = Path().resolve().parent  # parent of the current working directory
sys.path.append(str(ROOT_DIR))

# Import project paths. This must stay below the sys.path.append above;
# presumably the `config` package lives in ROOT_DIR — verify.
from config.paths import DATA_DIR, VECTORSTORE1_DIR , VECTORSTORE2_DIR
# NOTE(review): newer LangChain releases may expose this class from the
# separate `langchain_text_splitters` package — confirm for the installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Directory scanned for input *.json files.
JSON_DIR = DATA_DIR / "processed"
# Root directory for the stores built from chunked text. build_and_persist_chroma
# reads this module-level name at call time, so assigning it here decides where
# subsequently built stores are written.
VECTORSTORE_DIR = Path(VECTORSTORE2_DIR)
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

# Display label -> model name passed to HuggingFaceEmbeddings(model_name=...).
# The label is also used to derive the store directory name.
EMBED_MODELS = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-Base": "BAAI/bge-base-en-v1.5",
    "BGE-Large": "BAAI/bge-large-en-v1.5",
}

# -----------------------
# HELPERS
# -----------------------
def list_json_files(json_dir: Path) -> List[Path]:
    """List the ``*.json`` files that sit directly in ``json_dir``, sorted.

    Prints how many files matched, followed by one line per path, then
    returns the sorted paths. Nested directories are not traversed.
    """
    matches = [*json_dir.glob("*.json")]
    matches.sort()
    print(f"Found {len(matches)} JSON files in {json_dir}")
    for match in matches:
        print(" -", match)
    return matches


def load_json_file(path: Path) -> Any:
    """Return the JSON value stored in the UTF-8 encoded file at ``path``.

    Nothing is caught here: I/O, decoding and JSON syntax errors are raised
    to the caller.
    """
    with open(path, mode="r", encoding="utf-8") as source:
        document = json.loads(source.read())
    return document


def load_all_jsons(json_dir: Path) -> List[Any]:
    """Decode each JSON file reported by ``list_json_files(json_dir)``.

    Every file that loads contributes ``{"__source_file": <file name>,
    "__raw": <decoded JSON>}`` to the result. Files that fail are reported on
    stdout and left out, so a single unreadable file does not stop the run.
    """
    collected = []
    for candidate in list_json_files(json_dir):
        try:
            content = load_json_file(candidate)
        except Exception as e:
            print(f"‚ùå Failed to read {candidate.name}: {e}")
            continue
        collected.append({"__source_file": candidate.name, "__raw": content})
    return collected



def _clean_text(value: Any) -> str:
    """Return ``value`` stripped of surrounding whitespace, or ``""`` if it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def normalize_and_extract(
    raw_documents: List[Dict[str, Any]],
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 100,
    splitter: Any = None,
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Flatten loaded JSON documents into chunked texts and matching metadata.

    Each entry of ``raw_documents`` is a dict with ``"__source_file"`` and
    ``"__raw"`` (decoded JSON with ``document_title``, ``preamble``, ``parts``
    -> ``articles`` -> ``clauses`` and ``schedules``). Per document this emits:

    * the preamble as one unit (not chunked),
    * every article's non-empty clauses joined by blank lines and then split
      into chunks, each chunk carrying ``chunk_index``/``total_chunks``,
    * one unit per schedule with a non-empty title (not chunked).

    Malformed pieces (wrong container types, non-string preamble, clause or
    schedule title) are skipped rather than raising, and a malformed ``parts``
    value does not prevent the document's schedules from being extracted.

    Args:
        raw_documents: Output of ``load_all_jsons``.
        chunk_size: Maximum chunk length handed to the default splitter. With
            the splitter's default length function this is measured in
            characters, not words or tokens.
        chunk_overlap: Overlap between consecutive chunks, same unit.
        splitter: Optional object with a ``split_text(str) -> list[str]``
            method. When given, ``chunk_size``/``chunk_overlap`` are ignored.

    Returns:
        ``(texts, metadatas)`` where ``metadatas[i]`` describes ``texts[i]``.
    """
    texts: List[str] = []
    metadatas: List[Dict[str, Any]] = []

    # --- Text splitter for intra-article chunking ---
    if splitter is None:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,        # characters (default length function is len)
            chunk_overlap=chunk_overlap,  # keeps local context across chunk borders
            separators=["\n\n", ".", "?", "!", ";", ":", " "],
        )

    for raw_doc in raw_documents:
        source_name = raw_doc.get("__source_file", "unknown")
        data = raw_doc.get("__raw")

        if not isinstance(data, dict):
            continue

        document_title = data.get("document_title", "")

        # -------------------------
        # PREAMBLE
        # -------------------------
        preamble = _clean_text(data.get("preamble", ""))
        if preamble:
            texts.append(preamble)
            metadatas.append({
                "source": source_name,
                "document_title": document_title,
                "section_type": "preamble",
                "chapter": "Preamble",
                "section": "Preamble",
            })

        # -------------------------
        # PARTS -> ARTICLES -> CLAUSES
        # -------------------------
        # A malformed `parts` is treated as empty instead of skipping the
        # document, so its schedules below are still processed.
        parts = data.get("parts", [])
        if not isinstance(parts, list):
            parts = []

        for part in parts:
            if not isinstance(part, dict):
                continue

            part_num = part.get("part_number", "")
            part_title = part.get("part_title", "")
            chapter_label = f"Part {part_num}: {part_title}".strip()

            articles = part.get("articles", [])
            if not isinstance(articles, list):
                continue

            for article in articles:
                if not isinstance(article, dict):
                    continue

                art_num = article.get("article_number", "")
                art_title = article.get("article_title", "")
                section_label = f"Article {art_num}: {art_title}".strip()

                clauses = article.get("clauses", [])
                if not isinstance(clauses, list):
                    continue

                # Non-string and blank clauses are dropped before joining.
                clause_texts = [text for text in map(_clean_text, clauses) if text]
                combined_text = "\n\n".join(clause_texts)
                if not combined_text:
                    continue

                # --- Chunk the combined article text ---
                chunks = splitter.split_text(combined_text)
                total_chunks = len(chunks)
                for position, chunk in enumerate(chunks, start=1):
                    texts.append(chunk)
                    metadatas.append({
                        "source": source_name,
                        "document_title": document_title,
                        "section_type": "article",
                        "chapter": chapter_label,
                        "section": f"{section_label} (chunk {position}/{total_chunks})",
                        "part_number": part_num,
                        "article_number": art_num,
                        # Raw length of the clauses list (blank entries included).
                        "num_clauses": len(clauses),
                        "chunk_index": position,
                        "total_chunks": total_chunks,
                    })

        # -------------------------
        # SCHEDULES
        # -------------------------
        schedules = data.get("schedules", [])
        if not isinstance(schedules, list):
            continue

        for schedule in schedules:
            if not isinstance(schedule, dict):
                continue

            schedule_num = schedule.get("schedule_number", "")
            schedule_title = _clean_text(schedule.get("schedule_title", ""))
            if not schedule_title:
                continue

            texts.append(schedule_title)
            metadatas.append({
                "source": source_name,
                "document_title": document_title,
                "section_type": "schedule",
                "chapter": "Schedules",
                "section": f"Schedule {schedule_num}".strip(),
                "schedule_number": schedule_num,
            })

    # --- Log summary ---
    print(f"‚úÖ Normalized and extracted {len(texts)} chunks.")
    avg_len = sum(len(t) for t in texts) / max(len(texts), 1)
    print(f"üìè Average chunk length: {avg_len:.1f} characters")

    return texts, metadatas


In [3]:
def build_and_persist_chroma(label: str, model_name: str, texts: List[str], metadatas: List[Dict[str, Any]]):
    """Embed ``texts`` with one model and persist them as a Chroma store.

    The store is written to ``VECTORSTORE_DIR / "<label>_store"``, where the
    label is lower-cased and ``-`` is replaced by ``_``. ``VECTORSTORE_DIR`` is
    read when the function is called, not when it is defined.

    Args:
        label: Display name of the model; also used for the store directory.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        texts: Text units to embed. Blank entries are dropped.
        metadatas: Metadata dicts; ``metadatas[i]`` must describe ``texts[i]``.

    Returns:
        ``True`` when the store was built, ``False`` when the input was
        empty/inconsistent or any error occurred (a message is printed; no
        exception escapes for embedding/store failures).

    NOTE(review): ``Chroma.from_texts`` appears to add to whatever collection
    already exists in ``persist_directory``; re-running this against an
    existing store may duplicate documents — confirm and clear the directory
    first if so.
    """
    print(f"\n=== Building Chroma store for {label} ({model_name}) ===")
    if not texts:
        print(f"‚ö†Ô∏è No texts to embed for {label}. Skipping.")
        return False

    # zip() would silently truncate on a length mismatch and attach the wrong
    # (or no) metadata to some texts, so reject inconsistent input up front.
    if metadatas is None or len(texts) != len(metadatas):
        got = "no" if metadatas is None else len(metadatas)
        print(f"‚ùå Error while building {label} store: {len(texts)} texts but {got} metadatas.")
        return False

    # Filter texts and metadata together, before any expensive work, so an
    # all-blank input never triggers a model download or creates a directory.
    kept = [(text, meta) for text, meta in zip(texts, metadatas) if text.strip()]
    if not kept:
        print("‚ö†Ô∏è No non-empty texts to embed after filtering. Skipping.")
        return False
    final_texts = [text for text, _ in kept]
    final_metas = [meta for _, meta in kept]

    persist_dir = VECTORSTORE_DIR / f"{label.lower().replace('-', '_')}_store"
    persist_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading embedding model (this may download weights on first run)...")
        embedder = HuggingFaceEmbeddings(model_name=model_name)

        # Embedding sanity check on texts that will actually be stored.
        sample = final_texts[:2]
        emb_sample = embedder.embed_documents(sample)
        if not emb_sample or len(emb_sample) != len(sample):
            print("‚ùå Embedding sanity check failed.")
            return False

        Chroma.from_texts(
            texts=final_texts,
            embedding=embedder,
            metadatas=final_metas,
            persist_directory=str(persist_dir),
        )
        print(f"‚úÖ Successfully persisted {label} store at: {persist_dir}")
        print(f"   Total documents: {len(final_texts)}")
        return True

    except Exception as e:
        # Top-level boundary for one model: report and let the caller move on
        # to the next model instead of aborting the whole pipeline.
        print(f"‚ùå Error while building {label} store: {e}")
        return False


# -----------------------
# PIPELINE ENTRYPOINT
# -----------------------
def main():
    """Drive ingestion end to end and report which model stores were built.

    Steps: check that ``JSON_DIR`` exists, load its JSON files, extract text
    units, then build one Chroma store per entry of ``EMBED_MODELS``. Each
    precondition failure prints its reason and returns without building
    anything.
    """
    print("üöÄ Starting full ingestion + embedding pipeline")
    print(f"üìÇ JSON Directory: {JSON_DIR}")
    print(f"üíæ Vector Store Directory: {VECTORSTORE_DIR}")

    if not JSON_DIR.exists():
        print(f"‚ùå JSON directory does not exist: {JSON_DIR}")
        return

    raw_docs = load_all_jsons(JSON_DIR)
    if not raw_docs:
        print("‚ùå No JSON documents loaded. Please check JSON_DIR path and files.")
        return

    texts, metadatas = normalize_and_extract(raw_docs)
    if not texts:
        print("‚ùå No texts extracted from documents. Please check the JSON structure.")
        return

    # Build the stores in declaration order, remembering each outcome.
    outcomes = {
        model_label: build_and_persist_chroma(model_label, model_id, texts, metadatas)
        for model_label, model_id in EMBED_MODELS.items()
    }

    print("\nüîö Pipeline finished. Summary:")
    for model_label, built in outcomes.items():
        if built:
            status = "‚úÖ OK"
        else:
            status = "‚ùå Skipped/Failed"
        print(f" - {model_label}: {status}")


if __name__ == "__main__":
    main()

üöÄ Starting full ingestion + embedding pipeline
üìÇ JSON Directory: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed
üíæ Vector Store Directory: D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\chroma_db2
Found 10 JSON files in D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Bank and Financial Institution Act 2073.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Banking Offence and Punishment Act 2064.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Constitution of Nepal 2072.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\Electronic Commerce Act 2081.json
 - D:\Fusemachine\MyPocketLawyer-AI-Powered-Legal-Aid-Assistant-2\data\processed\International Financial Transactions Act 2054.json
 - D:\Fusemachine\MyPocketLawyer-AI