**Objective:** 

Implement a Retrieval-Augmented Generation (RAG) pipeline. The goal is to build an efficient system that can answer queries related to Indian laws by combining effective retrieval techniques with LLMs. You are encouraged to explore and implement retrieval optimization methods to enhance the system's performance in retrieving relevant legal content and ensuring accurate and contextually grounded responses.

1. Storing documents in a vector database (ChromaDB)
2. Retrieving relevant documents from the database
3. Implementing retrieval-augmented answering with LangChain

In [2]:
%%capture
!pip install chromadb sentence-transformers datasets langchain langchain-community langchain-core
# pip install datasets           # For Hugging Face datasets
# pip install sentence-transformers  # For embeddings
# pip install chromadb           # Vector database
# pip install langchain          # Orchestration & RAG
# pip install openai             # For ChatOpenAI / OpenAI API


In [3]:
import chromadb
import os
from sentence_transformers import SentenceTransformer

2025-10-18 15:36:28.829075: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760801789.010406      57 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760801789.066615      57 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Central, in-notebook configuration for the RAG pipeline. It mirrors the
# layout of an external config.json so the same keys can be loaded from a
# file later.
#
# SECURITY: the OpenAI API key must never be written into the notebook.
# It is read from the OPENAI_API_KEY environment variable (empty string when
# unset). Any key that was previously committed here should be treated as
# compromised and revoked.
configurator = {
  "dataset": {
    "name": "mratanusarkar/Indian-Laws",
    "split": "train"
  },
  "embedding": {
    "model_name": "sentence-transformers/all-MiniLM-L6-v2"
  },
  "chroma": {
    "collection_name": "indian_laws_collection",
    "persist_directory": "./chroma_store",
    "batch_size": 5000
  },
  "llm": {
    "model_name": "gpt-4o-mini-2024-07-18",
    "temperature": 0,
    "api_key": os.environ.get("OPENAI_API_KEY", "")
  },
  "retriever": {
    "top_k": 3
  },
  "logging": {
    "level": "INFO",
    "format": "%(asctime)s - %(levelname)s - %(message)s"
  }
}



In [5]:
"""
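Load and expose configuration variables (currently taken from the in-notebook
`configurator` dict; `load_config` is kept for loading the same keys from config.json)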
"""

import json
import os
import logging

# Path to config.json
# CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config.json")

def load_config(config_path=None):
    """Load a JSON configuration file and return its parsed content.

    Args:
        config_path: Path to the JSON file. When omitted, the module-level
            ``CONFIG_PATH`` is used if it is defined; otherwise the file
            ``config.json`` in the current working directory is tried.

    Returns:
        The object produced by ``json.load`` for the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not valid JSON.
    """
    if config_path is None:
        # CONFIG_PATH is commented out in this notebook, so look it up
        # defensively instead of failing with a NameError.
        config_path = globals().get("CONFIG_PATH", "config.json")

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Configuration file not found at {config_path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format in config file: {str(e)}") from e

# Active configuration: the in-notebook dict stands in for config.json.
config = configurator

# --- Dataset -----------------------------------------------------------
DATASET_NAME, DATASET_SPLIT = (
    config["dataset"][key] for key in ("name", "split")
)

# --- Embedding model ---------------------------------------------------
EMBEDDING_MODEL_NAME = config["embedding"]["model_name"]

# --- Chroma settings (batch size is optional, defaults to 5000) ---------
CHROMA_COLLECTION_NAME, CHROMA_PERSIST_DIR = (
    config["chroma"][key] for key in ("collection_name", "persist_directory")
)
CHROMA_BATCH_SIZE = config["chroma"].get("batch_size", 5000)

# --- LLM settings ------------------------------------------------------
LLM_MODEL_NAME, LLM_TEMPERATURE, OPENAI_API_KEY = (
    config["llm"][key] for key in ("model_name", "temperature", "api_key")
)

# --- Retriever settings (top_k is optional, defaults to 3) -------------
TOP_K = config["retriever"].get("top_k", 3)

# --- Logging setup -----------------------------------------------------
# An unrecognised level name falls back to logging.INFO.
logging.basicConfig(
    format=config["logging"]["format"],
    level=getattr(logging, config["logging"]["level"], logging.INFO),
)


In [7]:
import kaggle_secrets
import os
import wandb

# Read the OpenAI key from Kaggle Secrets, keep it in the module-level
# OPENAI_API_KEY and mirror it into the process environment (targets are
# assigned left to right: the variable first, then os.environ).
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] = (
    kaggle_secrets.UserSecretsClient().get_secret("OPENAI_API_KEY")
)

In [None]:
# Disabled cleanup command: uncommenting it would delete everything under
# /kaggle/working.
# !rm -rf /kaggle/working/*
# Show what is currently in the Kaggle working directory.
os.listdir("/kaggle/working")

In [21]:
"""
Config-driven RAG Pipeline with Dataset Ingestion
- Loads Hugging Face dataset
- Stores embeddings in ChromaDB
- Fully modular LangChain RAG
"""

import logging
from typing import List
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

# from config import (
#     DATASET_NAME, DATASET_SPLIT, EMBEDDING_MODEL_NAME,
#     CHROMA_COLLECTION_NAME, CHROMA_PERSIST_DIR, CHROMA_BATCH_SIZE,
#     LLM_MODEL_NAME, LLM_TEMPERATURE, OPENAI_API_KEY, TOP_K
# )

# ------------------------------
# Logging
# ------------------------------
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers, so this call only takes effect if logging has not been configured
# earlier in the session (e.g. by the configuration cell).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ------------------------------
# Dataset loading & preprocessing
# ------------------------------
def load_dataset_texts(dataset_name: str = DATASET_NAME, split: str = DATASET_SPLIT):
    """
    Load a Hugging Face dataset and flatten each row into one text document.

    Rows whose "law" field is missing or empty are skipped. Every kept row
    becomes a text of the form "Act: ...\\nSection: ...\\nLaw: ..." together with
    a metadata dict holding the act title and section.

    Args:
        dataset_name (str): Name passed to ``load_dataset``.
        split (str): Dataset split passed to ``load_dataset``.

    Returns:
        texts (List[str]): Combined text documents
        metadatas (List[dict]): Metadata dictionaries, aligned with ``texts``

    Raises:
        Exception: Any error raised while loading or iterating the dataset is
            logged and re-raised unchanged.
    """
    try:
        texts, metadatas = [], []

        for record in load_dataset(dataset_name, split=split):
            law = record.get("law")
            if not law:
                # Nothing to index for rows without law text.
                continue

            act, section = record["act_title"], record["section"]
            texts.append(f"Act: {act}\nSection: {section}\nLaw: {law}")
            metadatas.append({"act_title": act, "section": section})

        logging.info(
            "Loaded %d documents from dataset '%s' (split=%s)",
            len(texts), dataset_name, split,
        )
        return texts, metadatas

    except Exception as err:
        logging.error("Failed to load dataset '%s': %s", dataset_name, str(err))
        raise


# ------------------------------
# ChromaDB ingestion
# ------------------------------
import uuid

def store_embeddings_in_chroma(texts: List[str], metadatas: List[dict]):
    """
    Embed documents and store them in a persistent ChromaDB collection in batches.

    Each document is stored with a freshly generated random UUID, its
    embedding and its metadata. Because the IDs are random, calling this
    function twice with the same texts stores the documents twice.

    Args:
        texts (List[str]): List of combined text documents
        metadatas (List[dict]): List of metadata dictionaries corresponding to each document

    Raises:
        ValueError: If ``texts`` and ``metadatas`` differ in length.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if len(texts) != len(metadatas):
            raise ValueError("Length of texts and metadatas must be the same.")

        # Initialize embedding model
        embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

        # Open the collection, creating it on first use. This replaces a bare
        # `except:` around get_collection, which also swallowed unrelated
        # errors (and KeyboardInterrupt) before attempting a create.
        client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
        collection = client.get_or_create_collection(CHROMA_COLLECTION_NAME)

        logging.info(f"Storing embeddings in ChromaDB (collection={CHROMA_COLLECTION_NAME})...")

        # Store in batches of CHROMA_BATCH_SIZE documents
        for i in range(0, len(texts), CHROMA_BATCH_SIZE):
            batch_texts = texts[i:i+CHROMA_BATCH_SIZE]
            batch_metadatas = metadatas[i:i+CHROMA_BATCH_SIZE]

            # Generate embeddings for the batch
            embeddings = embed_model.encode(batch_texts, show_progress_bar=True)

            # Generate unique IDs for each document
            ids = [str(uuid.uuid4()) for _ in batch_texts]

            # Add documents, embeddings, and metadata to Chroma
            collection.add(
                ids=ids,
                documents=batch_texts,
                embeddings=embeddings,
                metadatas=batch_metadatas
            )

            logging.info(f"Stored batch {i//CHROMA_BATCH_SIZE + 1} ({len(batch_texts)} docs)")

        logging.info("All embeddings and metadata stored successfully.")

    except Exception as e:
        logging.error(f"Failed to store embeddings: {str(e)}")
        raise

"""
Enhanced LangChain RAG pipeline (Step 2: LLM Interaction Enhancements)

Features added:
- System prompt enrichment (explicit citation rules, conservative behavior)
- Inline citation guidance (ask model to use [1], [2] style referring to retrieved contexts)
- Automatic answer evaluation: use the LLM to score faithfulness (0-100) + short rationale
- Show numbered retrieved contexts so citations map to sources

Prereqs:
- config.py must expose: OPENAI_API_KEY, LLM_MODEL_NAME, LLM_TEMPERATURE, TOP_K,
  CHROMA_COLLECTION_NAME, CHROMA_PERSIST_DIR, EMBEDDING_MODEL_NAME
- Chroma must be populated with documents and metadata
"""

import os
import logging
from typing import List, Tuple, Dict, Any

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# # Import your config variables (assumes config.py in same package)
# from config import (
#     OPENAI_API_KEY,
#     LLM_MODEL_NAME,
#     LLM_TEMPERATURE,
#     TOP_K,
#     CHROMA_COLLECTION_NAME,
#     CHROMA_PERSIST_DIR,
#     EMBEDDING_MODEL_NAME,
# )

# Basic logging
# NOTE: this repeats an earlier logging.basicConfig call; basicConfig is a
# no-op once the root logger has handlers, so this line only matters when
# logging has not been configured yet.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# ------------------------------
# Build RAG chain with enriched system prompt
# ------------------------------
def build_rag_chain(top_k: int = TOP_K) -> RetrievalQA:
    """
    Assemble the RetrievalQA chain: Chroma retriever + ChatOpenAI + custom prompt.

    The system prompt instructs the model to:
      - answer ONLY from the supplied context,
      - cite retrieved chunks inline as [1], [2], ...,
      - reply "I don't know based on the available data." when the context
        does not support an answer,
      - keep the answer to 2-4 sentences followed by a one-line "Sources:" list.

    Args:
        top_k (int): Number of documents the retriever returns per query.

    Returns:
        RetrievalQA: A "stuff" chain that also returns its source documents.
    """
    # Retriever over the persisted Chroma collection, embedding queries with
    # the configured sentence-transformers model.
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    retriever = Chroma(
        collection_name=CHROMA_COLLECTION_NAME,
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=embeddings,
    ).as_retriever(search_kwargs={"k": top_k})

    # Export the key as well as passing it explicitly below; some LangChain
    # versions validate the environment variable.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    llm_settings = {
        "model_name": LLM_MODEL_NAME,
        "temperature": LLM_TEMPERATURE,
        "openai_api_key": OPENAI_API_KEY,
    }
    chat_model = ChatOpenAI(**llm_settings)

    # System message: role statement followed by the numbered behaviour rules.
    system_text = "".join((
        "You are a precise legal assistant whose job is to answer user legal questions using only the provided context. ",
        "BEHAVIOR RULES:\n",
        "1) Use ONLY the information present in the 'Context' to produce answers. Do not invent facts.\n",
        "2) If you assert any fact that appears in the context, append an inline citation to the corresponding chunk using [1], [2], ... where numbers match the order of retrieved contexts.\n",
        "   Example: 'Under Section 420, cheating is punishable [2].'\n",
        "3) If the information needed is NOT present in the context, respond exactly: \"I don't know based on the available data.\" and avoid speculation.\n",
        "4) Provide a concise answer (2-4 sentences). After the answer, add a short 'Sources:' line listing used source numbers and, if available, metadata (e.g., Act, Section).\n",
        "5) If multiple sources conflict, state that there are conflicting sources and list them.\n",
        "6) Maintain neutral, formal tone appropriate for legal information. This is NOT legal advice.",
    ))

    # Human message: must expose both {context} and {question} to the chain.
    human_text = "Context:\n{context}\n\nQuestion:\n{question}\n\nRespond with citations and a Sources: line."

    message_templates = [
        SystemMessagePromptTemplate.from_template(system_text),
        HumanMessagePromptTemplate.from_template(human_text),
    ]
    chat_prompt = ChatPromptTemplate.from_messages(message_templates)

    # "stuff" chain: all retrieved documents are placed into one prompt.
    chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": chat_prompt},
        return_source_documents=True,
    )

    logging.info("RAG pipeline built with enriched system prompt.")
    return chain

# ------------------------------
# Utility to safely extract text from ChatOpenAI response
# ------------------------------
def extract_llm_output(resp: Any) -> str:
    """
    Safely extract the text from a chat-model response.

    Supported shapes, checked in this order:
      - a non-empty list whose first item has a ``content`` attribute
        (the first item's content is returned),
      - any object with a ``content`` attribute,
      - a dict with a ``"text"`` key,
      - a dict with ``"generations"`` shaped like
        ``[{"message": {"content": ...}}, ...]``.

    Anything else, including an empty list or a malformed ``generations``
    payload, falls back to ``str(resp)`` instead of raising.

    Args:
        resp: The raw response object.

    Returns:
        str: The extracted text, or ``str(resp)`` when no known shape matches.
    """
    # `resp and ...` guards against IndexError on an empty list.
    if isinstance(resp, list) and resp and hasattr(resp[0], "content"):
        return resp[0].content
    if hasattr(resp, "content"):
        return resp.content
    if isinstance(resp, dict):
        if "text" in resp:
            return resp["text"]
        if "generations" in resp:
            try:
                return resp["generations"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError):
                # Unexpected nesting: use the generic fallback below.
                pass
    return str(resp)

# ------------------------------
# Answer evaluation helper
# ------------------------------
def _parse_evaluation_json(raw: str) -> Dict[str, Any]:
    """Parse the evaluator's reply into ``{"score", "verdict", "comment"}``.

    The reply is expected to contain a JSON object, but chat models often wrap
    it in Markdown code fences or surrounding prose. The text between the
    first ``{`` and the last ``}`` is therefore parsed when present. If no
    usable object is found, the result is score 0, verdict "unknown", and the
    first 300 characters of the reply as the comment. The score is clamped to
    the 0-100 range.
    """
    import json

    text = raw.strip()
    fallback = {
        "score": 0,
        "verdict": "unknown",
        "comment": text.replace("\n", " ")[:300],
    }

    start, end = text.find("{"), text.rfind("}")
    candidate = text[start:end + 1] if 0 <= start < end else text

    try:
        parsed = json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return fallback
    if not isinstance(parsed, dict):
        return fallback

    try:
        score = int(parsed.get("score", 0))
    except (TypeError, ValueError, OverflowError):
        return fallback

    return {
        "score": max(0, min(100, score)),
        "verdict": parsed.get("verdict", "unknown"),
        "comment": parsed.get("comment", ""),
    }


def evaluate_answer(answer: str, sources: List[Any], question: str) -> Dict[str, Any]:
    """
    Use the LLM to score the answer's faithfulness to the provided sources.

    Args:
        answer (str): The answer produced by the RAG chain.
        sources (List[Any]): Retrieved documents; each is expected to expose
            ``page_content`` and, optionally, ``metadata``.
        question (str): The user's original question.

    Returns a dict:
        {
          "score": int (0-100),
          "verdict": "faithful"|"not_faithful"|"partial" (or "unknown" when
                     the evaluator's reply could not be parsed),
          "comment": "short explanation"
        }

    Note: This will make an extra API call. Keep responses short to reduce cost.
    """
    # Numbered source blocks so the evaluator can compare against the same
    # [1], [2], ... indices; each preview is capped at 1000 characters.
    numbered_sources = []
    for i, src in enumerate(sources, start=1):
        metadata = src.metadata if hasattr(src, "metadata") else {}
        preview = src.page_content[:1000].replace("\n", " ")
        numbered_sources.append(f"[{i}] Meta: {metadata}\n{preview}")

    eval_context = "\n\n".join(numbered_sources)

    # Short evaluation prompt
    eval_system = (
        "You are an evaluator that judges whether a given ANSWER is faithful to the provided SOURCES. "
        "Give a score from 0 to 100 (100 = fully faithful and supported by sources, 0 = not at all supported). "
        "Then provide a one-line verdict: faithful / partial / not_faithful, and a two-sentence rationale."
    )

    eval_user = (
        f"SOURCES:\n{eval_context}\n\nQUESTION:\n{question}\n\nANSWER:\n{answer}\n\n"
        "Please provide a JSON object with keys: score (int 0-100), verdict (string), comment (string)."
    )

    # Separate short-lived model instance at temperature 0 for the evaluation.
    evaluator = ChatOpenAI(model_name=LLM_MODEL_NAME, temperature=0.0, openai_api_key=OPENAI_API_KEY)
    from langchain.schema import SystemMessage, HumanMessage

    messages = [
        SystemMessage(content=eval_system),
        HumanMessage(content=eval_user)
    ]

    # Call the evaluator and normalise whatever response shape comes back.
    resp = evaluator(messages)
    raw = extract_llm_output(resp)

    return _parse_evaluation_json(raw)


# ------------------------------
# Interactive loop with context display and evaluation
# ------------------------------
def interactive_loop_with_evaluation(qa_chain: RetrievalQA, evaluate: bool = True):
    """
    Interactive loop that:
      - runs the QA chain,
      - displays retrieved numbered contexts (so [1],[2] correspond),
      - prints the LLM answer,
      - optionally runs evaluation of the answer's faithfulness.

    The loop ends when the user types 'exit' or 'quit', or when input is
    closed or interrupted (EOF / Ctrl-C). Blank input is ignored so that no
    retrieval or LLM call is spent on an empty query.

    Args:
        qa_chain (RetrievalQA): Chain called with the query text; its result
            is read via the "result" and "source_documents" keys.
        evaluate (bool): When True, each answer is scored with
            ``evaluate_answer`` (one extra LLM call per query).
    """
    print("RAG (enhanced) ready. Type your legal query (or 'exit' to quit):")
    while True:
        try:
            query_text = input(">> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave the loop cleanly.
            print("\nExiting...")
            break

        if query_text.lower() in ["exit", "quit"]:
            print("Exiting...")
            break
        if not query_text:
            # Nothing to ask; prompt again without calling the chain.
            continue

        try:
            # Run chain - it returns answer and source_documents
            result = qa_chain(query_text)
            answer = result.get("result")
            sources = result.get("source_documents", [])

            # Print numbered retrieved contexts so model citations align with these indices
            print("\n===== Retrieved Contexts (numbered) =====\n")
            for i, src in enumerate(sources, start=1):
                preview = src.page_content[:600].replace("\n", " ")
                meta = src.metadata if hasattr(src, "metadata") else {}
                print(f"[{i}] {preview} ...")
                if meta:
                    print(f"    Metadata: {meta}")
                print("-" * 80)

            # Show answer
            print("\n===== Answer =====\n")
            print(answer)

            # Optionally evaluate
            if evaluate:
                eval_result = evaluate_answer(answer, sources, query_text)
                print("\n===== Answer Evaluation =====")
                print(f"Score: {eval_result['score']} / 100")
                print(f"Verdict: {eval_result['verdict']}")
                print(f"Comment: {eval_result['comment']}")
                print("-" * 80)

        except Exception as e:
            # Report the failure for this query and keep the session alive.
            print("Error:", e)
            logging.exception("Error during RAG query.")


# ------------------------------
# Top-level runner
# ------------------------------
def run_rag_pipeline():
    """
    Entry point: build the RAG chain, then start the interactive session with
    answer evaluation switched on.

    Ingestion is not performed here; the Chroma collection is assumed to be
    populated already. To ingest first, run:
        texts, metadatas = load_dataset_texts()
        store_embeddings_in_chroma(texts, metadatas)
    """
    interactive_loop_with_evaluation(build_rag_chain(), evaluate=True)


# Start the interactive session when this code is executed directly
# (__name__ is "__main__"); importing it as a module does not start the loop.
if __name__ == "__main__":
    run_rag_pipeline()

RAG (enhanced) ready. Type your legal query (or 'exit' to quit):


>>   How many years of punishment is for murder attempt.



===== Retrieved Contexts (numbered) =====

[1] Act: Air Force Act, 1950 Section: 67 Law: 67. Attempt.- Any person subject to this Act who attempts to commit any of the offences specified in sections 34 to 66 inclusive, and in such attempt does any act towards the commission of the offence shall, on conviction by court-martial, where no express provision is made by this Act for the punishment of such attempt, be liable, if the offence attempted to be committed is punishable with death, to suffer imprisonment for a term which may extend to fourteen years or such less punishment as is in this Act mentioned; and if the offence attempted to be  ...
    Metadata: {'act_title': 'Air Force Act, 1950', 'section': '67'}
--------------------------------------------------------------------------------
[2] Act: Army Act, 1950 Section: 65 Law: 65. Attempt.- Any person subject to this Act who attempts to commit any of the offences specified in sections 34 to 64 inclusive and in such attempt does any

>>  exit


Exiting...


In [None]:
# import os
# print(os.path.abspath(CHROMA_PERSIST_DIR))
# print(os.access(CHROMA_PERSIST_DIR, os.W_OK))


In [None]:
# # !mkdir -p ./chroma_store
# # !chmod 755 ./chroma_store
# >>  How many years of punishment is for murder attempt.

# ===== Answer =====

# The punishment for an attempt to commit murder, which is typically punishable with death, can extend to fourteen years of imprisonment under the Air Force Act, Army Act, and Border Security Force Act.

In [13]:
# ### WORKING VERSION: 
# """
# Config-driven RAG Pipeline with Dataset Ingestion
# - Loads Hugging Face dataset
# - Stores embeddings in ChromaDB
# - Fully modular LangChain RAG
# """

# import logging
# from typing import List
# from datasets import load_dataset
# from sentence_transformers import SentenceTransformer
# import chromadb
# from chromadb.utils import embedding_functions
# from langchain.chains import RetrievalQA
# from langchain.vectorstores import Chroma
# from langchain.embeddings import SentenceTransformerEmbeddings

# # from config import (
# #     DATASET_NAME, DATASET_SPLIT, EMBEDDING_MODEL_NAME,
# #     CHROMA_COLLECTION_NAME, CHROMA_PERSIST_DIR, CHROMA_BATCH_SIZE,
# #     LLM_MODEL_NAME, LLM_TEMPERATURE, OPENAI_API_KEY, TOP_K
# # )

# # ------------------------------
# # Logging
# # ------------------------------
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# # ------------------------------
# # Dataset loading & preprocessing
# # ------------------------------
# def load_dataset_texts(dataset_name: str = DATASET_NAME, split: str = DATASET_SPLIT):
#     """
#     Load Hugging Face dataset and combine act_title, section, and law into one text per row.
#     Returns:
#         texts (List[str]): List of combined text documents
#         metadatas (List[dict]): List of metadata dictionaries per document
#     """
#     try:
#         dataset = load_dataset(dataset_name, split=split)
#         texts = []
#         metadatas = []

#         for row in dataset:
#             if row.get("law"):  # skip empty laws
#                 doc_text = f"Act: {row['act_title']}\nSection: {row['section']}\nLaw: {row['law']}"
#                 texts.append(doc_text)
#                 metadatas.append({"act_title": row["act_title"], "section": row["section"]})

#         logging.info(f"Loaded {len(texts)} documents from dataset '{dataset_name}' (split={split})")
#         return texts, metadatas

#     except Exception as e:
#         logging.error(f"Failed to load dataset '{dataset_name}': {str(e)}")
#         raise


# # ------------------------------
# # ChromaDB ingestion
# # ------------------------------
# import uuid

# def store_embeddings_in_chroma(texts: List[str], metadatas: List[dict]):
#     """
#     Store embeddings in ChromaDB in batches with unique IDs and metadata.
    
#     Args:
#         texts (List[str]): List of combined text documents
#         metadatas (List[dict]): List of metadata dictionaries corresponding to each document
#     """
#     try:
#         if len(texts) != len(metadatas):
#             raise ValueError("Length of texts and metadatas must be the same.")

#         # Initialize embedding model
#         embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        
#         # Connect or create Chroma collection
#         client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
#         try:
#             collection = client.get_collection(CHROMA_COLLECTION_NAME)
#         except:
#             collection = client.create_collection(CHROMA_COLLECTION_NAME)
        
#         logging.info(f"Storing embeddings in ChromaDB (collection={CHROMA_COLLECTION_NAME})...")

#         # Store in batches
#         for i in range(0, len(texts), CHROMA_BATCH_SIZE):
#             batch_texts = texts[i:i+CHROMA_BATCH_SIZE]
#             batch_metadatas = metadatas[i:i+CHROMA_BATCH_SIZE]
            
#             # Generate embeddings for the batch
#             embeddings = embed_model.encode(batch_texts, show_progress_bar=True)

#             # Generate unique IDs for each document
#             ids = [str(uuid.uuid4()) for _ in batch_texts]

#             # Add documents, embeddings, and metadata to Chroma
#             collection.add(
#                 ids=ids,
#                 documents=batch_texts,
#                 embeddings=embeddings,
#                 metadatas=batch_metadatas
#             )

#             logging.info(f"Stored batch {i//CHROMA_BATCH_SIZE + 1} ({len(batch_texts)} docs)")

#         logging.info("All embeddings and metadata stored successfully.")

#     except Exception as e:
#         logging.error(f"Failed to store embeddings: {str(e)}")
#         raise



# from langchain.chat_models import ChatOpenAI

# # ------------------------------
# # LangChain RAG Pipeline using ChatOpenAI
# # ------------------------------
# def build_rag_chain(top_k: int = TOP_K):
#     """
#     Build LangChain RetrievalQA pipeline using ChromaDB with ChatOpenAI.
    
#     Args:
#         top_k (int): Number of documents to retrieve per query
    
#     Returns:
#         RetrievalQA: Configured RAG chain
#     """
#     # Embeddings for vectorstore
#     embed_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
#     vectorstore = Chroma(
#         collection_name=CHROMA_COLLECTION_NAME,
#         persist_directory=CHROMA_PERSIST_DIR,
#         embedding_function=embed_model
#     )
#     retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

#     # Set environment variable to avoid validation errors
#     import os
#     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

#     # Use ChatOpenAI instead of OpenAI
#     llm = ChatOpenAI(
#         model_name=LLM_MODEL_NAME,  # e.g., "gpt-4o-mini"
#         temperature=LLM_TEMPERATURE
#     )

#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=retriever,
#         return_source_documents=True,
#         chain_type="stuff"
#     )
#     logging.info("RAG pipeline ready with ChatOpenAI.")
#     return qa_chain

# # ------------------------------
# # Run interactive query loop
# # ------------------------------
# def interactive_loop(qa_chain: RetrievalQA):
#     """
#     Interactive CLI for querying the RAG pipeline.
#     """
#     print("RAG pipeline ready. Type your query (or 'exit' to quit):")
#     while True:
#         query_text = input(">> ").strip()
#         if query_text.lower() in ["exit", "quit"]:
#             print("Exiting...")
#             break
#         try:
#             result = qa_chain(query_text)
#             answer = result.get("result")
#             sources = result.get("source_documents", [])
#             print("\n===== Answer =====\n")
#             print(answer)
#             # print("\n===== Source Documents =====\n")
#             # for i, src in enumerate(sources, 1):
#             #     print(f"{i}. {src.page_content}")
#             #     print(f"Metadata: {src.metadata}")
#             #     print("-"*60)
#         except Exception as e:
#             print("Error;", e)
#             logging.error(f"Error during RAG query: {str(e)}")

# # ------------------------------
# # Main function
# # ------------------------------
# def run_rag_pipeline():
#     # Load dataset
#     # texts, metadatas = load_dataset_texts()
#     # # Store embeddings
#     # store_embeddings_in_chroma(texts, metadatas)
#     # Build RAG
#     qa_chain = build_rag_chain()
#     # # Interactive loop
#     interactive_loop(qa_chain)

# if __name__ == "__main__":
#     run_rag_pipeline()


In [None]:
# ## WORKING V2: Added a system prompt to enhance the LLM's answers, and it worked
# >>  How many years of punishment is for murder attempt.
# ===== Answer =====
# Based on the provided context, if the offence attempted to be committed is punishable with death (such as murder), the punishment for an attempt is imprisonment for a term which may extend to fourteen years.
# >>  How many years of punishment is for murder attempt. Just provide me precise number
# ===== Answer =====
# Up to fourteen years.
# >>  What is the section code for doing second marriage? And what is the punishment for it.
# ===== Answer =====
# The section code for doing a second marriage while already married under the Special Marriage Act, 1954 is Section 43. The punishment for this offense is that the person shall be deemed to have committed an offense under section 494 or section 495 of the Indian Penal Code, and the marriage solemnized shall be void


# """
# Config-driven RAG Pipeline with Dataset Ingestion
# - Loads Hugging Face dataset
# - Stores embeddings in ChromaDB
# - Fully modular LangChain RAG
# """

# import logging
# from typing import List
# from datasets import load_dataset
# from sentence_transformers import SentenceTransformer
# import chromadb
# from chromadb.utils import embedding_functions
# from langchain.chains import RetrievalQA
# from langchain.vectorstores import Chroma
# from langchain.embeddings import SentenceTransformerEmbeddings

# # from config import (
# #     DATASET_NAME, DATASET_SPLIT, EMBEDDING_MODEL_NAME,
# #     CHROMA_COLLECTION_NAME, CHROMA_PERSIST_DIR, CHROMA_BATCH_SIZE,
# #     LLM_MODEL_NAME, LLM_TEMPERATURE, OPENAI_API_KEY, TOP_K
# # )

# # ------------------------------
# # Logging
# # ------------------------------
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# # ------------------------------
# # Dataset loading & preprocessing
# # ------------------------------
# def load_dataset_texts(dataset_name: str = DATASET_NAME, split: str = DATASET_SPLIT):
#     """
#     Load Hugging Face dataset and combine act_title, section, and law into one text per row.
#     Returns:
#         texts (List[str]): List of combined text documents
#         metadatas (List[dict]): List of metadata dictionaries per document
#     """
#     try:
#         dataset = load_dataset(dataset_name, split=split)
#         texts = []
#         metadatas = []

#         for row in dataset:
#             if row.get("law"):  # skip empty laws
#                 doc_text = f"Act: {row['act_title']}\nSection: {row['section']}\nLaw: {row['law']}"
#                 texts.append(doc_text)
#                 metadatas.append({"act_title": row["act_title"], "section": row["section"]})

#         logging.info(f"Loaded {len(texts)} documents from dataset '{dataset_name}' (split={split})")
#         return texts, metadatas

#     except Exception as e:
#         logging.error(f"Failed to load dataset '{dataset_name}': {str(e)}")
#         raise


# # ------------------------------
# # ChromaDB ingestion
# # ------------------------------
# import uuid

# def store_embeddings_in_chroma(texts: List[str], metadatas: List[dict]):
#     """
#     Store embeddings in ChromaDB in batches with unique IDs and metadata.
    
#     Args:
#         texts (List[str]): List of combined text documents
#         metadatas (List[dict]): List of metadata dictionaries corresponding to each document
#     """
#     try:
#         if len(texts) != len(metadatas):
#             raise ValueError("Length of texts and metadatas must be the same.")

#         # Initialize embedding model
#         embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        
#         # Connect or create Chroma collection
#         client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
#         try:
#             collection = client.get_collection(CHROMA_COLLECTION_NAME)
#         except:
#             collection = client.create_collection(CHROMA_COLLECTION_NAME)
        
#         logging.info(f"Storing embeddings in ChromaDB (collection={CHROMA_COLLECTION_NAME})...")

#         # Store in batches
#         for i in range(0, len(texts), CHROMA_BATCH_SIZE):
#             batch_texts = texts[i:i+CHROMA_BATCH_SIZE]
#             batch_metadatas = metadatas[i:i+CHROMA_BATCH_SIZE]
            
#             # Generate embeddings for the batch
#             embeddings = embed_model.encode(batch_texts, show_progress_bar=True)

#             # Generate unique IDs for each document
#             ids = [str(uuid.uuid4()) for _ in batch_texts]

#             # Add documents, embeddings, and metadata to Chroma
#             collection.add(
#                 ids=ids,
#                 documents=batch_texts,
#                 embeddings=embeddings,
#                 metadatas=batch_metadatas
#             )

#             logging.info(f"Stored batch {i//CHROMA_BATCH_SIZE + 1} ({len(batch_texts)} docs)")

#         logging.info("All embeddings and metadata stored successfully.")

#     except Exception as e:
#         logging.error(f"Failed to store embeddings: {str(e)}")
#         raise



# from langchain.chat_models import ChatOpenAI

# # ------------------------------
# # Imports for the LangChain RAG pipeline (ChatOpenAI)
# # ------------------------------
# import os
# import logging
# from langchain.chat_models import ChatOpenAI
# from langchain.chains import RetrievalQA
# from langchain.vectorstores import Chroma
# from langchain.embeddings import SentenceTransformerEmbeddings
# from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# # ------------------------------
# # LangChain RAG Pipeline using ChatOpenAI
# # ------------------------------
# def build_rag_chain(top_k: int = TOP_K):
#     """
#     Build LangChain RetrievalQA pipeline using ChromaDB with ChatOpenAI and system prompt.
    
#     Args:
#         top_k (int): Number of documents to retrieve per query
    
#     Returns:
#         RetrievalQA: Configured RAG chain
#     """
#     # Embeddings for vectorstore
#     embed_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
#     vectorstore = Chroma(
#         collection_name=CHROMA_COLLECTION_NAME,
#         persist_directory=CHROMA_PERSIST_DIR,
#         embedding_function=embed_model
#     )
#     retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

#     # Ensure API key is available
#     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

#     # LLM model
#     llm = ChatOpenAI(
#         model_name=LLM_MODEL_NAME,  # e.g., "gpt-4o-mini"
#         temperature=LLM_TEMPERATURE
#     )

#     # ------------------------------
#     # Add System Prompt for robust RAG behavior
#     # ------------------------------
#     system_prompt = (
#         "You are an intelligent assistant specialized in retrieving and reasoning "
#         "over factual information from the provided context. "
#         "Use only the retrieved documents (context) to answer accurately. "
#         "If the answer is not found in the context, clearly say "
#         "'I don't know based on the available data.' "
#         "Always provide concise, factual, and context-grounded answers."
#     )

#     prompt = ChatPromptTemplate.from_messages([
#         SystemMessagePromptTemplate.from_template(system_prompt),
#         HumanMessagePromptTemplate.from_template(
#             "Context:\n{context}\n\nQuestion:\n{question}"
#         )
#     ])

#     # Build RAG Chain
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=retriever,
#         return_source_documents=True,
#         chain_type="stuff",
#         chain_type_kwargs={"prompt": prompt}
#     )

#     logging.info("RAG pipeline ready with ChatOpenAI.")
#     return qa_chain


# # ------------------------------
# # Run interactive query loop
# # ------------------------------
# def interactive_loop(qa_chain: RetrievalQA):
#     """
#     Interactive CLI for querying the RAG pipeline.
#     """
#     print("RAG pipeline ready. Type your query (or 'exit' to quit):")
#     while True:
#         query_text = input(">> ").strip()
#         if query_text.lower() in ["exit", "quit"]:
#             print("Exiting...")
#             break
#         try:
#             result = qa_chain(query_text)
#             answer = result.get("result")
#             sources = result.get("source_documents", [])
            
#             # 🔍 Show retrieved context chunks before answer
#             print("\n===== Retrieved Context =====\n")
#             for i, src in enumerate(sources, 1):
#                 preview = src.page_content[:400].replace("\n", " ")
#                 print(f"{i}. {preview}...")
#                 if src.metadata:
#                     print(f"   Metadata: {src.metadata}")
#                 print("-" * 60)
                
#             print("\n===== Answer =====\n")
#             print(answer)
#             # print("\n===== Source Documents =====\n")
#             # for i, src in enumerate(sources, 1):
#             #     print(f"{i}. {src.page_content}")
#             #     print(f"Metadata: {src.metadata}")
#             #     print("-"*60)
#         except Exception as e:
#             print("Error:", e)
#             logging.error(f"Error during RAG query: {str(e)}")

# # ------------------------------
# # Main function
# # ------------------------------
# def run_rag_pipeline():
#     # Load dataset
#     # texts, metadatas = load_dataset_texts()
#     # # Store embeddings
#     # store_embeddings_in_chroma(texts, metadatas)
#     # Build RAG
#     qa_chain = build_rag_chain()
#     # # Interactive loop
#     interactive_loop(qa_chain)

# if __name__ == "__main__":
#     run_rag_pipeline()


