In [None]:
# steps to install llama-cpp-python on Windows with CUDA support
# 1. Install Visual Studio 2022 with "Desktop development with C++" workload
# 2. Install CUDA Toolkit from NVIDIA's website if not already installed
# 3. Install latest version of cmake
# 4. Open x64 Native Tools Command Prompt for VS 2022
# 5. Run the following command:
    # pip install llama-index
    # pip install llama-index-llms-llama-cpp
    # pip install sentence-transformers
    # pip install chromadb
    # pip install faiss-cpu
    # pip install pypdf
    # pip install docx2txt
    # pip install python-pptx
    # set FORCE_CMAKE=1
    # set CMAKE_ARGS=-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=89
    # pip install llama-cpp-python --force-reinstall --no-cache-dir --verbose --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125
# 6. Verify installation by running python shell and executing:
    # from llama_cpp import Llama
    # llm = Llama(model_path=r"C:\Users\moidhassan\Downloads\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", n_gpu_layers=-1)
# 7. If you encounter issues, refer to the GitHub repository: https://github.com/abetlen/llama-cpp-python
# Note: Adjust the model_path to point to your downloaded GGUF model file.
# 8. For CUDA support, ensure your GPU is compatible and the correct CUDA version is installed.
# 9. Test with a simple script to ensure everything is working fine.
# Note: Some installations may require restarting the terminal or IDE to recognize new environment variables.


In [3]:
import platform
print(platform.architecture())

('64bit', 'WindowsPE')


In [8]:
# %% [markdown]
# ## 1. Imports & Configuration
# All necessary libraries and configuration constants.

# %%
import sys
import time
from pathlib import Path
import chromadb
from textwrap import dedent
from sentence_transformers import SentenceTransformer
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.llama_cpp import LlamaCPP

In [13]:
# --- Configuration Constants ---

# 1. Paths
# Adjust these paths to match your folder structure.
CV_PATH = "../data/CV"
FINANCIAL_PATH = "../data/financial"
SPECS_PATH = "../data/specs"
REIMBURSEMENT_PATH = "../data/reimbursement"
DATA_PATHS_MAP = {
    "CV": CV_PATH,
    "FINANCIAL": FINANCIAL_PATH,
    "SPECS": SPECS_PATH,
    "REIMBURSEMENT": REIMBURSEMENT_PATH
}
MODEL_PATH = "./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "local_rag_demo"

# 2. Embedding Model
# "all-MiniLM-L6-v2" is small (~90MB), fast, and high-quality.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"

# 3. Chunking Parameters
# Using a semantic splitter to keep sentences intact.
CHUNK_SIZE = 768      # Target size of each chunk (in characters)
CHUNK_OVERLAP = 128     # Overlap between chunks to maintain context

# 4. LLM Parameters
# These are settings for the Mistral 7B model.
LLM_N_CTX = 8192*2       # Context window size (Mistral has a large one)
LLM_N_GPU_LAYERS = 35  # Layers to offload to GPU (adjust based on your VRAM)
LLM_TEMPERATURE = 0.2  # Low temperature for factual, less "creative" answers
LLM_MAX_NEW_TOKENS = 768 # Max tokens to generate in an answer

# 5. Retrieval Parameters
TOP_K_RESULTS = 5      # Number of context chunks to retrieve

# 6. Helper Function
def log(message):
    """Helper function for formatted logging."""
    print(f"[INFO] {time.strftime('%Y-%m-%d %H:%M:%S')} - {message}")

log("Configuration loaded.")

[INFO] 2025-10-29 02:27:54 - Configuration loaded.


In [29]:
# %% [markdown]
# ### Step 4.1: Load Documents
# **Goal:** Load documents from each directory path separately and group them. This allows us to create a dedicated index for "CVs", "Financials", etc. (Multi-Index RAG).

# %%
def load_documents(data_paths_map):
    """
    Loads documents from multiple directories and groups them by source name.
    
    Args:
        data_paths_map (dict): A dictionary mapping index names (e.g., "CV") 
                               to their directory paths.
    
    Returns:
        dict: A dictionary where keys are the index names and values are 
              lists of LlamaIndex Document objects.
    """
    log("Starting to load documents for multiple indexes...")
    grouped_documents = {}
    
    # Iterate through the map: (Index Name) -> (Directory Path)
    for name, path in data_paths_map.items():
        if not Path(path).exists():
            log(f"Warning: Path for '{name}' does not exist, skipping: {path}")
            continue
        
        # SimpleDirectoryReader reads all files in the directory
        reader = SimpleDirectoryReader(path, recursive=True)
        try:
            documents = reader.load_data()
            grouped_documents[name] = documents
            log(f"Successfully loaded {len(documents)} documents for index: '{name}'")
        except Exception as e:
            log(f"Error loading data from {path} for '{name}': {e}")
            
    total_docs = sum(len(docs) for docs in grouped_documents.values())
    log(f"‚úÖ Total documents loaded for {len(grouped_documents)} indexes: {total_docs}")
    
    if not grouped_documents:
        log("Error: No documents were loaded. Please check your DATA_PATHS_MAP.")
        
    return grouped_documents

In [15]:
# %%
# --- Run Step 4.1 (Modified for Multi-Index) ---
# This will scan your directories and populate the 'grouped_documents' dictionary.
grouped_documents = load_documents(DATA_PATHS_MAP)



[INFO] 2025-10-29 02:30:13 - Starting to load documents for multiple indexes...




[INFO] 2025-10-29 02:30:22 - Successfully loaded 51 documents for index: 'CV'




[INFO] 2025-10-29 02:31:04 - Successfully loaded 98 documents for index: 'FINANCIAL'




[INFO] 2025-10-29 02:32:03 - Successfully loaded 843 documents for index: 'SPECS'
[INFO] 2025-10-29 02:32:03 - Successfully loaded 15 documents for index: 'REIMBURSEMENT'
[INFO] 2025-10-29 02:32:03 - ‚úÖ Total documents loaded for 4 indexes: 1007


In [16]:
grouped_documents.keys()

dict_keys(['CV', 'FINANCIAL', 'SPECS', 'REIMBURSEMENT'])

In [17]:
# Display a snippet from one of the first documents loaded

for index_name, docs in grouped_documents.items():
    if docs:
        print(f"First document content snippet from index '{index_name}': {docs[0].text[:150]}...")
        print()
        

First document content snippet from index 'CV': ABHISHEK  RANJAN                  +91  9040140733   
 Data  Scientist  |  AI  Engineer                     13eee079@gmail.com  
 
ABOUT  ME  
Data  Sc...

First document content snippet from index 'FINANCIAL': ...

First document content snippet from index 'SPECS': Surface USB-C¬Æ Travel Hub
All the connections, wherever you are
Pitch
Turn your laptop into an on-the-go productivity companion with 
this elegant, mu...

First document content snippet from index 'REIMBURSEMENT': moid hassan
RD17471298637220809
Ramaswamy
KA04AA9727
Car
May 13th 2025, 3:25 PM
Booking History
Customer Name
Ride ID
Driver name
Vehicle Number
Mode ...



In [18]:
# %% [markdown]
# ### Step 4.2: Load Embedding Model
# We define the function to load the `sentence-transformer` model and then call it.

# %%
def load_embedding_model(model_name):
    """
    Loads the SentenceTransformer embedding model.
    
    Args:
        model_name (str): The name of the model from Hugging Face.
    
    Returns:
        SentenceTransformer: The loaded embedding model.
    """
    log(f"Loading embedding model: {model_name}...")
    # The first time you run this, it will download the model.
    embed_model = SentenceTransformer(model_name)
    
    # Example: Test the model
    sample_vec = embed_model.encode("This is a test sentence.")
    log(f"‚úÖ Embedding model loaded. Vector size: {len(sample_vec)}")
    return embed_model

In [19]:
# %%
# --- Run Step 4.2 ---
# This will download the model if you don't have it cached.
print(f"Loading embedding model: {EMBED_MODEL_NAME}...")
embed_model = load_embedding_model(EMBED_MODEL_NAME)

2025-10-29 02:33:29,590 - INFO - Use pytorch device_name: cpu
2025-10-29 02:33:29,590 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Loading embedding model: all-MiniLM-L6-v2...
[INFO] 2025-10-29 02:33:29 - Loading embedding model: all-MiniLM-L6-v2...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.04it/s]

[INFO] 2025-10-29 02:33:34 - ‚úÖ Embedding model loaded. Vector size: 384





In [20]:
# %% [markdown]
# ### Step 4.3: Chunk and Embed Documents
# **Goal:** Convert raw documents into semantically meaningful chunks and generate high-quality embedding vectors for each chunk using the loaded model.

# %%
def chunk_and_embed(documents, embed_model):
    """
    Chunks documents and generates embeddings for each chunk.
    
    Args:
        documents (list): List of LlamaIndex Document objects for a single index group.
        embed_model (SentenceTransformer): The embedding model.
    
    Returns:
        dict: A dictionary containing the generated IDs, chunk text, and embeddings 
              for the input documents.
    """
    log("Starting document chunking and embedding...")
    
    if not documents:
        log("No documents to chunk. Skipping...")
        return {'ids': [], 'chunks': [], 'embeddings': []}

    # This is the correct, semantic way to chunk text.
    # It splits on sentences and respects word boundaries.
    text_splitter = SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    
    # LlamaIndex's SentenceSplitter returns "Node" objects.
    nodes = text_splitter.get_nodes_from_documents(documents)
    
    log(f"Split {len(documents)} source documents into {len(nodes)} chunks.")
    
    log("Generating embeddings for all chunks...")
    # 1. Get all the text content first
    all_chunk_text = [node.get_content() for node in nodes]
    
    # 2. Generate embeddings in a single, efficient batch
    # This is *much* faster than embedding chunks one by one.
    embeddings = embed_model.encode(all_chunk_text, show_progress_bar=True)
    
    # 3. Create the IDs list
    ids = [f"chunk_{i}" for i in range(len(nodes))]
    
    log(f"‚úÖ Generated {len(embeddings)} embeddings.")
    return {
        'ids': ids, 
        'chunks': all_chunk_text, 
        'embeddings': embeddings.tolist()
    }

In [21]:
# %%
# --- Run Step 4.3 (Modified for Multi-Index) ---
# This will chunk and embed the documents for all groups.
# The results are stored in a dictionary matching the group names.
embedded_groups = {}

for name, docs in grouped_documents.items():
    log(f"Processing group: {name} ({len(docs)} documents)")
    
    # We call the function and get the processed data back
    processed_data = chunk_and_embed(docs, embed_model)
    
    embedded_groups[name] = processed_data

log(f"‚úÖ Completed chunking and embedding for {len(embedded_groups)} groups.")
# You can now inspect 'embedded_groups' to see IDs, chunks, and embeddings for each index!

[INFO] 2025-10-29 02:33:54 - Processing group: CV (51 documents)
[INFO] 2025-10-29 02:33:54 - Starting document chunking and embedding...
[INFO] 2025-10-29 02:33:59 - Split 51 source documents into 80 chunks.
[INFO] 2025-10-29 02:33:59 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:01<00:00,  1.87it/s]


[INFO] 2025-10-29 02:34:01 - ‚úÖ Generated 80 embeddings.
[INFO] 2025-10-29 02:34:01 - Processing group: FINANCIAL (98 documents)
[INFO] 2025-10-29 02:34:01 - Starting document chunking and embedding...
[INFO] 2025-10-29 02:34:01 - Split 98 source documents into 121 chunks.
[INFO] 2025-10-29 02:34:01 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:01<00:00,  2.00it/s]


[INFO] 2025-10-29 02:34:03 - ‚úÖ Generated 121 embeddings.
[INFO] 2025-10-29 02:34:03 - Processing group: SPECS (843 documents)
[INFO] 2025-10-29 02:34:03 - Starting document chunking and embedding...
[INFO] 2025-10-29 02:34:15 - Split 843 source documents into 3283 chunks.
[INFO] 2025-10-29 02:34:15 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [01:06<00:00,  1.55it/s]


[INFO] 2025-10-29 02:35:22 - ‚úÖ Generated 3283 embeddings.
[INFO] 2025-10-29 02:35:22 - Processing group: REIMBURSEMENT (15 documents)
[INFO] 2025-10-29 02:35:22 - Starting document chunking and embedding...
[INFO] 2025-10-29 02:35:22 - Split 15 source documents into 15 chunks.
[INFO] 2025-10-29 02:35:22 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.89it/s]

[INFO] 2025-10-29 02:35:22 - ‚úÖ Generated 15 embeddings.
[INFO] 2025-10-29 02:35:22 - ‚úÖ Completed chunking and embedding for 4 groups.





In [22]:
# %% [markdown]
# ### Step 4.4: Initialize Vector Store (Modified for Multi-Index)
# **Goal:** Create a **separate Chroma collection** for each document group (e.g., `rag_docs_cv`, `rag_docs_financial`). We use a persistent client so the data is saved to the disk and doesn't need to be re-embedded every time.

# %%
def initialize_vector_stores(embedded_groups):
    """
    Initializes and populates a ChromaDB collection for each document group.
    
    Args:
        embedded_groups (dict): Dictionary from chunking/embedding step.
    
    Returns:
        dict: A dictionary mapping group names to their Chroma Collection objects.
    """
    log("Initializing ChromaDB Persistent Client...")
    # This client saves the database files to the CHROMA_DB_PATH directory.
    chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    all_collections = {}
    
    for name, data in embedded_groups.items():
        # Create a unique collection name based on the index name (e.g., 'rag_docs_cv')
        collection_name = f"rag_docs_{name.lower()}"
        log(f"Creating/getting collection: '{collection_name}'")
        
        # Get or create the collection
        collection = chroma_client.get_or_create_collection(name=collection_name)
        
        ids, document_chunks, embeddings = data['ids'], data['chunks'], data['embeddings']

        # Check if the collection is already populated (fast startup on subsequent runs)
        if collection.count() > 0:
            log(f"Collection '{collection_name}' already exists and has {collection.count()} entries. Skipping storage.")
        elif not ids:
             log(f"Collection '{collection_name}' is empty and no data to add. Skipping.")
        else:
            log(f"Collection '{collection_name}' is empty. Adding {len(ids)} embeddings...")
            
            # Add to Chroma in large batches for efficiency (much faster than chunk-by-chunk)
            batch_size = 500
            for i in range(0, len(ids), batch_size):
                batch_ids = ids[i:i+batch_size]
                batch_docs = document_chunks[i:i+batch_size]
                batch_embeds = embeddings[i:i+batch_size]
                
                collection.add(
                    ids=batch_ids,
                    documents=batch_docs,
                    embeddings=batch_embeds
                )
                log(f"Added batch {i//batch_size + 1} to {collection_name}.")
                
            log(f"‚úÖ Stored {collection.count()} chunks in {collection_name}.")

        all_collections[name] = collection
        
    return all_collections

In [23]:
# %%
# --- Run Step 4.4 (Modified for Multi-Index) ---
# This creates the physical ChromaDB files on your disk.
collections_map = initialize_vector_stores(embedded_groups)
log(f"‚úÖ Successfully initialized {len(collections_map)} Chroma collections.")
print("Available Collections:", list(collections_map.keys()))

2025-10-29 02:35:51,238 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


[INFO] 2025-10-29 02:35:51 - Initializing ChromaDB Persistent Client...
[INFO] 2025-10-29 02:35:52 - Creating/getting collection: 'rag_docs_cv'
[INFO] 2025-10-29 02:35:52 - Collection 'rag_docs_cv' is empty. Adding 80 embeddings...
[INFO] 2025-10-29 02:35:52 - Added batch 1 to rag_docs_cv.
[INFO] 2025-10-29 02:35:52 - ‚úÖ Stored 80 chunks in rag_docs_cv.
[INFO] 2025-10-29 02:35:52 - Creating/getting collection: 'rag_docs_financial'
[INFO] 2025-10-29 02:35:52 - Collection 'rag_docs_financial' is empty. Adding 121 embeddings...
[INFO] 2025-10-29 02:37:00 - Added batch 1 to rag_docs_financial.
[INFO] 2025-10-29 02:37:00 - ‚úÖ Stored 121 chunks in rag_docs_financial.
[INFO] 2025-10-29 02:37:00 - Creating/getting collection: 'rag_docs_specs'
[INFO] 2025-10-29 02:37:00 - Collection 'rag_docs_specs' is empty. Adding 3283 embeddings...
[INFO] 2025-10-29 02:37:01 - Added batch 1 to rag_docs_specs.
[INFO] 2025-10-29 02:37:02 - Added batch 2 to rag_docs_specs.
[INFO] 2025-10-29 02:37:04 - Added b

In [24]:
# %% [markdown]
# ### Step 4.5: Load Local LLM
# **Goal:** Load the local GGUF model (Mistral) into memory using the **`LlamaCPP`** wrapper. This makes the model available for text generation.

# %%
def load_llm():
    """
    Loads the local GGUF model using the LlamaCPP class.
    
    Returns:
        LlamaCPP: The loaded LLM instance.
    """
    log(f"Loading local LLM from {MODEL_PATH} (this may take 20-30s)...")
    if not Path(MODEL_PATH).exists():
        log(f"Error: Model file not found at {MODEL_PATH}")
        log(f"Please run the huggingface-cli download command in the script's docstring.")
        return None  # Return None instead of exiting
        
    llm = LlamaCPP(  # MODIFIED: Correct casing: LlamaCPP
        model_path=MODEL_PATH,
        temperature=LLM_TEMPERATURE,
        max_new_tokens=LLM_MAX_NEW_TOKENS,
        context_window=LLM_N_CTX,
        model_kwargs={
            # set to -1 to use all cores
            "n_threads": -1,
            # offload layers to GPU
            "n_gpu_layers": LLM_N_GPU_LAYERS
        },
        # Enable verbose logging from LlamaCpp
        verbose=True
    )
    log("‚úÖ Local LLM loaded.")
    return llm

In [55]:
# download the model using hf download command
"""hf download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir ./models"""

'hf download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir ./models'

In [25]:
# %%
# --- Run Step 4.5 ---
# This will take 20-30 seconds (or more) as it loads the model into memory (and VRAM).
llm = load_llm()

[INFO] 2025-10-29 02:38:00 - Loading local LLM from ./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (this may take 20-30s)...


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4060 Laptop GPU, compute capability 8.9, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4060 Laptop GPU) - 7100 MiB free
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loade

[INFO] 2025-10-29 02:38:08 - ‚úÖ Local LLM loaded.


CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
Model metadata: {'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'general.architecture': 'llama', 'llama.context_length': '32768', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '14336', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '15', 'llama.attention.head_count_kv': '8', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '1000000.000000', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{{ bos_token }}{% f

In [26]:
# %% [markdown]
# ### Step 4.6: Define RAG Retrieval and Generation Functions
# **Goal:** Define the core functions for **Routing**, **Retrieval**, and **Generation**. This implements the Multi-Index strategy with the "Fallback to All Indexes" logic.

# %%
def route_query_to_index(query):
    """
    Simple routing function based on keyword detection.
    If no keywords match, returns 'FALLBACK_ALL' to trigger a search across all indexes.
    """
    query_lower = query.lower()
    
    if "cv" in query_lower or "resume" in query_lower or "experience" in query_lower:
        return "CV"
    elif "financial" in query_lower or "report" in query_lower or "balance" in query_lower:
        return "FINANCIAL"
    elif "spec" in query_lower or "architecture" in query_lower or "design" in query_lower:
        return "SPECS"
    elif "reimbursement" in query_lower or "expense" in query_lower or "travel" in query_lower:
        return "REIMBURSEMENT"
    else:
        # **FALLBACK LOGIC**
        log("No specific keywords found. Triggering search across ALL available indexes.")
        return "FALLBACK_ALL" # Special marker for the retrieval function

def retrieve_context(query, collections_map, embed_model, top_k=TOP_K_RESULTS):
    """
    Performs the full RAG retrieval process: Routing -> Embedding -> Query.
    
    If routing returns 'FALLBACK_ALL', it searches all collections and combines results.
        
    Returns:
        str: A single string containing all retrieved context chunks, separated by newlines.
    """
    # 1. ROUTING: Determine which index(es) to search
    index_name = route_query_to_index(query)
    
    # 2. Determine which collections to use
    if index_name == "FALLBACK_ALL":
        # Search all collections
        collections_to_search = collections_map.values()
        # For general search, retrieve fewer chunks per index but keep total_chunks high
        # We will split the TOP_K_RESULTS across all collections.
        k_per_index = max(1, TOP_K_RESULTS // len(collections_map)) 
        
    elif index_name in collections_map:
        # Search only the routed collection
        collections_to_search = [collections_map[index_name]]
        k_per_index = TOP_K_RESULTS
        log(f"Searching in single collection: {index_name}")
        
    else:
        return f"Error: Index '{index_name}' could not be found. Context unavailable."

    # 3. EMBEDDING: Vectorize the query once
    query_emb = embed_model.encode(query)
    combined_contexts = []
    
    # 4. QUERY: Loop through the chosen collections
    for collection in collections_to_search:
        log(f"Querying collection: {collection.name} for {k_per_index} chunks.")
        
        # Search the collection
        results = collection.query(
            query_embeddings=[query_emb.tolist()], 
            n_results=k_per_index
        )
        
        # Extract and append the documents (text chunks)
        contexts = results.get('documents', [[]])[0]
        combined_contexts.extend(contexts)
        
    # Combine the final context chunks into a single string
    return "\n\n".join(combined_contexts)

In [27]:
# %%
def rag_answer(query, llm, collections_map, embed_model):
    """
    Generates a RAG-augmented answer from the LLM.
    
    Args:
        query (str): The user's input question.
        llm (LlamaCPP): The loaded LLM instance.
        collections_map (dict): The map of index names to Chroma collection objects.
        embed_model (SentenceTransformer): The embedding model.
        
    Returns:
        str: The final, synthesized answer from the LLM.
    """
    # 1. RETRIEVAL: Get the relevant context chunks (now handles routing and fallback)
    context = retrieve_context(query, collections_map, embed_model)
    
    # 2. PROMPT CONSTRUCTION (Prompt Engineering)
    # The prompt strictly instructs the LLM to use only the provided context.
    prompt = dedent(f"""
        You are a helpful and professional assistant. Your task is to provide a concise and accurate answer 
        based **ONLY** on the context provided below.
        If the answer is not present in the context, you MUST state, "I cannot find a definitive answer in the provided documents."

        Context:
        ---
        {context}
        ---

        Question: {query}
        Answer:
    """)
    
    # 3. GENERATION: Pass the prompt to the local LLM
    output = llm.complete(prompt)
    
    return output.text.strip()

In [28]:
# %% [markdown]
# ### Step 4.7: Test the RAG Pipeline
# **Goal:** Execute a single test query to ensure the routing, retrieval, and generation steps work together.

# %%
test_queries = [
    # 1. ROUTED QUERY: Should hit the 'CV' index
    "What is the experience level of the candidate \"Moid Hassan\" in Microsoft in the CV documents?",
    
    # 2. FALLBACK QUERY: Should hit ALL indexes (triggering the fallback logic)
    "Tell me about the documents I have, focusing on any key dates.",
    
    # 3. IRRELEVANT QUERY: Should test the LLM's constraint
    "What is the main architecture of the Eiffel Tower?",
]

# Run the tests
for i, query in enumerate(test_queries):
    print(f"\n--- TEST QUERY {i+1} ---")
    print(f"üßë‚Äçüíª Query: {query}")
    
    # 1. Determine where the query is routed
    routed_index = route_query_to_index(query)
    print(f"‚û°Ô∏è Routing Decision: {routed_index}")
    
    # 2. Get the RAG Answer
    start_time = time.time()
    response = rag_answer(query, llm, collections_map, embed_model)
    end_time = time.time()
    
    print(f"‚è±Ô∏è Time taken: {end_time - start_time:.2f} seconds")
    print(f"\nüß† RAG Answer:\n{response}")


--- TEST QUERY 1 ---
üßë‚Äçüíª Query: What is the experience level of the candidate "Moid Hassan" in Microsoft in the CV documents?
‚û°Ô∏è Routing Decision: CV
[INFO] 2025-10-29 02:38:52 - Searching in single collection: CV


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12.26it/s]

[INFO] 2025-10-29 02:38:53 - Querying collection: rag_docs_cv for 5 chunks.



llama_perf_context_print:        load time =   50028.53 ms
llama_perf_context_print: prompt eval time =   50027.87 ms /  4222 tokens (   11.85 ms per token,    84.39 tokens per second)
llama_perf_context_print:        eval time =   54263.91 ms /    99 runs   (  548.12 ms per token,     1.82 tokens per second)
llama_perf_context_print:       total time =  104388.88 ms /  4321 tokens
llama_perf_context_print:    graphs reused =         94


‚è±Ô∏è Time taken: 104.53 seconds

üß† RAG Answer:
Moid Hassan's CV documents do not provide any specific information about his experience level with Microsoft. However, he has mentioned working on Microsoft technologies such as C#, .NET, and SQL. Additionally, he has listed certifications from Microsoft, including "Microsoft Certified: Azure Developer Associate" and "Microsoft Certified: Azure Solutions Architect Expert." These certifications suggest that Moid Hassan has gained some level of proficiency and experience in using Microsoft Azure.

--- TEST QUERY 2 ---
üßë‚Äçüíª Query: Tell me about the documents I have, focusing on any key dates.
[INFO] 2025-10-29 02:40:37 - No specific keywords found. Triggering search across ALL available indexes.
‚û°Ô∏è Routing Decision: FALLBACK_ALL
[INFO] 2025-10-29 02:40:37 - No specific keywords found. Triggering search across ALL available indexes.


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12.01it/s]


[INFO] 2025-10-29 02:40:37 - Querying collection: rag_docs_cv for 1 chunks.
[INFO] 2025-10-29 02:40:37 - Querying collection: rag_docs_financial for 1 chunks.
[INFO] 2025-10-29 02:40:37 - Querying collection: rag_docs_specs for 1 chunks.
[INFO] 2025-10-29 02:40:37 - Querying collection: rag_docs_reimbursement for 1 chunks.


Llama.generate: 76 prefix-match hit, remaining 1559 prompt tokens to eval
llama_perf_context_print:        load time =   50028.53 ms
llama_perf_context_print: prompt eval time =   14728.55 ms /  1559 tokens (    9.45 ms per token,   105.85 tokens per second)
llama_perf_context_print:        eval time =  198385.76 ms /   388 runs   (  511.30 ms per token,     1.96 tokens per second)
llama_perf_context_print:       total time =  214450.67 ms /  1947 tokens
llama_perf_context_print:    graphs reused =        375


‚è±Ô∏è Time taken: 214.69 seconds

üß† RAG Answer:
The documents provided include a resume, academic transcripts, personal information, and a declaration. The resume lists the individual's core competencies, which include data analytics and visualization, data management, leadership and collaboration, and communication and problem-solving skills. The academic transcripts show the individual's grades from their Post Graduate Diploma in Statistics, Post Graduate Diploma in HRM, and Bachelor of Arts in Economics (Honours). The personal information includes the individual's date of birth, gender, nationality, marital status, and languages. The declaration is a statement that the information provided is true to the best of the individual's knowledge and belief.

        In addition to these documents, there is also a Microsoft Limited Warranty and Protection Plan document. This document contains information about how Microsoft sets the start date for its limited hardware warranty and prote

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10.78it/s]
Llama.generate: 75 prefix-match hit, remaining 3276 prompt tokens to eval


[INFO] 2025-10-29 02:44:12 - Querying collection: rag_docs_specs for 5 chunks.


llama_perf_context_print:        load time =   50028.53 ms
llama_perf_context_print: prompt eval time =   36875.21 ms /  3276 tokens (   11.26 ms per token,    88.84 tokens per second)
llama_perf_context_print:        eval time =   18172.40 ms /    34 runs   (  534.48 ms per token,     1.87 tokens per second)
llama_perf_context_print:       total time =   55083.71 ms /  3310 tokens
llama_perf_context_print:    graphs reused =         32


‚è±Ô∏è Time taken: 55.28 seconds

üß† RAG Answer:
I cannot find a definitive answer in the provided documents regarding the main architecture of the Eiffel Tower. The context only provides technical specifications of a device.
