In [1]:
# steps to install llama-cpp-python on Windows with CUDA support
# 1. Install Visual Studio 2022 with "Desktop development with C++" workload
# 2. Install CUDA Toolkit from NVIDIA's website if not already installed
# 3. Install latest version of cmake
# 4. Open x64 Native Tools Command Prompt for VS 2022
# 5. Run the following commands, in order:
    # pip install llama-index
    # pip install llama-index-llms-llama-cpp
    # pip install sentence-transformers
    # pip install chromadb
    # pip install faiss-cpu
    # pip install pypdf
    # pip install docx2txt
    # pip install python-pptx
    # set FORCE_CMAKE=1
    # set CMAKE_ARGS=-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=89
    #   (89 = compute capability 8.9, i.e. Ada Lovelace / RTX 40-series; change it to match your GPU)
    # pip install llama-cpp-python --force-reinstall --no-cache-dir --verbose --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125
# 6. Verify installation by running python shell and executing:
    # from llama_cpp import Llama
    # llm = Llama(model_path=r"C:\Users\moidhassan\Downloads\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", n_gpu_layers=-1)
# 7. If you encounter issues, refer to the GitHub repository: https://github.com/abetlen/llama-cpp-python
# Note: Adjust the model_path to point to your downloaded GGUF model file.
# 8. For CUDA support, ensure your GPU is compatible and the correct CUDA version is installed.
# 9. Test with a simple script to ensure everything is working fine.
# Note: Some installations may require restarting the terminal or IDE to recognize new environment variables.


In [31]:
# %% [markdown]
# ## 1. Imports & Configuration
# All necessary libraries and configuration constants.

# %%
import sys
import time
from pathlib import Path
import chromadb
from textwrap import dedent
from sentence_transformers import SentenceTransformer
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.llama_cpp import LlamaCPP

In [2]:
# --- Configuration Constants ---
# Every value a reader may want to tune is defined in this cell.

# 1. Paths
# Adjust these paths to match your folder structure.
# They are relative, so they resolve against the kernel's working directory.
CV_PATH = "../data/CV"
FINANCIAL_PATH = "../data/financial"
SPECS_PATH = "../data/specs"
REIMBURSEMENT_PATH = "../data/reimbursement"
# Index name -> source directory; load_documents() builds one document group per entry.
DATA_PATHS_MAP = {
    "CV": CV_PATH,
    "FINANCIAL": FINANCIAL_PATH,
    "SPECS": SPECS_PATH,
    "REIMBURSEMENT": REIMBURSEMENT_PATH
}
MODEL_PATH = "./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# NOTE(review): the visible cells build FAISS indexes and never reference the two
# Chroma settings below -- confirm they are still needed.
CHROMA_DB_PATH = "./chroma_db_v2"
COLLECTION_NAME = "local_rag_demo_v2"

# 2. Embedding Model
# "all-MiniLM-L6-v2" is small (~90MB), fast, and high-quality.
# EMBED_MODEL_NAME_2 is the model actually loaded in the "Run Step 4.2" cell.
EMBED_MODEL_NAME_1 = "all-MiniLM-L6-v2"
EMBED_MODEL_NAME_2 = "BAAI/bge-small-en-v1.5"

# 3. Chunking Parameters
# SentenceSplitter is sentence-aware: it tries to keep whole sentences together.
# It is not embedding-based "semantic" splitting, and per the LlamaIndex docs it
# measures both values below in tokens, not characters.
CHUNK_SIZE = 1024      # Target size of each chunk (in tokens)
CHUNK_OVERLAP = 192     # Overlap between consecutive chunks (in tokens) to maintain context

# 4. LLM Parameters
# These are settings for the Mistral 7B model.
LLM_N_CTX = 8192*3       # Context window size in tokens (= 24576)
LLM_N_GPU_LAYERS = 40  # Layers to offload to GPU (adjust based on your VRAM)
LLM_TEMPERATURE = 0.2  # Low temperature for factual, less "creative" answers
LLM_MAX_NEW_TOKENS = 1024 # Max tokens to generate in an answer

# 5. Retrieval Parameters
TOP_K_RESULTS = 3      # Number of context chunks to retrieve
# 6. Helper Function
def log(message):
    """Print *message* as an ``[INFO]`` line prefixed with the current local time."""
    # Timestamp is taken at call time, formatted as YYYY-MM-DD HH:MM:SS.
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print(f"[INFO] {timestamp} - {message}")

log("Configuration loaded.")

[INFO] 2025-10-29 13:09:31 - Configuration loaded.


In [3]:
# %% [markdown]
# ### Step 4.1: Load Documents
# **Goal:** Load documents from each directory path separately and group them. This allows us to create a dedicated index for "CVs", "Financials", etc. (Multi-Index RAG).

# %%
def load_documents(data_paths_map):
    """
    Loads documents from multiple directories and groups them by index name.

    Each directory is read recursively with SimpleDirectoryReader. A source is
    logged and skipped -- never fatal for the other sources -- when its path
    does not exist, when the reader cannot be created for it, or when loading
    raises.

    Args:
        data_paths_map (dict): A dictionary mapping index names (e.g., "CV")
                               to their directory paths.

    Returns:
        dict: A dictionary where keys are the index names and values are
              lists of LlamaIndex Document objects. Only sources that loaded
              without error are present; the dict is empty if nothing loaded.
    """
    log("Starting to load documents for multiple indexes...")
    grouped_documents = {}

    # Iterate through the map: (Index Name) -> (Directory Path)
    for name, path in data_paths_map.items():
        if not Path(path).exists():
            log(f"Warning: Path for '{name}' does not exist, skipping: {path}")
            continue

        try:
            # The reader is created inside the try block on purpose: its
            # constructor validates the directory and can raise by itself
            # (LlamaIndex raises ValueError when no files are found), which
            # must not abort the remaining sources.
            reader = SimpleDirectoryReader(path, recursive=True)
            documents = reader.load_data()
        except Exception as e:
            log(f"Error loading data from {path} for '{name}': {e}")
            continue

        grouped_documents[name] = documents
        log(f"Successfully loaded {len(documents)} documents for index: '{name}'")

    total_docs = sum(len(docs) for docs in grouped_documents.values())
    log(f"‚úÖ Total documents loaded for {len(grouped_documents)} indexes: {total_docs}")

    if not grouped_documents:
        log("Error: No documents were loaded. Please check your DATA_PATHS_MAP.")

    return grouped_documents

In [4]:
# %%
# --- Run Step 4.1 (Modified for Multi-Index) ---
# Scans every directory in DATA_PATHS_MAP and stores the result in the
# notebook-level 'grouped_documents' dict (index name -> list of documents),
# which the following cells read.
grouped_documents = load_documents(DATA_PATHS_MAP)
# NOTE(review): load_documents() already logs this exact error when nothing was
# loaded, so an empty result prints the message twice -- consider keeping one.
if not grouped_documents:
    log("Error: No documents were loaded. Please check your DATA_PATHS_MAP.")
    

[INFO] 2025-10-29 13:09:33 - Starting to load documents for multiple indexes...




[INFO] 2025-10-29 13:09:47 - Successfully loaded 51 documents for index: 'CV'




[INFO] 2025-10-29 13:10:27 - Successfully loaded 98 documents for index: 'FINANCIAL'




[INFO] 2025-10-29 13:11:28 - Successfully loaded 843 documents for index: 'SPECS'
[INFO] 2025-10-29 13:11:29 - Successfully loaded 15 documents for index: 'REIMBURSEMENT'
[INFO] 2025-10-29 13:11:29 - ‚úÖ Total documents loaded for 4 indexes: 1007


In [5]:
# checking how many documents were loaded per index
#for index_name, docs in grouped_documents.items():
#    log(f"Index '{index_name}' has {len(docs)} documents loaded.")

# checking how many documents were split into multiple chunks
#for index_name, docs in grouped_documents.items():
#    chunked_count = sum(1 for doc in docs if len(doc.get_text()) > CHUNK_SIZE)
#    log(f"Index '{index_name}' has {chunked_count} documents that were chunked into multiple pieces.")

# Compare, per index, the number of raw files on disk with the number of
# Document objects that were loaded into grouped_documents.
# The loaded count can be higher than the file count -- presumably because some
# readers emit several Documents per file (e.g. one per page); verify if it matters.
for index_name, path in DATA_PATHS_MAP.items():
    source_dir = Path(path)
    # rglob('*') + is_file() counts regular files only: files without an
    # extension are included and directories are never counted, which a
    # '*.*' pattern gets wrong on both points.
    if source_dir.exists():
        raw_count = sum(1 for entry in source_dir.rglob('*') if entry.is_file())
    else:
        raw_count = 0
    loaded_count = len(grouped_documents.get(index_name, []))
    log(f"Index '{index_name}': Raw files = {raw_count}, Loaded documents = {loaded_count}")
    #chunked_count = sum(1 for doc in grouped_documents.get(index_name, []) if len(doc.text) > CHUNK_SIZE)
    #log(f"Index '{index_name}': Chunked documents = {chunked_count}")
    


[INFO] 2025-10-29 13:18:16 - Index 'CV': Raw files = 37, Loaded documents = 51
[INFO] 2025-10-29 13:18:16 - Index 'FINANCIAL': Raw files = 8, Loaded documents = 98
[INFO] 2025-10-29 13:18:16 - Index 'SPECS': Raw files = 89, Loaded documents = 843
[INFO] 2025-10-29 13:18:16 - Index 'REIMBURSEMENT': Raw files = 10, Loaded documents = 15


In [6]:
# Preview the first 500 characters of the first document in every index;
# indexes without any documents are skipped.
for index_name, docs in grouped_documents.items():
    if not docs:
        continue
    log(f"Sample document from index '{index_name}':\n{dedent(docs[0].text[:500])}...\n")
    print("\n")

[INFO] 2025-10-29 13:18:19 - Sample document from index 'CV':
ABHISHEK  RANJAN                  +91  9040140733   
 Data  Scientist  |  AI  Engineer                     13eee079@gmail.com  

ABOUT  ME  
Data  Scientist  with  3.9   years  of  experience,  with  expertise  in  Machine  Learning,  Deep  Learning,  Generative  AI,  NLP,  
LLMs,

and

RAG.

Skilled

in

fine-tuning

Hugging

Face

models,

building

AI-driven

solutions,

and

deploying

scalable

architectures

using

LangChain

and

OpenAI

API.

Passionate

about

AI...



[INFO] 2025-10-29 13:18:19 - Sample document from index 'FINANCIAL':
...



[INFO] 2025-10-29 13:18:19 - Sample document from index 'SPECS':
Surface USB-C¬Æ Travel Hub
All the connections, wherever you are
Pitch
Turn your laptop into an on-the-go productivity companion with 
this elegant, multi-port travel adapter. Designed for on-the-go 
professionals, it gives you five ways to stay productive anywhere. 
Connect to the internet, project content onto 

In [7]:
# %% [markdown]
# ### Step 4.2: Load Embedding Model
# We define the function to load the `sentence-transformer` model and then call it.

# %%
def load_embedding_model(model_name, cuda_flag=False):
    """
    Create a SentenceTransformer and confirm it works with a probe sentence.

    Args:
        model_name (str): The name of the model from Hugging Face.
        cuda_flag (bool): When True the model is created with device='cuda';
            otherwise no device is passed and the library picks its default.

    Returns:
        SentenceTransformer: The loaded embedding model.
    """
    log(f"Loading embedding model: {model_name}...")

    # Creating the model downloads it on the first run if it is not cached.
    if not cuda_flag:
        model = SentenceTransformer(model_name)
    else:
        log("Using CUDA for embedding model.")
        model = SentenceTransformer(model_name, device='cuda')

    # Encode one sentence as a smoke test and report the embedding dimension.
    probe_vector = model.encode("This is a test sentence.")
    vector_size = len(probe_vector)
    log(f"‚úÖ Embedding model loaded. Vector size: {vector_size}")
    return model

In [8]:
# %%
# --- Run Step 4.2 ---
# This will download the model if you don't have it cached.
# load_embedding_model() logs its own "Loading embedding model: ..." line, so
# no separate print is needed here (it only duplicated that message).
# cuda_flag=True places the model on the GPU; pass cuda_flag=False on a
# machine without CUDA.
embed_model = load_embedding_model(EMBED_MODEL_NAME_2, cuda_flag=True)

2025-10-29 13:18:24,428 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


Loading embedding model: BAAI/bge-small-en-v1.5...
[INFO] 2025-10-29 13:18:24 - Loading embedding model: BAAI/bge-small-en-v1.5...
[INFO] 2025-10-29 13:18:24 - Using CUDA for embedding model.


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.16it/s]

[INFO] 2025-10-29 13:18:29 - ‚úÖ Embedding model loaded. Vector size: 384





In [11]:
# %% [markdown]
# ### Step 4.3: Chunk and Embed Documents
# **Goal:** Split the raw documents into sentence-aware chunks and generate an embedding vector for each chunk using the loaded model.

# %%
def chunk_and_embed(documents, embed_model, batch_size=64, chunk_size=None, chunk_overlap=None):
    """
    Splits documents into sentence-aware chunks and embeds every chunk.

    Args:
        documents (list): List of LlamaIndex Document objects for a single index group.
        embed_model (SentenceTransformer): The embedding model.
        batch_size (int): Number of chunks to process in each embedding batch.
        chunk_size (int, optional): Chunk size handed to SentenceSplitter.
            Defaults to the notebook-level CHUNK_SIZE when omitted.
        chunk_overlap (int, optional): Chunk overlap handed to SentenceSplitter.
            Defaults to the notebook-level CHUNK_OVERLAP when omitted.

    Returns:
        dict: Three parallel lists under the keys 'ids', 'chunks' and
              'embeddings'. IDs have the form "chunk_<position>", so they are
              unique within one call only, not across index groups. All three
              lists are empty when there is nothing to embed.
    """
    log("Starting document chunking and embedding...")

    if not documents:
        log("No documents to chunk. Skipping...")
        return {'ids': [], 'chunks': [], 'embeddings': []}

    # Fall back to the notebook configuration only when the caller did not
    # override the value, so existing three-argument calls behave as before.
    effective_chunk_size = CHUNK_SIZE if chunk_size is None else chunk_size
    effective_chunk_overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    # SentenceSplitter prefers sentence boundaries when cutting the text.
    text_splitter = SentenceSplitter(
        chunk_size=effective_chunk_size,
        chunk_overlap=effective_chunk_overlap
    )

    # LlamaIndex's SentenceSplitter returns "Node" objects.
    nodes = text_splitter.get_nodes_from_documents(documents)

    log(f"Split {len(documents)} source documents into {len(nodes)} chunks.")

    if not nodes:
        # Documents without usable text yield no chunks; skip the model call.
        log("No chunks were produced. Skipping embedding...")
        return {'ids': [], 'chunks': [], 'embeddings': []}

    log("Generating embeddings for all chunks...")
    # 1. Get all the text content first
    all_chunk_text = [node.get_content() for node in nodes]

    # 2. One encode() call for every chunk; the model batches internally
    #    using batch_size, which is much faster than one call per chunk.
    embeddings = embed_model.encode(all_chunk_text, show_progress_bar=True, batch_size=batch_size)

    # 3. Create the IDs list (position-based, parallel to all_chunk_text)
    ids = [f"chunk_{i}" for i in range(len(nodes))]

    log(f"‚úÖ Generated {len(embeddings)} embeddings.")
    return {
        'ids': ids,
        'chunks': all_chunk_text,
        'embeddings': embeddings.tolist()
    }

In [12]:
# %%
# --- Run Step 4.3 (Modified for Multi-Index) ---
# Chunks and embeds the documents of every group in 'grouped_documents'.
# Results are collected in the notebook-level 'embedded_groups' dict, keyed by
# the same group names; each value is the dict returned by chunk_and_embed()
# (keys 'ids', 'chunks', 'embeddings').
embedded_groups = {}
batch_size = 128  # Passed to chunk_and_embed(); adjust based on your GPU memory

for name, docs in grouped_documents.items():
    log(f"Processing group: {name} ({len(docs)} documents)")
    
    # Chunk + embed this group's documents in one call
    processed_data = chunk_and_embed(docs, embed_model, batch_size)
    
    embedded_groups[name] = processed_data

log(f"‚úÖ Completed chunking and embedding for {len(embedded_groups)} groups.")
# You can now inspect 'embedded_groups' to see IDs, chunks, and embeddings for each index!

[INFO] 2025-10-29 13:23:10 - Processing group: CV (51 documents)
[INFO] 2025-10-29 13:23:10 - Starting document chunking and embedding...
[INFO] 2025-10-29 13:23:10 - Split 51 source documents into 61 chunks.
[INFO] 2025-10-29 13:23:10 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.45it/s]


[INFO] 2025-10-29 13:23:11 - ‚úÖ Generated 61 embeddings.
[INFO] 2025-10-29 13:23:11 - Processing group: FINANCIAL (98 documents)
[INFO] 2025-10-29 13:23:11 - Starting document chunking and embedding...
[INFO] 2025-10-29 13:23:11 - Split 98 source documents into 101 chunks.
[INFO] 2025-10-29 13:23:11 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.32it/s]


[INFO] 2025-10-29 13:23:12 - ‚úÖ Generated 101 embeddings.
[INFO] 2025-10-29 13:23:12 - Processing group: SPECS (843 documents)
[INFO] 2025-10-29 13:23:12 - Starting document chunking and embedding...
[INFO] 2025-10-29 13:23:27 - Split 843 source documents into 2665 chunks.
[INFO] 2025-10-29 13:23:27 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:26<00:00,  1.25s/it]


[INFO] 2025-10-29 13:23:53 - ‚úÖ Generated 2665 embeddings.
[INFO] 2025-10-29 13:23:53 - Processing group: REIMBURSEMENT (15 documents)
[INFO] 2025-10-29 13:23:53 - Starting document chunking and embedding...
[INFO] 2025-10-29 13:23:53 - Split 15 source documents into 15 chunks.
[INFO] 2025-10-29 13:23:53 - Generating embeddings for all chunks...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  7.06it/s]

[INFO] 2025-10-29 13:23:53 - ‚úÖ Generated 15 embeddings.
[INFO] 2025-10-29 13:23:53 - ‚úÖ Completed chunking and embedding for 4 groups.





In [14]:
# %% [markdown]
# ### Step 4.4: Initialize Vector Store (Modified for Multi-Index)
# **Goal:** Initialize and populate a FAISS index for each document group. Uses GPU acceleration automatically if available.

# %%
import faiss
import numpy as np

def initialize_vector_stores(embedded_groups):
    """
    Build one FAISS inner-product index per document group.

    Every group's embeddings are copied into a float32 array, L2-normalised
    (so inner product behaves as cosine similarity) and added to a flat
    index. If FAISS reports at least one GPU the index is moved to the GPUs,
    falling back to the CPU index when that transfer fails.

    Args:
        embedded_groups (dict): Group name -> dict from the embedding step with:
            - ids: list of unique IDs
            - chunks: list of document text chunks
            - embeddings: list or np.array of vector embeddings

    Returns:
        dict: Group name -> {"index": faiss.Index, "ids": [...], "chunks": [...]}.
              Groups with no ids, or whose ids and embeddings differ in
              length, are logged and left out.
    """
    log("üîç Initializing FAISS Vector Stores...")
    stores = {}

    # Probe GPU support once up front; any failure simply means CPU-only.
    gpu_enabled = False
    try:
        gpu_count = faiss.get_num_gpus()
        if gpu_count > 0:
            gpu_enabled = True
            log(f"‚ö° Detected {gpu_count} GPU(s) ‚Äî will use FAISS GPU acceleration.")
        else:
            log("üí° No GPU FAISS available ‚Äî running on CPU.")
    except Exception as e:
        log(f"‚ö†Ô∏è GPU detection failed, defaulting to CPU: {e}")

    for group_name, payload in embedded_groups.items():
        chunk_ids = payload.get("ids", [])
        chunk_texts = payload.get("chunks", [])
        raw_vectors = payload.get("embeddings", [])

        collection_name = f"rag_docs_{group_name.lower()}"
        log(f"Creating FAISS index for '{collection_name}'")

        # A group needs at least one id and exactly one vector per id.
        is_usable = bool(chunk_ids) and len(chunk_ids) == len(raw_vectors)
        if not is_usable:
            log(f"‚ö†Ô∏è Skipping '{collection_name}' (empty or invalid data).")
            continue

        # float32 copy, normalised in place so inner product == cosine similarity
        vectors = np.array(raw_vectors, dtype="float32")
        faiss.normalize_L2(vectors)

        flat_index = faiss.IndexFlatIP(vectors.shape[1])

        # Start from the CPU index and upgrade to GPU only when that works.
        index = flat_index
        if gpu_enabled:
            try:
                index = faiss.index_cpu_to_all_gpus(flat_index)
                log(f"‚úÖ Using GPU FAISS index for '{collection_name}'")
            except Exception as e:
                log(f"‚ö†Ô∏è GPU transfer failed for '{collection_name}', using CPU instead: {e}")
                index = flat_index

        index.add(vectors)
        log(f"‚úÖ Stored {len(chunk_ids)} vectors in '{collection_name}'")

        # Keep ids and chunk texts next to the index for later lookup.
        stores[group_name] = {
            "index": index,
            "ids": chunk_ids,
            "chunks": chunk_texts
        }

    log("üéØ All FAISS vector stores initialized successfully.")
    return stores


2025-10-29 13:44:35,651 - INFO - Loading faiss with AVX2 support.
2025-10-29 13:44:35,745 - INFO - Successfully loaded faiss with AVX2 support.
