In [5]:
# %% [markdown]
# ## 1. Imports & Configuration
# All necessary libraries and configuration constants.

# %%
# Step 0 - Import all necessary libraries
# ---------------------------------------

# Basic utilities
import os
import torch
from pathlib import Path

# Transformers pipeline for LLM inference
from transformers import pipeline

# LangChain ecosystem (community, text splitters, embeddings, FAISS, and HuggingFace LLM)
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_huggingface import HuggingFacePipeline

# Step 1 - Check environment
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ Using device: {device}")

# Step 2 - Base data path
# -----------------------
#DATA_DIR = "data"
#os.makedirs(DATA_DIR, exist_ok=True)
#print(f"üìÅ Base data directory: {os.path.abspath(DATA_DIR)}")

# Expected folder structure:
# data/
# ‚îú‚îÄ‚îÄ CV/
# ‚îú‚îÄ‚îÄ financial/
# ‚îú‚îÄ‚îÄ reimbursement/
# ‚îî‚îÄ‚îÄ specs/
# 
# Each folder should contain PDFs, DOCX, or TXT files for that category.

print("‚úÖ Cell 1 executed successfully ‚Äî environment ready.")


  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Using device: cuda
‚úÖ Cell 1 executed successfully ‚Äî environment ready.


In [6]:
# %% [markdown]
# ## 2A. Configuration & Helper Functions
# 
# This cell defines:
# - Folder structure and configuration constants
# - Supported file extensions and their respective loaders
# - Utility functions for:
#   - Loading documents from multiple folders
#   - Splitting them into overlapping chunks for embedding

# %%
# Step 2A - Define configuration and helper functions

# --- Global configuration ---
DATA_DIR = "../data"
CATEGORY_PATHS = {
    "CV": os.path.join(DATA_DIR, "CV"),
    "FINANCIAL": os.path.join(DATA_DIR, "financial"),
    "REIMBURSEMENT": os.path.join(DATA_DIR, "reimbursement"),
    "SPECS": os.path.join(DATA_DIR, "specs"),
}

# --- Map supported file extensions to document loaders ---
EXT_TO_LOADER = {
    ".pdf": (PyPDFLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf-8"}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
}

def load_documents_from_folder(folder_path):
    """
    Loads all supported documents recursively from the given folder.
    Returns a list of LangChain Document objects.
    """
    docs = []
    folder = Path(folder_path)
    if not folder.exists():
        return docs

    for file_path in folder.rglob("*"):
        if file_path.is_dir():
            continue
        ext = file_path.suffix.lower()
        if ext not in EXT_TO_LOADER:
            continue
        loader_cls, loader_kwargs = EXT_TO_LOADER[ext]
        try:
            loader = loader_cls(str(file_path), **loader_kwargs)
            loaded = loader.load()
            docs.extend(loaded if isinstance(loaded, list) else [loaded])
        except Exception as e:
            print(f"[WARN] Could not load {file_path}: {e}")
    return docs


def process_all_documents(category_paths, chunk_size=1000, chunk_overlap=100):
    """
    Loads and chunks all documents from all categories.
    Returns a dictionary:
        {
          "CV": [chunk_dicts...],
          "FINANCIAL": [chunk_dicts...],
          ...
        }
    """
    grouped_chunks = {}
    total_docs = 0
    total_chunks = 0

    print(f"using chunk_size-{chunk_size} and chunk_overlap-{chunk_overlap} for splitting")
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for category, folder_path in category_paths.items():
        print(f"\n[INFO] Processing category: {category}")
        docs = load_documents_from_folder(folder_path)
        num_docs = len(docs)
        total_docs += num_docs
        print(f"   Loaded {num_docs} documents from {folder_path}")

        if not docs:
            grouped_chunks[category] = []
            continue

        try:
            chunks = splitter.split_documents(docs)
            chunked_data = [
                {
                    "text": getattr(doc, "page_content", str(doc)),
                    "meta": {
                        "source": getattr(doc, "metadata", {}).get("source", None),
                        "chunk_id": f"{category}_chunk_{i}"
                    },
                }
                for i, doc in enumerate(chunks)
            ]
            grouped_chunks[category] = chunked_data
            total_chunks += len(chunked_data)
            print(f"   -> Created {len(chunked_data)} chunks for '{category}'")
        except Exception as e:
            print(f"[ERROR] Failed to chunk '{category}': {e}")
            grouped_chunks[category] = []

    print(f"\n‚úÖ Finished processing {total_docs} documents across all categories")
    print(f"‚úÖ Total chunks created: {total_chunks}")

    return grouped_chunks


print("‚úÖ Cell 2A executed successfully ‚Äî configuration and helper functions defined.")


‚úÖ Cell 2A executed successfully ‚Äî configuration and helper functions defined.


In [18]:
1024+512+64

1600

In [19]:
# %% [markdown]
# ## 2B. Load & Chunk All Category Documents (Refactored)
# 
# This cell:
# - Uses `process_all_documents()` to load and chunk all documents.
# - Displays a brief summary of the results and a preview of chunks.

# %%
# Step 2B - Load and chunk all category documents

# --- Chunking parameters (easy to tune globally) ---
CHUNK_SIZE = 1600
CHUNK_OVERLAP = 256

grouped_chunks = process_all_documents(CATEGORY_PATHS, CHUNK_SIZE, CHUNK_OVERLAP)

# --- Display a quick summary ---
print("\n--- Chunk Summary ---")
for category, chunks in grouped_chunks.items():
    print(f"{category}: {len(chunks)} chunks")
    if chunks:
        sample_text = chunks[0]['text'][:200].replace('\n', ' ')
        print(f"  Example: {sample_text}...\n")

print("‚úÖ Cell 2B executed successfully ‚Äî documents loaded & chunked for all categories.")


using chunk_size-1600 and chunk_overlap-256 for splitting

[INFO] Processing category: CV


Ignoring wrong pointing object 41 0 (offset 0)


   Loaded 51 documents from ../data\CV
   -> Created 139 chunks for 'CV'

[INFO] Processing category: FINANCIAL


invalid pdf header: b'\xac\xed\x00\x05u'
incorrect startxref pointer(1)
parsing for Object Streams


   Loaded 98 documents from ../data\financial
   -> Created 117 chunks for 'FINANCIAL'

[INFO] Processing category: REIMBURSEMENT
   Loaded 15 documents from ../data\reimbursement
   -> Created 22 chunks for 'REIMBURSEMENT'

[INFO] Processing category: SPECS


Ignoring wrong pointing object 41 0 (offset 0)
parsing for Object Streams
parsing for Object Streams
parsing for Object Streams


   Loaded 403 documents from ../data\specs
   -> Created 585 chunks for 'SPECS'

‚úÖ Finished processing 567 documents across all categories
‚úÖ Total chunks created: 863

--- Chunk Summary ---
CV: 139 chunks
  Example: ABHISHEK  RANJAN                  +91  9040140733     Data  Scientist  |  AI  Engineer                     13eee079@gmail.com     ABOUT  ME   Data  Scientist  with  3.9   years  of  experience,  with ...

FINANCIAL: 117 chunks
  Example: STOCK PLAN SERVICES REPORT April 1, 2024 - April 30, 2024 Envelope # BQGGQMBBBHRBR 1 of 8 MOID HASSAN FLAT NO 1002, BLOCK 1 MOHINDER APARTMENTS, SECTOR 12 DWARKA DELHI 110078 DL INDIA MR_CE _BQGGQMBBB...

REIMBURSEMENT: 22 chunks
  Example: moid hassan RD17471298637220809 Ramaswamy KA04AA9727 Car May 13th 2025, 3:25 PM Booking History Customer Name Ride ID Driver name Vehicle Number Mode of Vehicle Time of Ride Selected Price ‚Çπ  1181 Clu...

SPECS: 585 chunks
  Example: Surface USB-C¬Æ Travel Hub All the connections, wherever you are 

In [20]:
# %% [markdown]
# ## 3A. Embeddings & FAISS Helper Functions
# 
# This cell:
# - Defines global embedding configuration
# - Initializes the HuggingFace embedding model
# - Defines functions to build and persist FAISS vector stores

# --- Embedding configuration ---
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small, fast, good quality
EMBED_CACHE_DIR = "embeddings"
INDEX_DIR = "faiss_indexes"

os.makedirs(EMBED_CACHE_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

# Initialize embeddings model
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL_NAME,
    cache_folder=EMBED_CACHE_DIR,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)

print(f"‚úÖ Embedding model '{EMBED_MODEL_NAME}' initialized.")


def build_faiss_index_for_category(category, chunks):
    """
    Builds a FAISS index for the given category from chunked text data.
    Saves the index to disk in INDEX_DIR/<category>/
    Returns the FAISS object.
    """
    if not chunks:
        print(f"[WARN] No chunks found for category '{category}'. Skipping index creation.")
        return None

    texts = [item["text"] for item in chunks]
    metadatas = [item["meta"] for item in chunks]

    try:
        db = FAISS.from_texts(texts=texts, embedding=embedding_model, metadatas=metadatas)
        save_path = os.path.join(INDEX_DIR, category)
        db.save_local(save_path)
        print(f"‚úÖ FAISS index for '{category}' saved to '{save_path}' ({len(texts)} vectors)")
        return db
    except Exception as e:
        print(f"[ERROR] Failed to build FAISS for '{category}': {e}")
        return None


def build_all_faiss_indexes(grouped_chunks_dict):
    """
    Builds FAISS indexes for all categories and returns a dictionary of FAISS stores.
    """
    faiss_indexes = {}
    for category, chunks in grouped_chunks_dict.items():
        print(f"\n[INFO] Building FAISS index for category: {category}")
        index = build_faiss_index_for_category(category, chunks)
        if index:
            faiss_indexes[category] = index
    print("\n‚úÖ All FAISS indexes built successfully.")
    return faiss_indexes


print("‚úÖ Cell 3A executed successfully ‚Äî embedding & FAISS helper functions ready.")


‚úÖ Embedding model 'sentence-transformers/all-MiniLM-L6-v2' initialized.
‚úÖ Cell 3A executed successfully ‚Äî embedding & FAISS helper functions ready.


In [8]:
grouped_chunks['CV']

[{'text': 'ABHISHEK  RANJAN                  +91  9040140733   \n Data  Scientist  |  AI  Engineer                     13eee079@gmail.com  \n \nABOUT  ME  \nData  Scientist  with  3.9   years  of  experience,  with  expertise  in  Machine  Learning,  Deep  Learning,  Generative  AI,  NLP,  \nLLMs,\n \nand\n \nRAG.\n \nSkilled\n \nin\n \nfine-tuning\n \nHugging\n \nFace\n \nmodels,\n \nbuilding\n \nAI-driven\n \nsolutions,\n \nand\n \ndeploying\n \nscalable\n \narchitectures\n \nusing\n \nLangChain\n \nand\n \nOpenAI\n \nAPI.\n \nPassionate\n \nabout\n \nAI\n \ninnovation\n \nand\n \nsolving\n \ncomplex\n \nchallenges\n \nwith\n \ndata-driven\n \nsolutions.\nSKILLS  \nProgramming  Languages:  \nPython,\n \nSQL\n Deep  Learning  &  AI  Architectures:  \nArtificial\n \nNeural\n \nNetworks\n \n(ANN),\n \nConvolutional\n \nNeural\n \nNetworks\n \n(CNN),\n \nRecurrent\n \nNeural\n \nNetworks\n \n(RNN),\n \nLong\n \nShort-Term\n \nMemory\n \n(LSTM),\n \nTransformers,\n \nLarge\n \nLanguage\n 

In [21]:
# %% [markdown]
# ## 3B. Generate Embeddings & Build FAISS Indexes
# 
# This cell:
# - Uses the `build_all_faiss_indexes()` function
# - Creates a FAISS index for each category
# - Displays summary information about the created indexes

# %%
faiss_indexes = build_all_faiss_indexes(grouped_chunks)

# --- Summary ---
print("\n--- FAISS Index Summary ---")
for category, index in faiss_indexes.items():
    print(f"{category}: {index.index.ntotal} vectors")

print("‚úÖ Cell 3B executed successfully ‚Äî FAISS indexes created and saved.")



[INFO] Building FAISS index for category: CV
‚úÖ FAISS index for 'CV' saved to 'faiss_indexes\CV' (139 vectors)

[INFO] Building FAISS index for category: FINANCIAL
‚úÖ FAISS index for 'FINANCIAL' saved to 'faiss_indexes\FINANCIAL' (117 vectors)

[INFO] Building FAISS index for category: REIMBURSEMENT
‚úÖ FAISS index for 'REIMBURSEMENT' saved to 'faiss_indexes\REIMBURSEMENT' (22 vectors)

[INFO] Building FAISS index for category: SPECS
‚úÖ FAISS index for 'SPECS' saved to 'faiss_indexes\SPECS' (585 vectors)

‚úÖ All FAISS indexes built successfully.

--- FAISS Index Summary ---
CV: 139 vectors
FINANCIAL: 117 vectors
REIMBURSEMENT: 22 vectors
SPECS: 585 vectors
‚úÖ Cell 3B executed successfully ‚Äî FAISS indexes created and saved.


In [22]:
faiss_indexes

{'CV': <langchain_community.vectorstores.faiss.FAISS at 0x14aa95ef910>,
 'FINANCIAL': <langchain_community.vectorstores.faiss.FAISS at 0x14aa95ece10>,
 'REIMBURSEMENT': <langchain_community.vectorstores.faiss.FAISS at 0x147c5f00810>,
 'SPECS': <langchain_community.vectorstores.faiss.FAISS at 0x146b485bb50>}

In [23]:
cv_index = faiss_indexes["CV"]
results = cv_index.similarity_search("Data Scientist at Microsoft", k=3)
#print(results)
for r in results:
    print(r.page_content[:200])
    print("=================")

Employment
Dun & Bradstreet Bengaluru
Data Scientist II Aug. 2021 to Current
-Responsible for delivering Data Engineering and Data Science workÔøΩows.
-Working on automating ML Solutions using Python, P
DATA SCIENTIST 2 |
KAGGLE 2X EXPERT
RAVINDER
KUMAR¬†
TANWAR
rtanwar616@gmail.com
https://www.kaggle.com
/ravijoe
+918847095358
https://www.linkedin.com
/in/ravi-tanwar-12bb3811a/
https://github.com/rav
Data Science Math Skills
 (06/2020 - 07/2020)
 
OÔ¨Äered by Duke University through Coursera 
Getting Started with AWS Machine Learning
 (10/2020 - 11/2020)
 
OÔ¨Äered by Amazon through Coursera 
Using Py


In [12]:
# %% [markdown]
# ## 4A. LLM Configuration & Loader Function
# Define all configuration parameters and helper functions
# to load a quantized Mistral-7B GGUF model using llama-cpp-python with CUDA acceleration.

# %%
import os
from langchain_community.llms import LlamaCpp

# --- Global configuration for the quantized model ---
MODEL_DIR = r"models"
MODEL_FILE = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"  # adjust if needed
#mistral-7b-instruct-v0.2.Q4_K_M.gguf
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
print(f"LLM MODEL_PATH - {MODEL_PATH}")

# Model parameters
N_GPU_LAYERS = -1       # use all layers on GPU
N_CTX = 8192+8192           # max context window
N_BATCH = 512
TEMPERATURE = 0.1
MAX_TOKENS = 512
VERBOSE = True

# --- Helper function to load the quantized model ---
def load_llm(model_path: str = MODEL_PATH):
    """
    Load the quantized Mistral-7B-Instruct model using llama-cpp-python with CUDA acceleration.
    """
    print(f"üöÄ Loading model from: {model_path}")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")

    llm = LlamaCpp(
        model_path=model_path,
        n_gpu_layers=N_GPU_LAYERS,
        n_ctx=N_CTX,
        n_batch=N_BATCH,
        f16_kv=True,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        verbose=VERBOSE,
    )

    print("‚úÖ Model loaded successfully with CUDA acceleration enabled.")
    return llm


LLM MODEL_PATH - models\mistral-7b-instruct-v0.2.Q4_K_M.gguf


In [13]:
# %% [markdown]
# ## 4B. Load the Quantized Model and Test
# Actually load the Mistral-7B GGUF model and run a simple test prompt
# to confirm it's working with CUDA.

# %%
# Load the LLM using the helper defined above
llm = load_llm(MODEL_PATH)


üöÄ Loading model from: models\mistral-7b-instruct-v0.2.Q4_K_M.gguf


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4060 Laptop GPU, compute capability 8.9, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4060 Laptop GPU) - 6766 MiB free
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from models\mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader:

‚úÖ Model loaded successfully with CUDA acceleration enabled.


CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
Model metadata: {'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'general.architecture': 'llama', 'llama.context_length': '32768', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '14336', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '15', 'llama.attention.head_count_kv': '8', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '1000000.000000', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{{ bos_token }}{% f

In [14]:
# --- Sanity check ---
test_prompt = "Where is Nice."
print("\nüß† Test Prompt:", test_prompt)

response = llm.invoke(test_prompt)
print("\nüí¨ LLM Response:\n", response)


üß† Test Prompt: Where is Nice.


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     488.03 ms /     5 tokens (   97.61 ms per token,    10.25 tokens per second)
llama_perf_context_print:        eval time =   11615.50 ms /   511 runs   (   22.73 ms per token,    43.99 tokens per second)
llama_perf_context_print:       total time =   14516.15 ms /   516 tokens
llama_perf_context_print:    graphs reused =        494



üí¨ LLM Response:
  Nice is a city located in the southeastern part of France, on the French Riviera. It is situated between the eastern end of the Baie des Anges (Bay of Angels) and the western tip of Cap de Nice (Nice Cape). The city center is about 12 kilometers (7 miles) west of the Italian border. Nice is the second-largest French city on the Mediterranean coast, after Marseille. It is also the fifth-most populous urban area in France, with a population of over 340,000 inhabitants in the metropolitan area. Nice is known for its beautiful beaches, crystal-clear waters, and mild Mediterranean climate. The city is also famous for its vibrant cultural scene, rich history, and stunning architecture. Some of the most popular tourist attractions in Nice include the Old Town (Vieille Ville), Colline du Ch√¢teau (Castle Hill), Promenade des Anglais (English Promenade), Mus√©e Matisse (Matisse Museum), and Mus√©e Marc Chagall (Chagall Museum). Nice is also home to many beautiful parks, ga

In [17]:
# %% [markdown]
# ## 5A. Retrieval-Augmented Generation (RAG) Setup
# Define helper functions that connect the FAISS vectorstores with the quantized Mistral LLM.
# Each category (CV, FINANCIAL, REIMBURSEMENT, SPECS) will have its own retriever.

# %%
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# %
number_of_retrievals = 5

# --- Prompt template for RAG responses ---
RAG_TEMPLATE = """You are an intelligent assistant that answers questions based on the provided context.
If the answer cannot be found in the context, say "The answer is not available in the provided documents."

Context:
{context}

Question:
{question}

Answer:"""

qa_prompt = PromptTemplate(template=RAG_TEMPLATE, input_variables=["context", "question"])

# --- Helper function to create RAG QA chain ---
def create_rag_chain(llm, vectorstore):
    """
    Create a RetrievalQA chain for a given FAISS vectorstore and the loaded LLM.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": number_of_retrievals})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
    )
    return qa_chain

# --- Build QA chains for all available categories ---
def build_all_rag_chains(llm, vectorstores_dict):
    """
    Create and store RAG QA chains for each category (CV, FINANCIAL, REIMBURSEMENT, SPECS).
    Returns a dictionary of QA chains keyed by category name.
    """
    rag_chains = {}
    for category, vs in vectorstores_dict.items():
        print(f"üîó Building RAG chain for: {category}")
        rag_chains[category] = create_rag_chain(llm, vs)
    print("‚úÖ All RAG chains initialized.")
    return rag_chains


In [15]:
# %% [markdown]
# ## 5A. Enhanced RAG Setup (Return Sources)
# Modified version of RAG setup to:
# 1. Return source documents and context
# 2. Use a stricter prompt to reduce hallucination

# %%
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# --- Stricter prompt template to reduce hallucination ---
RAG_TEMPLATE = """You are an intelligent assistant that answers questions based only on the provided context.
If the answer cannot be found in the context, respond with:
"The answer is not available in the provided documents."

Use only the facts from the context and do not add your own assumptions.

Context:
{context}

Question:
{question}

Answer (with factual reasoning):"""

RAG_TEMPLATE = """You are a precise and intelligent assistant that answers questions. Use ONLY the following retrieved context to answer the question. If you are not certain or answer is not in the provided Context, say "I don't know based on the provided context."


Context:
{context}

Question:
{question}

Answer (with factual reasoning):"""

qa_prompt = PromptTemplate(template=RAG_TEMPLATE, input_variables=["context", "question"])

# --- Helper function to create RAG QA chain with sources ---
def create_rag_chain_with_sources(llm, vectorstore):
    """
    Create a RetrievalQA chain that returns source documents and reduces hallucination.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,   # important for sources
    )
    return qa_chain

# --- Build all category-wise QA chains ---
def build_all_rag_chains_with_sources(llm, vectorstores_dict):
    """
    Build RAG chains for all categories that return both answers and source documents.
    """
    rag_chains = {}
    for category, vs in vectorstores_dict.items():
        print(f"üîó Building RAG chain with sources for: {category}")
        rag_chains[category] = create_rag_chain_with_sources(llm, vs)
    print("‚úÖ All RAG chains (with source return) initialized.")
    return rag_chains


In [16]:
# %% [markdown]
# ## 5B. Execute RAG Queries
# Use the quantized Mistral model + FAISS retrievers to answer questions from your local documents.

# %%
# Build all category-wise RAG pipelines
#rag_chains = build_all_rag_chains(llm, faiss_indexes)
rag_chains = build_all_rag_chains_with_sources(llm, faiss_indexes)



üîó Building RAG chain with sources for: CV
üîó Building RAG chain with sources for: FINANCIAL
üîó Building RAG chain with sources for: REIMBURSEMENT
üîó Building RAG chain with sources for: SPECS
‚úÖ All RAG chains (with source return) initialized.


In [24]:
# Example queries for testing
sample_queries = {
    "CV": ["What programming languages does Moid Hassan know?",
    "Where is Moid Hassan currently working?","Where has Moid Hassan worked in the past?"],
    "FINANCIAL": ["What is the total dividend income in August 2025?",
    "What was the Debit in DoubleTree by Hilton in Bangalore?"],
    "REIMBURSEMENT": ["What documents are required for travel reimbursement?"],
    "SPECS": ["What is the screen size of Surface Pro 9?", "Which GPU Surface Laptop Studio 2 has?"],
}

# --- Run all queries ---
for category, questions in sample_queries.items():
    print(f"\nüóÇÔ∏è Category: {category}")
    qa_chain = rag_chains[category]

    for i, question in enumerate(questions, start=1):
        print(f"\n‚ùì Q{i}: {question}")
        result = qa_chain.invoke({"query": question})
        print(f"üß† Answer:\n{result['result']}\n{'-'*80}")


üóÇÔ∏è Category: CV

‚ùì Q1: What programming languages does Moid Hassan know?


Llama.generate: 58 prefix-match hit, remaining 2734 prompt tokens to eval
llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =    2304.29 ms /  2734 tokens (    0.84 ms per token,  1186.48 tokens per second)
llama_perf_context_print:        eval time =     922.31 ms /    38 runs   (   24.27 ms per token,    41.20 tokens per second)
llama_perf_context_print:       total time =    3294.91 ms /  2772 tokens
llama_perf_context_print:    graphs reused =         36
Llama.generate: 58 prefix-match hit, remaining 2245 prompt tokens to eval


üß† Answer:

Moid Hassan is proficient in several programming languages, including Python, C, C++, SQL, Kotlin, Java, HTML, CSS, JavaScript, and TypeScript.
--------------------------------------------------------------------------------

‚ùì Q2: Where is Moid Hassan currently working?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =    1629.91 ms /  2245 tokens (    0.73 ms per token,  1377.38 tokens per second)
llama_perf_context_print:        eval time =     376.79 ms /    16 runs   (   23.55 ms per token,    42.46 tokens per second)
llama_perf_context_print:       total time =    2028.27 ms /  2261 tokens
llama_perf_context_print:    graphs reused =         14
Llama.generate: 1374 prefix-match hit, remaining 773 prompt tokens to eval


üß† Answer:

Moid Hassan is currently working as a Vice President at Morgan Stanley.
--------------------------------------------------------------------------------

‚ùì Q3: Where has Moid Hassan worked in the past?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     621.69 ms /   773 tokens (    0.80 ms per token,  1243.39 tokens per second)
llama_perf_context_print:        eval time =    1113.90 ms /    47 runs   (   23.70 ms per token,    42.19 tokens per second)
llama_perf_context_print:       total time =    1819.40 ms /   820 tokens
llama_perf_context_print:    graphs reused =         45
Llama.generate: 58 prefix-match hit, remaining 2110 prompt tokens to eval


üß† Answer:

Moid Hassan has worked for Argusoft India Ltd. in the past. This information can be deduced from the context provided, which mentions that Moid Hassan has previously worked for Argusoft India Ltd.
--------------------------------------------------------------------------------

üóÇÔ∏è Category: FINANCIAL

‚ùì Q1: What is the total dividend income in August 2025?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =    1525.63 ms /  2110 tokens (    0.72 ms per token,  1383.03 tokens per second)
llama_perf_context_print:        eval time =    2956.72 ms /   125 runs   (   23.65 ms per token,    42.28 tokens per second)
llama_perf_context_print:       total time =    4713.10 ms /  2235 tokens
llama_perf_context_print:    graphs reused =        120
Llama.generate: 58 prefix-match hit, remaining 1454 prompt tokens to eval


üß† Answer:

The table does not provide the total dividend income in August 2025. The table only shows the estimated monthly cash flow for various securities, including stocks, bonds, and mutual funds. The table also includes other relevant information, such as transaction costs, taxes withheld, and core fund activity. However, the table does not provide a comprehensive breakdown of dividend income by month for all securities held in an investment portfolio over a given period of time. Therefore, it is not possible to determine the total dividend income in August 2025 based on the information provided in the table.
--------------------------------------------------------------------------------

‚ùì Q2: What was the Debit in DoubleTree by Hilton in Bangalore?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =    1019.17 ms /  1454 tokens (    0.70 ms per token,  1426.65 tokens per second)
llama_perf_context_print:        eval time =     915.76 ms /    39 runs   (   23.48 ms per token,    42.59 tokens per second)
llama_perf_context_print:       total time =    2009.00 ms /  1493 tokens
llama_perf_context_print:    graphs reused =         37
Llama.generate: 58 prefix-match hit, remaining 1321 prompt tokens to eval


üß† Answer:

The provided context does not contain any information about a debit or debt in DoubleTree by Hilton in Bangalore. Therefore, I cannot provide an answer based on the given context.
--------------------------------------------------------------------------------

üóÇÔ∏è Category: REIMBURSEMENT

‚ùì Q1: What documents are required for travel reimbursement?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     922.69 ms /  1321 tokens (    0.70 ms per token,  1431.68 tokens per second)
llama_perf_context_print:        eval time =    2208.55 ms /    95 runs   (   23.25 ms per token,    43.01 tokens per second)
llama_perf_context_print:       total time =    3304.46 ms /  1416 tokens
llama_perf_context_print:    graphs reused =         91
Llama.generate: 58 prefix-match hit, remaining 769 prompt tokens to eval


üß† Answer:

Based on the context provided, there is no explicit mention of documents required for travel reimbursement. However, the context does mention that important traveler information regarding Check-in times, Insurance, Health & Vaccinations, USA entry requirement, Pricing & Taxes can be found on the Travel Itinerary. Therefore, it is recommended to refer to the Travel Itinerary for any specific document requirements related to travel reimbursement.
--------------------------------------------------------------------------------

üóÇÔ∏è Category: SPECS

‚ùì Q1: What is the screen size of Surface Pro 9?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     535.46 ms /   769 tokens (    0.70 ms per token,  1436.16 tokens per second)
llama_perf_context_print:        eval time =    1892.49 ms /    83 runs   (   22.80 ms per token,    43.86 tokens per second)
llama_perf_context_print:       total time =    2558.31 ms /   852 tokens
llama_perf_context_print:    graphs reused =         79
Llama.generate: 59 prefix-match hit, remaining 1321 prompt tokens to eval


üß† Answer:

The context provided does not mention the screen size of Surface Pro 9. However, it does provide the technical specifications for Surface Pro X, which includes its display screen size and resolution. Based on this information, I cannot definitively answer your question about the screen size of Surface Pro 9 based on the provided context alone.

I don't know based on the provided context.
--------------------------------------------------------------------------------

‚ùì Q2: Which GPU Surface Laptop Studio 2 has?


llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     935.78 ms /  1321 tokens (    0.71 ms per token,  1411.66 tokens per second)
llama_perf_context_print:        eval time =    1642.06 ms /    71 runs   (   23.13 ms per token,    43.24 tokens per second)
llama_perf_context_print:       total time =    2690.14 ms /  1392 tokens
llama_perf_context_print:    graphs reused =         68


üß† Answer:

Surface Laptop Studio 2 comes with NVIDIA¬Æ GeForce RTX‚Ñ¢ 3060 Laptop GPU. This GPU is built with the latest RT Cores, Tensor Cores, and streaming multiprocessors. Surface Laptop Studio 2 has double the graphics performance than Surface Studio 2.
--------------------------------------------------------------------------------


In [25]:
# Define multiple questions per category
sample_queries = {
    #"CV": 
    #[
    #    "Who has worked in both American Express and Microsoft?"
    #],
    #"FINANCIAL": 
    #[
    #    "What is the total dividend income in August 2025?",
    #    "What was the Debit in DoubleTree by Hilton in Bangalore?"
    #],
    #"REIMBURSEMENT": 
    #[
    #    "What documents are required for travel reimbursement?"
    #],
    "SPECS": 
    [
        "Suggest a laptop which comes with a dedicated GPU not intergrated GPU. Give me the name of the laptop with GPU name as well.",
        "What is the screen size of Surface Pro 9?",
        "What is the maximum RAM and storage for Surface Laptop 7 Copilot+PC Snapdragon?",
        "Which GPU Surface Laptop Studio 2 has?"
    ],
}


# --- Run all queries and show answers + sources ---
for category, questions in sample_queries.items():
    print(f"\nüóÇÔ∏è Category: {category}")
    qa_chain = rag_chains[category]

    for i, question in enumerate(questions, start=1):
        print(f"\n‚ùì Q{i}: {question}")
        result = qa_chain.invoke({"query": question})
        print(f"üß† Answer:\n{result['result']}\n")

        # Show source documents used for this answer
        print("üìÑ Sources:")
        for j, doc in enumerate(result.get("source_documents", []), start=1):
            print(f"   {j}. Source: {getattr(doc.metadata, 'source', 'Unknown')}")
            snippet = doc.page_content[:300].replace("\n", " ")
            print(f"      Snippet: {snippet}...")
        print("-" * 100)


üóÇÔ∏è Category: SPECS

‚ùì Q1: Suggest a laptop which comes with a dedicated GPU not intergrated GPU. Give me the name of the laptop with GPU name as well.


Llama.generate: 58 prefix-match hit, remaining 1136 prompt tokens to eval
llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =    1134.02 ms /  1136 tokens (    1.00 ms per token,  1001.75 tokens per second)
llama_perf_context_print:        eval time =    1078.44 ms /    47 runs   (   22.95 ms per token,    43.58 tokens per second)
llama_perf_context_print:       total time =    2285.40 ms /  1183 tokens
llama_perf_context_print:    graphs reused =         45
Llama.generate: 58 prefix-match hit, remaining 769 prompt tokens to eval


üß† Answer:

Based on the provided context, a laptop that comes with a dedicated GPU is the Surface Laptop 15‚Äù 7th Edition. The dedicated GPU for this laptop is the Qualcomm¬Æ Adreno‚Ñ¢ GPU.

üìÑ Sources:
   1. Source: Unknown
      Snippet: Processor 13th Gen Intel Core‚Ñ¢ i7-13700H Processor Built on the Intel Evo‚Ñ¢ platform Intel Gen3 Movidius 3700VC VPU AI Accelerator Graphics Graphics options: NVIDIAGeForce RTX‚Ñ¢ 4050 Laptop GPU with 6GB GDDR6 vRAM 2130 MHz boost clock speed, 80W maximum graphics power NVIDIA GeForce RTX‚Ñ¢ 4060 Laptop G...
   2. Source: Unknown
      Snippet: Surface Laptop 13.8‚Äù  7th Edition    Surface Laptop 15‚Äù  7th Edition  Processor        Snapdragon¬Æ X Plus   Snapdragon¬Æ X Elite    Snapdragon¬Æ X Elite  Neural Processing Unit (NPU) Qualcomm¬Æ Hexagon‚Ñ¢ with 45 TOPS  Graphics Qualcomm¬Æ Adreno‚Ñ¢ GPU  Memory and Storage1 Memory options:  16GB, 32GB, 64GB ...
   3. Source: Unknown
      Snippet: Surface Laptop 13.8‚Äù  7th Edition    Surface Lapt

llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     528.34 ms /   769 tokens (    0.69 ms per token,  1455.51 tokens per second)
llama_perf_context_print:        eval time =    1391.84 ms /    61 runs   (   22.82 ms per token,    43.83 tokens per second)
llama_perf_context_print:       total time =    2012.52 ms /   830 tokens
llama_perf_context_print:    graphs reused =         58
Llama.generate: 59 prefix-match hit, remaining 895 prompt tokens to eval


üß† Answer:

The context provided does not include the screen size of Surface Pro 9. However, it does mention that the Surface Pro 9 is compatible with Surface Pro 9. Since the context does not provide the answer to the question, I don't know based on the provided context.

üìÑ Sources:
   1. Source: Unknown
      Snippet: Surface Pro X technical specs Dimensions 11.3‚Äù x 8.2‚Äù x 0.28‚Äù (287 mm x 208 mm x 7.3 mm) Display Screen: 13‚Äù PixelSense‚Ñ¢ Display Resolution: 2880x1920 (267 PPI) Aspect ratio: 3:2 Touch: 10 point multi-touch Memory 8GB or 16GB LPDDR4x RAM Processor Microsoft SQ¬Æ 1 Microsoft SQ¬Æ 2 Security Firmware TP...
   2. Source: Unknown
      Snippet: Surface Pro Flex Keyboard  Compatibility33 Surface Pro (11th Edition)  Surface Pro 10 For Business  Surface Pro 9  Surface Pro 8  Size and Weight Length: 11.38 inches (289 mm)  Width: 8.71 inches (221 mm)  Height: 0.21 inches (5.25 mm)  Weight: 0.75 lbs (340 g)   Battery life  Up to 41 hours of cont...
   3. Source: Un

llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     588.13 ms /   895 tokens (    0.66 ms per token,  1521.76 tokens per second)
llama_perf_context_print:        eval time =    4721.26 ms /   209 runs   (   22.59 ms per token,    44.27 tokens per second)
llama_perf_context_print:       total time =    5710.85 ms /  1104 tokens
llama_perf_context_print:    graphs reused =        201
Llama.generate: 62 prefix-match hit, remaining 1318 prompt tokens to eval


üß† Answer:

Based on the provided context, the Surface Laptop 7 Copilot+PC Snapdragon has two different models: one with a 13.8-inch display and another with a 15-inch display.

The 13.8-inch model comes with up to 32GB LPDDR5x RAM and up to 1TB Gen 4 SSD storage.

On the other hand, the 15-inch model comes with up to 16GB or 32GB LPDDR5x RAM and up to 512GB Gen 4 SSD storage.

Therefore, the maximum RAM for Surface Laptop 7 Copilot+PC Snapdragon is 32GB for both models. The maximum storage for the 13.8-inch model is 1TB, while for the 15-inch model, it is 512GB.

üìÑ Sources:
   1. Source: Unknown
      Snippet: Surface Laptop 13.8‚Äù  7th Edition    Surface Laptop 15‚Äù  7th Edition  Processor        Snapdragon¬Æ X Plus   Snapdragon¬Æ X Elite    Snapdragon¬Æ X Elite  Neural Processing Unit (NPU) Qualcomm¬Æ Hexagon‚Ñ¢ with 45 TOPS  Graphics Qualcomm¬Æ Adreno‚Ñ¢ GPU  Memory and Storage1 Memory options:  16GB, 32GB, 64GB ...
   2. Source: Unknown
      Snippet: Surface Laptop 13.8‚Äù

llama_perf_context_print:        load time =     488.28 ms
llama_perf_context_print: prompt eval time =     932.03 ms /  1318 tokens (    0.71 ms per token,  1414.11 tokens per second)
llama_perf_context_print:        eval time =    1648.09 ms /    71 runs   (   23.21 ms per token,    43.08 tokens per second)
llama_perf_context_print:       total time =    2696.92 ms /  1389 tokens
llama_perf_context_print:    graphs reused =         68


üß† Answer:

Surface Laptop Studio 2 comes with NVIDIA¬Æ GeForce RTX‚Ñ¢ 3060 Laptop GPU. This GPU is built with the latest RT Cores, Tensor Cores, and streaming multiprocessors. Surface Laptop Studio 2 has double the graphics performance than Surface Studio 2.

üìÑ Sources:
   1. Source: Unknown
      Snippet: Surface Laptop Studio 2  Fact sheet | September 2023    Meet Surface Laptop Studio 2, a laptop like no other.   Surface Laptop Studio 2  brings together the versatility to create and the power to perform ,  combining cutting-edge design with incredible performance to power the most demanding apps.  ...
   2. Source: Unknown
      Snippet: Surface Studio 2+  Fact Sheet  October 2022    Meet Surface Studio 2+, Stand-out design, fluid productivity   Find fuel for inspiration with professional-grade performance on a sleek, versatile all-in-one device that  commands attention. Dive into brilliant color, blazing-fast graphics, and immersiv...
   3. Source: Unknown
      Snippet: Sign