<a href="https://colab.research.google.com/github/jm7n7/week-5-adv-rag/blob/main/ADV_RAG_Hands_On.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Track A, B, C: Reranking & Context Optimization | Multimodal RAG | Evaluation & Guardrails**

## 1. Install & Setup
*   Install
    - numpy
    - pandas
    - matplotlib
    - sentence-transformers
    - faiss
    - langchain
    - openai
*   Log environment to env_rag_adv.json
## 2. Load Your Project Materials
*   Use the same documents from week 4 (optional add new documents)
    - PDFs (research papers, survey articles, datasets)
    - Text/Markdown notes
*   Include 2-3 images / charts for Track B
## 3. Retrieval Upgrades (Track A)
*   Implement RRF (BM25 + dense)
    - Add reranker + compression
*   Log
    - Recall@k
    - latency
    - avg context length
    - token cost
## 4. Multimodal Retrieval (Track B)
*   Caption / encode images with CLIP/BLIP2/Gemini-Vision
*   Show at least _one image-only query_ retrieving a relevant chart with citations
## 5. Evaluation & Guardrails (Track C)
*   Build eval_queries.jsonl
*   Compute
    - correctness/faithfulness
    - latency before guardrails
    - latency after guardrails
*   Include at least _one adversarial/unsafe/PII query_ to test guardrails
## 6. Ablation Study
*   Fill ablation_results.csv:
    - Baseline
    - +Rerank
    - +Compression
    - +Multimodal
    - +Guardrails
*   Plot recall versus latency using matplotlib
## 7. Reproducibility log
*   Save configs in rag_adv_run_config.json
    - embedding models
    - reranker
    - chunking
    - multimodal pipeline
    - guardrails
    - retriever (k)

## Step 1

In [None]:
# Install the third-party packages this notebook imports (LangChain and its
# community integrations, Chroma, sentence-transformers, transformers, pypdf).
# `%pip` is an IPython magic, so this line only runs inside a notebook kernel.
%pip install langchain chromadb sentence-transformers transformers langchain-community pypdf

In [None]:
# import packages
import sys
import platform
import transformers
import sentence_transformers
import chromadb
import json
import os
# Probe PyTorch/CUDA for the environment log. This is best-effort: when torch
# is not installed or the CUDA query fails, placeholder values are logged
# instead of aborting the notebook.
try:
    import torch
    torch_v = torch.__version__
    cuda_ok = torch.cuda.is_available()
    device_name = torch.cuda.get_device_name(0) if cuda_ok else "CPU"
except Exception:
    # Deliberately not a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate when the user interrupts the cell.
    torch_v, cuda_ok, device_name = "N/A", False, "CPU"

In [None]:
# Snapshot of the interpreter, platform, accelerator and library versions
env_info = dict(
    python=sys.version,
    platform=platform.platform(),
    torch=torch_v,
    cuda=cuda_ok,
    device=device_name,
    transformers=transformers.__version__,
    sentence_transformers=sentence_transformers.__version__,
    chromadb=chromadb.__version__,
)

# Destination: env_rag.json inside the Drive project folder
output_dir = '/content/drive/MyDrive/Capstone/Week 4_RAG'
file_path = os.path.join(output_dir, "env_rag.json")

# Create the log file's parent folder if it is missing
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Start from whatever was logged previously; a missing or unparsable file
# counts as an empty log
try:
    with open(file_path, 'r') as f:
        existing_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    existing_data = {}

# Overlay the fresh snapshot on the previous entries and write it back
existing_data.update(env_info)
with open(file_path, 'w') as f:
    json.dump(existing_data, f, indent=4)

print(f"Environment information saved to {file_path}")

## Step 2

In [None]:
# Imported from langchain_community, the package the other integrations in
# this notebook (Chroma, embeddings, HuggingFacePipeline) already come from.
from langchain_community.document_loaders import PyPDFLoader

# Folder on Google Drive that holds the source PDFs
pdf_dir = '/content/drive/MyDrive/Capstone/Week 4_RAG'

# PDF files to load, relative to pdf_dir
pdf_files = ["maia-2.pdf", "Amortized_chess.pdf", "chessgpt.pdf"]

# Load every PDF and collect what each loader returns into one flat list.
# A dedicated name (pdf_path) is used for the per-file path so the global
# `file_path`, which the JSON-logging cells rely on, is not overwritten here.
documents = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_dir, pdf_file)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

print(f"Loaded {len(documents)} documents.")

## Step 3

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters; they are also written to the run config later on
chunk_size, chunk_overlap = 500, 100

# Build the splitter and cut the loaded documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and show the first one as a sanity check
print(f"Created {len(chunks)} chunks.")
if chunks:
    print("\nFirst chunk:", chunks[0].page_content, sep="\n")

In [None]:
# Record the chunking parameters in rag_run_config.json
file_path = os.path.join(output_dir, "rag_run_config.json")

# Create the config file's parent folder if it is missing
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Reuse the stored configuration when there is one; a missing or unparsable
# file is treated as an empty configuration
try:
    with open(file_path, 'r') as f:
        existing_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    existing_data = {}

# Add (or overwrite) the two chunking keys, leaving every other entry as is
existing_data.update(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Write the merged configuration back to disk
with open(file_path, 'w') as f:
    json.dump(existing_data, f, indent=4)

print(f"Chunk parameters saved to {file_path}")

## Step 4

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Embedding model used for the baseline index
embedding_model_name = "all-MiniLM-L6-v2"
embedding_function = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Number of chunks the retriever returns per query. Defined once so the value
# used for retrieval and the value written to the run config cannot drift.
retriever_k = 4

# Create the Chroma vector database, persisted next to the other outputs.
# NOTE(review): re-running this cell against an existing persist_directory may
# add the same chunks a second time — confirm, and clear the folder if so.
db_dir = os.path.join(output_dir, "chroma_db")
vectorstore = Chroma.from_documents(chunks, embedding_function, persist_directory=db_dir)

# Create a retriever from the vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_k})

# Verify retrieval with a sample query
sample_query = "What is the main idea of the Maia-2 paper?"
docs = retriever.invoke(sample_query)

print(f"\nSample Query: {sample_query}")
print(f"\nRetrieved {len(docs)} documents:")
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content)

# Save the embedding model and retriever k value to rag_run_config.json
file_path = os.path.join(output_dir, "rag_run_config.json")

# Load the existing configuration if there is one
existing_data = {}
if os.path.exists(file_path):
    try:
        with open(file_path, 'r') as f:
            existing_data = json.load(f)
    except json.JSONDecodeError:
        existing_data = {}  # Empty or invalid JSON: start over

# Merge in the retrieval settings actually used above
existing_data.update({
    "embedding_model": embedding_model_name,
    "retriever_k": retriever_k
})

# Save the updated data to the file
with open(file_path, 'w') as f:
    json.dump(existing_data, f, indent=4)

print(f"\nConfiguration updated in {file_path}")

## Step 5

In [None]:
# Imported from langchain_community, matching the HuggingFacePipeline import
# used for the local model in Step 6.
from langchain_community.llms import HuggingFaceHub
import os
from google.colab import userdata

# Model to use (e.g., TinyLlama or distilgpt2); pick one that fits within
# your computational resources
model_id = "distilgpt2"
task = "text-generation"  # task name passed to the LLM wrapper below

# Read the Hugging Face API token from Colab secrets.
# The secret must be stored under the name 'HF_TOKEN'.
huggingface_api_token = userdata.get("HF_TOKEN")

# Initialize the hosted Hugging Face LLM wrapper
llm = HuggingFaceHub(
    repo_id=model_id,
    task=task,
    huggingfacehub_api_token=huggingface_api_token,
)

print(f"Connected to Hugging Face model: {model_id}")

# Note: You might need to install the 'huggingface_hub' library if not already installed
# %pip install huggingface_hub

## Step 6

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import pipeline

# Local generation setup (same model id and task as the Step 5 cell)
model_id, task = "distilgpt2", "text-generation"

# Build the transformers pipeline, then wrap it as a LangChain LLM
pipe = pipeline(task=task, model=model_id)
llm_pipeline = HuggingFacePipeline(pipeline=pipe)

print(f"Initialized LLM using HuggingFacePipeline with model: {model_id}")

# 'llm_pipeline' is the LLM object handed to the RetrievalQA chains below.

In [None]:
from langchain.chains import RetrievalQA

# Baseline RetrievalQA chain built from the Step 4 retriever and the local
# pipeline LLM. "stuff" is the chain type used throughout this notebook;
# "map_reduce", "refine" and "map_rerank" are the other options.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm_pipeline,
    chain_type="stuff",
    return_source_documents=True,  # keep the retrieved chunks in the result
)

# Three domain-specific questions
query_1, query_2, query_3 = (
    "What is the main idea of the Maia-2 paper?",    # same as the Step 4 sample query
    "What is the conclusion of the Maia-2 paper?",   # related to the first question
    "What models were used in the chessGPT paper?",  # a different paper altogether
)

In [None]:
# Ask the baseline chain the first question
result = qa_chain.invoke(query_1)

print(f"Query: {query_1}")
print(f"\nAnswer: {result['result']}")

# List the retrieved chunks when the chain returned them
if 'source_documents' in result:
    print("\nSource Documents:")
    for position, source_doc in enumerate(result['source_documents'], start=1):
        print(f"\nDocument {position}:")
        # Preview only: the first 200 characters of the chunk
        print(f"Content: {source_doc.page_content[:200]}...")
        print(f"Source: {source_doc.metadata.get('source')}")

In [None]:
# Ask the baseline chain the second question
result = qa_chain.invoke(query_2)

print(f"Query: {query_2}")
print(f"\nAnswer: {result['result']}")

# List the retrieved chunks when the chain returned them
if 'source_documents' in result:
    print("\nSource Documents:")
    for position, source_doc in enumerate(result['source_documents'], start=1):
        print(f"\nDocument {position}:")
        # Preview only: the first 200 characters of the chunk
        print(f"Content: {source_doc.page_content[:200]}...")
        print(f"Source: {source_doc.metadata.get('source')}")

In [None]:
# Ask the baseline chain the third question
result = qa_chain.invoke(query_3)

print(f"Query: {query_3}")
print(f"\nAnswer: {result['result']}")

# List the retrieved chunks when the chain returned them
if 'source_documents' in result:
    print("\nSource Documents:")
    for position, source_doc in enumerate(result['source_documents'], start=1):
        print(f"\nDocument {position}:")
        # Preview only: the first 200 characters of the chunk
        print(f"Content: {source_doc.page_content[:200]}...")
        print(f"Source: {source_doc.metadata.get('source')}")

## Step 7

In [None]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Embedding-swap experiment: same chunks, different embedding model.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " /
# "passage: "; no prefixes are added here — confirm whether that matters for
# this comparison.
embedding_model_name_e5 = "intfloat/e5-small-v2"
embedding_function_e5 = SentenceTransformerEmbeddings(model_name=embedding_model_name_e5)

# Separate Chroma database for this model. The model id contains a '/', which
# os.path.join would treat as a path separator and turn into an unintended
# nested folder ("chroma_db_intfloat/e5_small_v2"), so it is replaced as well.
db_dir_e5 = os.path.join(
    output_dir,
    f"chroma_db_{embedding_model_name_e5.replace('/', '_').replace('-', '_')}",
)
vectorstore_e5 = Chroma.from_documents(chunks, embedding_function_e5, persist_directory=db_dir_e5)

# Retriever over the new index, same k as the baseline
retriever_e5 = vectorstore_e5.as_retriever(search_kwargs={"k": 4})

# RetrievalQA chain that differs from the baseline only in its retriever
qa_chain_e5 = RetrievalQA.from_chain_type(
    llm=llm_pipeline,  # same LLM pipeline as the baseline chain
    chain_type="stuff",
    retriever=retriever_e5,
    return_source_documents=True
)

In [None]:
# The same three domain-specific queries as the baseline run, so the answers
# can be compared across embedding models
query_1 = "What is the main idea of the Maia-2 paper?"
query_2 = "What is the conclusion of the Maia-2 paper?"
query_3 = "What models were used in the chessGPT paper?"

queries = [query_1, query_2, query_3]
results_e5 = {}

# Run every query through the e5 chain, printing and recording the output
print(f"\n--- Results with {embedding_model_name_e5} embeddings ---")
for i, query in enumerate(queries):
    print(f"\nQuery: {query}")
    result_e5 = qa_chain_e5.invoke(query)
    print(f"\nAnswer: {result_e5['result']}")

    # Look the sources up once so the stored record and the printed listing
    # agree; a result without sources yields an empty list instead of raising
    # KeyError before the presence check below is reached.
    source_docs = result_e5.get('source_documents', [])

    results_e5[f"query_{i+1}"] = {
        "query": query,
        "answer": result_e5['result'],
        "source_documents": [
            {"content": doc.page_content, "source": doc.metadata.get('source')}
            for doc in source_docs
        ]
    }

    if 'source_documents' in result_e5:
        print("\nSource Documents:")
        for j, doc in enumerate(source_docs):
            print(f"\nDocument {j+1}:")
            print(f"Content: {doc.page_content[:200]}...")  # first 200 characters only
            print(f"Source: {doc.metadata.get('source')}")

In [None]:
# Append the e5 experiment to rag_run_config.json
file_path = os.path.join(output_dir, "rag_run_config.json")

# Reuse the stored configuration when there is one; a missing or unparsable
# file is treated as an empty configuration
try:
    with open(file_path, 'r') as f:
        existing_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    existing_data = {}

# One entry per embedding experiment, keyed by the model name with '-'
# replaced by '_'
experiment_key = f"embedding_experiment_{embedding_model_name_e5.replace('-', '_')}"
existing_data[experiment_key] = dict(
    embedding_model=embedding_model_name_e5,
    retriever_k=4,
    results=results_e5,
)

# Write the merged configuration back to disk
with open(file_path, 'w') as f:
    json.dump(existing_data, f, indent=4)

print(f"\nConfiguration updated with {embedding_model_name_e5} results in {file_path}")

In [None]:
# Chunk-sensitivity experiment: different chunking, baseline embedding model
new_chunk_size, new_chunk_overlap = 300, 50

print(f"Performing Chunk Sensitivity Experiment with chunk_size={new_chunk_size} and chunk_overlap={new_chunk_overlap}")

# Re-split the Step 2 documents with the new parameters
text_splitter_new = RecursiveCharacterTextSplitter(chunk_size=new_chunk_size, chunk_overlap=new_chunk_overlap)
chunks_new = text_splitter_new.split_documents(documents)

print(f"Created {len(chunks_new)} new chunks.")

# Same embedding model name as in Step 4, instantiated again for this run
embedding_model_name_original = "all-MiniLM-L6-v2"
embedding_function_original = SentenceTransformerEmbeddings(model_name=embedding_model_name_original)

# Separate Chroma database, named after the chunking parameters
db_dir_new_chunks = os.path.join(output_dir, f"chroma_db_chunk_{new_chunk_size}_{new_chunk_overlap}")
vectorstore_new_chunks = Chroma.from_documents(
    chunks_new,
    embedding_function_original,
    persist_directory=db_dir_new_chunks,
)

# Retriever over the re-chunked index, same k as before
retriever_new_chunks = vectorstore_new_chunks.as_retriever(search_kwargs=dict(k=4))

# RetrievalQA chain that differs from the baseline only in its retriever
qa_chain_new_chunks = RetrievalQA.from_chain_type(
    retriever=retriever_new_chunks,
    llm=llm_pipeline,  # the Step 6 LLM pipeline
    chain_type="stuff",
    return_source_documents=True,
)


In [None]:
# The same three domain-specific queries as in Step 6, so the answers can be
# compared across chunking settings
query_1 = "What is the main idea of the Maia-2 paper?"
query_2 = "What is the conclusion of the Maia-2 paper?"
query_3 = "What models were used in the chessGPT paper?"

queries = [query_1, query_2, query_3]
results_new_chunks = {}

# Run every query through the re-chunked chain, printing and recording the output
print(f"\n--- Results with chunk_size={new_chunk_size}, chunk_overlap={new_chunk_overlap} ---")
for i, query in enumerate(queries):
    print(f"\nQuery: {query}")
    result_new_chunks = qa_chain_new_chunks.invoke(query)
    print(f"\nAnswer: {result_new_chunks['result']}")

    # Look the sources up once so the stored record and the printed listing
    # agree; a result without sources yields an empty list instead of raising
    # KeyError before the presence check below is reached.
    source_docs = result_new_chunks.get('source_documents', [])

    results_new_chunks[f"query_{i+1}"] = {
        "query": query,
        "answer": result_new_chunks['result'],
        "source_documents": [
            {"content": doc.page_content, "source": doc.metadata.get('source')}
            for doc in source_docs
        ]
    }

    if 'source_documents' in result_new_chunks:
        print("\nSource Documents:")
        for j, doc in enumerate(source_docs):
            print(f"\nDocument {j+1}:")
            print(f"Content: {doc.page_content[:200]}...")  # first 200 characters only
            print(f"Source: {doc.metadata.get('source')}")



In [None]:
# Append the chunk-sensitivity experiment to rag_run_config.json
file_path = os.path.join(output_dir, "rag_run_config.json")

# Reuse the stored configuration when there is one; a missing or unparsable
# file is treated as an empty configuration
try:
    with open(file_path, 'r') as f:
        existing_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    existing_data = {}

# One entry per chunking experiment, keyed by chunk size and overlap
experiment_key = f"chunk_experiment_{new_chunk_size}_{new_chunk_overlap}"
existing_data[experiment_key] = dict(
    chunk_size=new_chunk_size,
    chunk_overlap=new_chunk_overlap,
    embedding_model=embedding_model_name_original,
    retriever_k=4,
    results=results_new_chunks,
)

# Write the merged configuration back to disk
with open(file_path, 'w') as f:
    json.dump(existing_data, f, indent=4)

print(f"\nConfiguration updated with chunk_size={new_chunk_size}, chunk_overlap={new_chunk_overlap} results in {file_path}")

## Step 8
*Skipped*

## Step 9

In [None]:
# Read back the accumulated run configuration; fall back to an empty dict
# when the file is absent or is not valid JSON
file_path = os.path.join(output_dir, "rag_run_config.json")
try:
    with open(file_path, 'r') as f:
        config_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    print(f"Error loading data from {file_path}")
    config_data = {}

print("--- Experiment Summary ---")

# Embedding swap: summarize the first entry whose key has the experiment prefix
embedding_experiment_key = next(
    (name for name in config_data if name.startswith("embedding_experiment_")),
    None,
)

if embedding_experiment_key is None:
    print("\nEmbedding Swap Experiment data not found in rag_run_config.json")
else:
    embedding_experiment_data = config_data[embedding_experiment_key]
    # The baseline model is stored at the top level of the config
    original_embedding_model = config_data.get("embedding_model", "N/A")
    print("\nEmbedding Swap Experiment:")
    print(f"  Original Embedding Model: {original_embedding_model}")
    print(f"  Compared Against: {embedding_experiment_data.get('embedding_model', 'N/A')}")
    print("  Review the 'results' section in rag_run_config.json for detailed output.")


# Chunk sensitivity: summarize the first entry whose key has the experiment prefix
chunk_experiment_key = next(
    (name for name in config_data if name.startswith("chunk_experiment_")),
    None,
)

if chunk_experiment_key is None:
    print("\nChunk Sensitivity Experiment data not found in rag_run_config.json")
else:
    chunk_experiment_data = config_data[chunk_experiment_key]
    # The baseline chunk settings are stored at the top level of the config
    original_chunk_size = config_data.get("chunk_size", "N/A")
    original_chunk_overlap = config_data.get("chunk_overlap", "N/A")
    print("\nChunk Sensitivity Experiment:")
    print(f"  Original Chunk Settings: chunk_size={original_chunk_size}, chunk_overlap={original_chunk_overlap}")
    print(f"  Compared Against: chunk_size={chunk_experiment_data.get('chunk_size', 'N/A')}, chunk_overlap={chunk_experiment_data.get('chunk_overlap', 'N/A')}")
    print("  Review the 'results' section in rag_run_config.json for detailed output.")

print("\n--- End of Summary ---")