# RAG Demo Notebook (LlamaIndex + Chroma + OpenRouter)

Use this notebook to experiment with chunking, filters, and retrieval strategies without rerunning a script.

**Prereqs**
- `pip install -r requirements-rag.txt`
- Add `.env` with `OPENROUTER_API_KEY=sk-or-...`
- Data sources: `./data/*.pdf` (optional) + the web pages listed in `RAG_URLS`.

**Tips**
- Toggle `force_rebuild` to rebuild the Chroma index.
- Adjust `chunk_size`, `chunk_overlap`, the `vector_top_k_*` values, and `reranker_model` to compare quality/latency.

In [2]:
import os
from pathlib import Path
from typing import Iterable, List, Tuple

import chromadb
from dotenv import load_dotenv
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.schema import Document
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openrouter import OpenRouter
from llama_index.readers.web import SimpleWebPageReader
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.chroma import ChromaVectorStore

# Pull API keys and other secrets from a local .env file into the environment
load_dotenv()

# Notebooks have no __file__, so every path is anchored at the working directory
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("data")
PERSIST_DIR = BASE_DIR.joinpath("storage", "chroma")
COLLECTION_NAME = "rag_demo"

# Stop right here when the key is absent instead of failing on the first
# OpenRouter call later in the notebook
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")


In [3]:
# --- Chunking / ingestion knobs ---------------------------------------------
chunk_size = 512
chunk_overlap = 20
use_pdfs = True
# True drops the existing Chroma collection before indexing; False reuses
# whatever the collection already holds
force_rebuild = True

# --- Retrieval knobs --------------------------------------------------------
vector_top_k_baseline = 4
vector_top_k_high = 8
vector_top_k_rerank = 10
# Reranker checkpoint; bge-reranker-v2-m3 is the multilingual alternative
reranker_model = "BAAI/bge-reranker-base"

# --- Generation: model id as listed on OpenRouter ---------------------------
llm_model = "openai/gpt-4o-mini"

# --- Web pages to ingest ----------------------------------------------------
# The first seven pages share one prefix, so only their tails are spelled out.
_RAG_GUIDE_ROOT = "https://developers.llamaindex.ai/python/framework/understanding/rag/"
_RAG_GUIDE_PAGES = (
    "",
    "indexing/",
    "loading/",
    "loading/llamacloud/",
    "loading/llamahub/",
    "querying/",
    "storing/",
)
RAG_URLS = [_RAG_GUIDE_ROOT + page for page in _RAG_GUIDE_PAGES] + [
    "https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top",
]

# Default question, including an instruction not to invent code
QUESTION = " ".join(
    (
        "how to call openrouter in llamaindex? please give me a code example.",
        "If no code snippet is present in the provided docs, say 'No code snippet found in docs'",
        "and do not fabricate code.",
    )
)



In [None]:
def configure_settings():
    """Install the notebook's chunking, embedding, and LLM choices on ``Settings``.

    Reads the module-level ``chunk_size``, ``chunk_overlap``, ``llm_model`` and
    ``api_key`` and assigns them to the global ``Settings`` object.
    """
    Settings.chunk_size, Settings.chunk_overlap = chunk_size, chunk_overlap

    embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.embed_model = embedder

    chat_model = OpenRouter(model=llm_model, api_key=api_key)
    Settings.llm = chat_model


def load_web_documents(urls: List[str]) -> List[Document]:
    """Download web pages as text and tag each document with a ``source_url``.

    Args:
        urls: Pages to fetch with ``SimpleWebPageReader(html_to_text=True)``.

    Returns:
        The documents produced by the reader. When a document carries a
        ``url`` (or, failing that, ``source``) metadata entry, that value is
        copied to ``metadata["source_url"]``.
    """
    docs = SimpleWebPageReader(html_to_text=True).load_data(urls)
    for doc in docs:
        source = doc.metadata.get("url") or doc.metadata.get("source")
        # Leave the key unset rather than storing None: every consumer in this
        # notebook reads it with .get(), and a None value is neither a usable
        # citation nor a usable filter target.
        # NOTE(review): some vector stores reject None metadata values -
        # confirm for the Chroma version in use.
        if source:
            doc.metadata["source_url"] = source
    return docs


def load_pdf_documents(data_dir: Path) -> List[Document]:
    """Load every PDF found under *data_dir* (recursively), if there are any.

    Args:
        data_dir: Folder to scan for ``*.pdf`` files.

    Returns:
        The documents read from the PDFs, each with ``metadata["source_url"]``
        set to its ``file_path`` metadata (or the literal ``"pdf"`` when that
        entry is missing). An empty list when *data_dir* is not a directory
        or contains no ``.pdf`` file.
    """
    # PDFs are optional: a missing folder, a path that is not a folder, and a
    # folder without any .pdf all mean "nothing to load". Checking up front
    # keeps a directory with no matching files away from the reader.
    # NOTE(review): SimpleDirectoryReader is expected to raise when no file
    # matches required_exts - confirm against the installed version.
    if not data_dir.is_dir() or next(data_dir.rglob("*.pdf"), None) is None:
        return []
    reader = SimpleDirectoryReader(
        input_dir=str(data_dir), recursive=True, required_exts=[".pdf"]
    )
    pdf_docs = reader.load_data()
    for doc in pdf_docs:
        doc.metadata["source_url"] = str(doc.metadata.get("file_path", "pdf"))
    return pdf_docs


def load_all_documents(include_pdf: bool = True) -> List[Document]:
    """Gather the pages in ``RAG_URLS`` and, optionally, the PDFs under ``DATA_DIR``.

    Args:
        include_pdf: When true, the PDF documents are appended after the web ones.

    Returns:
        One list holding the web documents first, then any PDF documents.
    """
    combined: List[Document] = list(load_web_documents(RAG_URLS))
    if not include_pdf:
        return combined
    combined += load_pdf_documents(DATA_DIR)
    return combined


def init_vector_store(force_rebuild: bool = False) -> Tuple[ChromaVectorStore, chromadb.api.models.Collection.Collection]:
    """Open (or create) the persistent Chroma collection used by this notebook.

    Args:
        force_rebuild: When true, try to delete the ``COLLECTION_NAME``
            collection first so that it is recreated.

    Returns:
        A ``(vector_store, collection)`` pair: the ``ChromaVectorStore``
        wrapping the collection, and the collection itself.
    """
    import logging

    PERSIST_DIR.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(PERSIST_DIR))
    if force_rebuild:
        try:
            client.delete_collection(name=COLLECTION_NAME)
        except Exception as exc:
            # The drop stays best-effort (there may be nothing to delete yet),
            # but the reason is recorded instead of being discarded so a real
            # failure does not go unnoticed.
            # NOTE(review): the exception type raised for a missing collection
            # is not pinned down here, hence the broad catch.
            logging.getLogger(__name__).info(
                "Could not drop Chroma collection %r before rebuild "
                "(it may not exist yet): %s",
                COLLECTION_NAME,
                exc,
            )
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME, metadata={"description": "LlamaIndex RAG demo"}
    )
    vector_store = ChromaVectorStore(chroma_collection=collection)
    return vector_store, collection


def build_or_load_index(documents: List[Document], force_rebuild: bool = False) -> VectorStoreIndex:
    """Return an index over the Chroma collection, indexing *documents* if needed.

    Args:
        documents: Documents to index when the collection is not reused.
        force_rebuild: Passed to ``init_vector_store``; when true the stored
            collection is never reused.

    Returns:
        An index loaded from the existing collection when ``force_rebuild`` is
        false and the collection is non-empty, otherwise one built from
        *documents*.

    Raises:
        RuntimeError: If indexing is required but *documents* is empty.
    """
    store, chroma_collection = init_vector_store(force_rebuild=force_rebuild)

    reuse_stored_vectors = not force_rebuild and chroma_collection.count() > 0
    if reuse_stored_vectors:
        return VectorStoreIndex.from_vector_store(store)

    if documents:
        context = StorageContext.from_defaults(vector_store=store)
        return VectorStoreIndex.from_documents(
            documents, storage_context=context, show_progress=True
        )
    raise RuntimeError("No documents to index. Provide at least one source.")


def append_citations(text: str, sources: Iterable[str]) -> str:
    """Return *text* followed by a ``[Source: ...]`` suffix listing unique sources.

    Args:
        text: The generated answer.
        sources: Source identifiers in retrieval order. Falsy entries
            (``None``, ``""``) are ignored.

    Returns:
        *text* unchanged when no usable source remains; otherwise
        ``"<text> [Source: a, b]"`` with duplicates removed and first-seen
        order preserved.
    """
    # dict keys keep insertion order, so this is an order-preserving dedupe in
    # a single pass instead of rescanning a list for every source.
    unique_sources = list(dict.fromkeys(src for src in sources if src))
    if not unique_sources:
        return text
    return f"{text} [Source: {', '.join(unique_sources)}]"


def _node_source(node):
    """Return the node's ``source_url`` (or ``url``) metadata value, else ``None``."""
    return node.metadata.get("source_url") or node.metadata.get("url")


def _format_score(score) -> str:
    """Render a score with three decimals, or ``n/a`` when the node has none."""
    return "n/a" if score is None else f"{score:.3f}"


def run_query_with_citation(query_engine, question: str, label: str) -> None:
    """Query the engine with a grounding instruction and print the cited answer.

    Prints, in order: the answer with its deduplicated sources appended, one
    line per source node (score and source), and the text of each source node.

    Args:
        query_engine: Object whose ``query(str)`` returns a response exposing
            ``response`` (the answer text) and ``source_nodes``.
        question: The user question; a grounding instruction is appended to it.
        label: Heading printed above the answer.
    """
    # Extra instruction appended to every question to discourage made-up code
    guarded_question = (
        question
        + "\nOnly use provided context. If no code or answer is present, reply 'No code snippet found in docs' and do not make up code."
    )
    response = query_engine.query(guarded_question)
    nodes = response.source_nodes

    # Nodes without a known source contribute nothing to the citation suffix
    cited = append_citations(response.response, (_node_source(n) for n in nodes))
    print(f"\n=== {label} ===")
    print(cited)

    print("\nRetrieved sources:")
    for idx, node in enumerate(nodes, start=1):
        src = _node_source(node) or "unknown"
        # A node may carry no score; formatting None with :.3f would raise
        print(f"- {idx}. score={_format_score(node.score)} source={src}")

    # Dump the text of every source node attached to the response
    print("\n--- Retrieved context fed to LLM ---")
    for idx, node in enumerate(nodes, start=1):
        src = _node_source(node) or "unknown"
        content = (node.get_content() or "").strip()
        print(f"[{idx}] source={src} score={_format_score(node.score)}")
        print(content)
        print("-" * 40)


def demonstrate_metadata_filter(index: VectorStoreIndex):
    """Retrieve from one source only by filtering on ``source_url`` metadata.

    The filter targets the first entry of ``RAG_URLS``; the top three hits are
    printed with their source, score, and a 160-character snippet.
    """
    target_url = RAG_URLS[0]
    url_filter = ExactMatchFilter(key="source_url", value=target_url)
    scoped_retriever = index.as_retriever(
        similarity_top_k=3,
        filters=MetadataFilters(filters=[url_filter]),
    )
    hits = scoped_retriever.retrieve("RAG system core value and main challenges?")

    print("\n=== Filtered retrieval (source_url constrained) ===")
    for rank, hit in enumerate(hits, start=1):
        origin = hit.metadata.get("source_url") or hit.metadata.get("url") or "unknown"
        # Flatten newlines so the preview stays on a single output line
        flattened = (hit.get_content() or "").replace("\n", " ")
        snippet = flattened[:160]
        print(f"- {rank}. source={origin}, score={hit.score:.3f}")
        print(f"  snippet: {snippet}...")


def create_hybrid_retriever(index: VectorStoreIndex) -> QueryFusionRetriever:
    """Build a retriever that fuses vector search with BM25 keyword search.

    Args:
        index: Index supplying the vector retriever and, through its docstore,
            the nodes handed to BM25.

    Returns:
        A ``QueryFusionRetriever`` over both retrievers (top 5 each, top 4
        after fusion, no query rewriting, synchronous).

    Raises:
        ValueError: If the index's docstore holds no node with non-blank text.
    """
    vector_retriever = index.as_retriever(similarity_top_k=5)

    # BM25 works on raw text, so nodes with empty or whitespace-only content
    # are dropped before they reach it.
    valid_nodes = [
        node
        for node in index.docstore.docs.values()
        if (node.get_content() or "").strip()
    ]
    if not valid_nodes:
        # NOTE(review): the docstore may be empty when node text lives only in
        # an external vector store - confirm for the Chroma-backed index here.
        raise ValueError(
            "No text nodes found in index.docstore; cannot build the BM25 "
            "retriever for hybrid search."
        )

    bm25_retriever = BM25Retriever(nodes=valid_nodes, similarity_top_k=5)
    fusion_retriever = QueryFusionRetriever(
        retrievers=[vector_retriever, bm25_retriever],
        similarity_top_k=4,
        num_queries=1,
        use_async=False,
    )
    return fusion_retriever


def create_reranker() -> SentenceTransformerRerank:
    """Build the reranker named by the module-level ``reranker_model`` (``top_n=4``)."""
    reranker_options = {"model": reranker_model, "top_n": 4}
    return SentenceTransformerRerank(**reranker_options)


def compare_retrieval_strategies(index: VectorStoreIndex, question: str):
    """Answer *question* with three retrieval setups and print each result.

    The setups are: vector retrieval with ``vector_top_k_baseline``, vector
    retrieval with ``vector_top_k_high``, and vector retrieval with
    ``vector_top_k_rerank`` followed by the reranker from ``create_reranker``.

    Args:
        index: Index to query.
        question: Question passed to every strategy.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("COMPARISON: Different Retrieval Strategies")
    print(rule)

    # The progress lines interpolate the configured top_k values so they stay
    # accurate when the hyperparameters are changed.
    print(f"\n[1/3] Baseline: Vector retrieval (top_k={vector_top_k_baseline})...")
    vector_engine = index.as_query_engine(similarity_top_k=vector_top_k_baseline)
    run_query_with_citation(vector_engine, question, "Strategy 1: Baseline Vector")

    print(f"\n[2/3] Higher recall: Vector retrieval (top_k={vector_top_k_high})...")
    vector_engine_high = index.as_query_engine(similarity_top_k=vector_top_k_high)
    run_query_with_citation(vector_engine_high, question, "Strategy 2: Higher Recall")

    print("\n[3/3] Advanced: Vector + BGE Reranker...")
    reranker = create_reranker()
    # Retrieve a wider candidate set, then let the reranker post-process it
    vector_retriever = index.as_retriever(similarity_top_k=vector_top_k_rerank)
    rerank_engine = RetrieverQueryEngine.from_args(
        vector_retriever, node_postprocessors=[reranker]
    )
    run_query_with_citation(rerank_engine, question, "Strategy 3: Vector + Reranker")

    print("\n" + rule)
    print("Comparison complete! Review answers and sources above.")
    print(rule)



In [5]:
# Apply the chunking, embedding, and LLM choices to the global Settings, then
# echo the chunking values that were used.
configure_settings()
print("Settings configured. chunk_size=", chunk_size, "chunk_overlap=", chunk_overlap)



Settings configured. chunk_size= 512 chunk_overlap= 20


In [6]:
# Load the web pages (and the PDFs when use_pdfs is true) and report how many
# documents came back.
docs = load_all_documents(include_pdf=use_pdfs)
print(f"Loaded {len(docs)} documents")


Loaded 47 documents


In [7]:
# Build the index from the loaded documents, or reuse the stored collection
# when force_rebuild is false and it already holds data.
index = build_or_load_index(docs, force_rebuild=force_rebuild)
# NOTE(review): the count is read through underscore-prefixed attributes
# (_vector_store, _collection), i.e. not through a public interface.
print("Index ready. Collection size:", index._vector_store._collection.count())


2025-12-11 21:05:21,281 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Parsing nodes: 100%|██████████| 47/47 [00:01<00:00, 40.43it/s]
Generating embeddings: 100%|██████████| 411/411 [00:20<00:00, 19.85it/s]


Index ready. Collection size: 411


In [8]:
# Show retrieval restricted to a single source URL via a metadata filter
demonstrate_metadata_filter(index)



=== Filtered retrieval (source_url constrained) ===
- 1. source=https://developers.llamaindex.ai/python/framework/understanding/rag/, score=0.585
  snippet: * Querying            * [ Querying ](/python/framework/understanding/rag/querying/)         * Storing            * [ Storing ](/python/framework/understanding/r...
- 2. source=https://developers.llamaindex.ai/python/framework/understanding/rag/, score=0.573
  snippet: [install LlamaIndex](/python/framework/getting_started/installation) and complete the [starter tutorial](/python/framework/getting_started/starter_example) befo...
- 3. source=https://developers.llamaindex.ai/python/framework/understanding/rag/, score=0.551
  snippet: * Observability          * [ Observability ](/python/framework/module_guides/observability/)         * Callbacks            * [ Callbacks ](/python/framework/mo...


In [9]:
# Run the default question through all three retrieval strategies
compare_retrieval_strategies(index, QUESTION)



COMPARISON: Different Retrieval Strategies

[1/3] Baseline: Vector retrieval (top_k=4)...


2025-12-11 21:05:48,235 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



=== Strategy 1: Baseline Vector ===
To call OpenRouter in LlamaIndex, you can use the following code example:

```python
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage

llm = OpenRouter(
    api_key="<your-api-key>",
    max_tokens=256,
    context_window=4096,
    model="gryphe/mythomax-l2-13b",
)

message = ChatMessage(role="user", content="Tell me a joke")
resp = llm.chat([message])
print(resp)
```

This snippet demonstrates how to set up the OpenRouter and make a chat call. [Source: https://developers.llamaindex.ai/python/framework/understanding/rag/loading/llamahub/, https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top]

Retrieved sources:
- 1. score=0.619 source=https://developers.llamaindex.ai/python/framework/understanding/rag/loading/llamahub/
- 2. score=0.616 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 3. score=0.615 source=https://developers.llamaindex.ai/python/examples/ll

2025-12-11 21:05:52,607 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



=== Strategy 2: Higher Recall ===
```python
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage

llm = OpenRouter(
    api_key="<your-api-key>",
    max_tokens=256,
    context_window=4096,
    model="gryphe/mythomax-l2-13b",
)

message = ChatMessage(role="user", content="Tell me a joke")
resp = llm.chat([message])
print(resp)
``` [Source: https://developers.llamaindex.ai/python/framework/understanding/rag/loading/llamahub/, https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top]

Retrieved sources:
- 1. score=0.619 source=https://developers.llamaindex.ai/python/framework/understanding/rag/loading/llamahub/
- 2. score=0.616 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 3. score=0.615 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 4. score=0.609 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 5. score=0.605 source=https://developers

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
2025-12-11 21:06:05,507 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



=== Strategy 3: Vector + Reranker ===
To call OpenRouter in LlamaIndex, you can use the following code example:

```python
from llama_index.llms.openrouter import OpenRouter
from llama_index.core.llms import ChatMessage

llm = OpenRouter(
    api_key="<your-api-key>",
    max_tokens=256,
    context_window=4096,
    model="gryphe/mythomax-l2-13b",
)

message = ChatMessage(role="user", content="Tell me a joke")
resp = llm.chat([message])
print(resp)
```

For streaming, you can use:

```python
message = ChatMessage(role="user", content="Tell me a story in 250 words")
resp = llm.stream_chat([message])
for r in resp:
    print(r.delta, end="")
``` [Source: https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top]

Retrieved sources:
- 1. score=0.991 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 2. score=0.764 source=https://developers.llamaindex.ai/python/examples/llm/openrouter/#_top
- 3. score=0.693 source=https://developers.llamaindex.ai/py