# RAG Chatbot (Scales to ~1M Tokens of Source Text)

This notebook upgrades your simple employee-policy chatbot into a **retrieval-augmented generation (RAG)** system that can index and query a much larger dataset (e.g., ~1,000,000 tokens across many files).

## What changes vs your original notebook
- Uses a **persistent vector database (Chroma)** on disk.
- Adds an **ingestion/indexing pipeline** that can handle large corpora: load → chunk → embed → store (batched).
- Supports **incremental re-runs** (won't duplicate already-indexed chunks).
- Removes Google Colab-only bits (no `/content`, no `userdata`).

## Folder structure (recommended)
```
project/
  data/                 # put your PDFs/TXTs/MDs here
  chroma_db/            # created automatically
  ChatBot_RAG_1M.ipynb
```

> “1 million tokens” refers to the total text size across all documents after tokenization, not a single prompt.


In [None]:
# Install dependencies (run once)
%pip -q install -U langchain langchain-openai langchain-community langchain-chroma chromadb pypdf tiktoken


In [None]:
import os
from pathlib import Path

# ====== CONFIG ======
DATA_DIR = Path("data")            # folder scanned for source documents (PDF/TXT/MD)
CHROMA_DIR = Path("chroma_db")     # folder holding the persistent vector DB
COLLECTION_NAME = "employee_policy_kb"  # name of the Chroma collection used for the chunks

# Retrieval tuning
TOP_K = 8  # number of chunks retrieved per question
USE_MMR = True  # MMR increases diversity of retrieved chunks; often improves answers


In [None]:
# API key
# Set in your shell before starting VS Code:
#   export OPENAI_API_KEY="..."
# (Windows PowerShell):
#   setx OPENAI_API_KEY "..."  # then restart terminal

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing key slip through to the first
# API call.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("Missing OPENAI_API_KEY environment variable.")


## 1) Load and chunk documents

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_documents(data_dir: Path):
    """Load every supported file (PDF, TXT, Markdown) found under a folder.

    The folder is searched recursively. PDFs are read with ``PyPDFLoader``;
    ``.txt`` and ``.md`` files are read as UTF-8 text with ``TextLoader``.

    Args:
        data_dir: Root folder to scan (a ``Path`` or a path string).

    Returns:
        A list with the documents produced by the loaders: PDFs first, then
        ``.txt`` files, then ``.md`` files.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        raise FileNotFoundError(f"DATA_DIR not found: {data_dir.resolve()}")

    # One (glob, loader class, loader kwargs) entry per supported file type.
    # Each extension gets its own explicit glob: a character-class pattern
    # such as "*.[tm][xd]" only matches two-letter extensions, so it picks up
    # ".md" but silently skips every ".txt" file.
    sources = (
        ("**/*.pdf", PyPDFLoader, {}),
        ("**/*.txt", TextLoader, {"encoding": "utf-8"}),
        ("**/*.md", TextLoader, {"encoding": "utf-8"}),
    )

    docs = []
    for pattern, loader_cls, loader_kwargs in sources:
        loader = DirectoryLoader(
            str(data_dir),
            glob=pattern,
            loader_cls=loader_cls,
            loader_kwargs=loader_kwargs,
            show_progress=True,
        )
        docs.extend(loader.load())

    return docs

# Splitter applied to every loaded document: chunks of at most 1000 length
# units, with 200 units shared between consecutive chunks. No custom length
# function is passed, so the library default applies (presumably character
# count - confirm against the installed langchain-text-splitters version).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)


In [None]:
# Load everything under DATA_DIR, then cut the documents into chunks.
docs = load_documents(DATA_DIR)
print(f"Loaded {len(docs)} document pages/files")

splits = splitter.split_documents(docs)
print(f"Created {len(splits)} chunks")

# Preview the first chunk as a sanity check. Guarded so that an empty data
# folder produces a readable message instead of an IndexError.
if splits:
    print(splits[0].metadata)
    print(splits[0].page_content[:300])
else:
    print(f"No chunks produced - add PDF/TXT/MD files under {DATA_DIR.resolve()}")


## 2) (Optional) Estimate token size

In [None]:
import tiktoken

def estimate_tokens(texts, model_encoding="cl100k_base"):
    """Return the total number of tokens across ``texts``.

    Args:
        texts: Iterable of strings to tokenize.
        model_encoding: Name of the tiktoken encoding to count with.

    Returns:
        The sum of the token counts of all strings (0 for an empty iterable).
    """
    enc = tiktoken.get_encoding(model_encoding)
    # disallowed_special=() makes text that happens to contain a special-token
    # marker (e.g. "<|endoftext|>") encode as ordinary text. By default
    # tiktoken raises ValueError on such input, which would abort the whole
    # estimate because of a single document.
    return sum(len(enc.encode(t, disallowed_special=())) for t in texts)

# Count tokens over the text of every chunk and report a rough word estimate
# (0.75 words per token).
chunk_texts = [chunk.page_content for chunk in splits]
total_tokens_est = estimate_tokens(chunk_texts)
approx_words = int(total_tokens_est * 0.75)
print(f"Estimated tokens across chunks: {total_tokens_est:,}")
print(f"Approx words (very rough): {approx_words:,}")


## 3) Build / Update the Vector Database (persistent, incremental)

In [None]:
import hashlib
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Embedding client handed to the Chroma vector store below. It is constructed
# with the library defaults (no model name is passed here).
embeddings = OpenAIEmbeddings()

def chunk_id(doc) -> str:
    """Return a stable SHA-1 hex digest identifying a chunk.

    The digest covers the chunk's ``source`` and ``page`` metadata (each
    falling back to an empty string when absent) and its text, joined with
    ``|``, so the same chunk maps to the same id on every ingestion run.
    """
    meta = doc.metadata
    fields = (
        str(meta.get("source", "")),
        str(meta.get("page", "")),
        doc.page_content,
    )
    return hashlib.sha1("|".join(fields).encode("utf-8")).hexdigest()

def upsert_documents(vectorstore: Chroma, documents, batch_size=128):
    """Add documents to the vector store in batches, skipping known chunks.

    Every document gets a deterministic id from ``chunk_id``. Ids that are
    already stored in ``vectorstore`` are skipped, and so are repeats of an id
    within ``documents`` itself, so the same chunk is never added twice.

    Each document that is added has its ``metadata`` replaced by a copy that
    also carries the id under the ``"chunk_id"`` key.

    Args:
        vectorstore: Store to query for existing ids and to write new chunks to.
        documents: Chunks to index; each needs ``metadata`` and ``page_content``.
        batch_size: Number of chunks sent per ``add_documents`` call (>= 1).

    Returns:
        The number of chunks newly added.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A zero step makes range() fail and a negative one would silently
        # add nothing while still reporting chunks as added.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    ids = [chunk_id(d) for d in documents]

    # Look up existing ids in slices so each query to the store stays bounded.
    lookup_size = 2000
    existing = set()
    for start in range(0, len(ids), lookup_size):
        got = vectorstore.get(ids=ids[start:start + lookup_size], include=[])
        existing.update(got.get("ids", []))

    to_add_docs, to_add_ids = [], []
    queued = set()
    for d, id_ in zip(documents, ids):
        # Identical text repeated on the same source/page hashes to the same
        # id. Queue it only once: Chroma rejects a batch with repeated ids.
        if id_ in existing or id_ in queued:
            continue
        queued.add(id_)
        d.metadata = dict(d.metadata)
        d.metadata["chunk_id"] = id_
        to_add_docs.append(d)
        to_add_ids.append(id_)

    print(f"Already indexed: {len(existing):,}")
    print(f"New chunks to add: {len(to_add_docs):,}")

    for i in range(0, len(to_add_docs), batch_size):
        vectorstore.add_documents(to_add_docs[i:i+batch_size], ids=to_add_ids[i:i+batch_size])
        # Report progress on every tenth batch to keep the output short.
        if (i // batch_size) % 10 == 0:
            print(f"...added {min(i+batch_size, len(to_add_docs)):,}/{len(to_add_docs):,}")

    # langchain_chroma.Chroma has no persist() method (data is written to
    # persist_directory automatically); only older wrappers expose one, so
    # call it only when it exists instead of failing with AttributeError.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    return len(to_add_docs)

# Create the on-disk folder up front, then open (or create) the collection in it.
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

chroma_settings = {
    "collection_name": COLLECTION_NAME,
    "embedding_function": embeddings,
    "persist_directory": str(CHROMA_DIR),
}
vectorstore = Chroma(**chroma_settings)

# Index the chunks produced earlier; only chunks not yet stored are added.
added = upsert_documents(vectorstore, splits, batch_size=128)
print(f"Vector DB ready. Added {added:,} new chunks.")


## 4) Retrieval + Chat (RAG)

In [None]:
from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Chat model that writes the final answer (temperature 0).
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Prompt: the system message restricts answers to the supplied context; the
# human message carries the retrieved context followed by the question.
SYSTEM_INSTRUCTIONS = (
    "You are a helpful assistant. Answer ONLY using the provided context. "
    "If the answer is not in the context, say you don't know. "
    "Be concise and, when possible, cite the source filenames/pages."
)
HUMAN_TEMPLATE = "Context:\n{context}\n\nQuestion:\n{question}"
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_INSTRUCTIONS),
    ("human", HUMAN_TEMPLATE),
])

# Build the retriever options once, then create the retriever from them.
# With MMR enabled, fetch_k is at least 20 and at least three times TOP_K.
if USE_MMR:
    retriever_options = {
        "search_type": "mmr",
        "search_kwargs": {"k": TOP_K, "fetch_k": max(20, TOP_K*3)},
    }
else:
    retriever_options = {"search_kwargs": {"k": TOP_K}}
retriever = vectorstore.as_retriever(**retriever_options)

def format_context(docs):
    """Render retrieved chunks as one context string for the prompt.

    Each chunk becomes a ``[Source: <file name> p.<page>]`` header followed by
    the chunk text on the next line. The page part is left out when the chunk
    has no ``page`` metadata. Chunks are separated by a blank line.
    """
    def render(doc):
        meta = doc.metadata
        label = Path(meta.get("source", "")).name
        page = meta.get("page")
        if page is not None:
            label += f" p.{page}"
        return f"[Source: {label}]\n{doc.page_content}"

    return "\n\n".join(render(doc) for doc in docs)

def ask_question(question: str):
    """Answer ``question`` using chunks retrieved from the vector store.

    The retrieved chunks are formatted into a context string, which is sent
    to the chat model together with the question.

    Returns:
        A ``(answer_text, retrieved_docs)`` tuple.
    """
    retrieved = retriever.invoke(question)
    messages = prompt.format_messages(
        context=format_context(retrieved),
        question=question,
    )
    reply = llm.invoke(messages)
    return reply.content, retrieved


In [None]:
# Smoke test: ask one question and list the first five retrieved sources.
answer, sources = ask_question("What is the purpose of the UK workplace employment policy?")
print(answer)
print("\n--- Retrieved sources ---")
for hit in sources[:5]:
    meta = hit.metadata
    print(Path(meta.get("source", "")).name, "page:", meta.get("page"))


## 5) Simple CLI loop

In [None]:
# Interactive chat loop: type "exit" or "quit" (or send EOF / interrupt) to stop.
while True:
    try:
        q = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt ends the session without a traceback.
        print()
        break
    if q.lower() in {"exit", "quit"}:
        break
    if not q:
        # Nothing was asked; don't spend a retrieval + LLM call on a blank line.
        continue
    answer, _ = ask_question(q)
    print("\nBot:", answer)
