# 02 — Pipeline RAG
Demostración del pipeline completo: chunking → embeddings → ChromaDB → retrieval.

In [None]:
# Notebook bootstrap: put the parent directory on sys.path and load its .env file.
import sys
from pathlib import Path
# ROOT is the absolute path of the parent of the current working directory.
ROOT = Path("..").resolve()
# Prepend ROOT to the import path; later cells import from `src.*`
# (presumably a package located under ROOT — confirm against the repo layout).
sys.path.insert(0, str(ROOT))
from dotenv import load_dotenv
# Load environment variables from ROOT/.env.
load_dotenv(ROOT / ".env")
print("Setup OK")

In [None]:
# Load the papers and split them into chunks with both strategies, then
# report how many chunks each strategy produced and inspect the first one.
from src.ingestion import load_papers
from src.chunking import chunk_papers

papers = load_papers(verbose=False)
chunks_small = chunk_papers(papers, strategy="small")
chunks_large = chunk_papers(papers, strategy="large")
print(f"Strategy 'small' (256 tok): {len(chunks_small)} chunks")
print(f"Strategy 'large' (1024 tok): {len(chunks_large)} chunks")

# Show the first "small" chunk as a sanity check.
sample = chunks_small[0]
# The leading blank line must be written as the "\n" escape: a raw line break
# inside a single-quoted f-string is a SyntaxError.
print(f"\nSample chunk: {sample.chunk_id}")
print(f"Tokens: {sample.token_count}")
# !r shows the preview with escapes visible (newlines, quotes).
print(f"Text preview: {sample.text[:300]!r}")

In [None]:
# Histogram of per-chunk token counts for the "small" chunking strategy.
import matplotlib.pyplot as plt

token_counts = [chunk.token_count for chunk in chunks_small]

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(token_counts, bins=30, color="steelblue", edgecolor="white")
# Title and axis labels set in a single call.
ax.set(
    title="Distribución de tamaños de chunks (strategy=small, 256 tok)",
    xlabel="Tokens por chunk",
    ylabel="Frecuencia",
)
fig.tight_layout()
plt.show()

In [None]:
# Build the index for the "small" strategy (force_rebuild is left off) and
# report how many entries the returned collection holds.
from src.vectorstore import build_index

print("Building index (this may take a few minutes)...")
collection = build_index(strategy="small", force_rebuild=False)
indexed_total = collection.count()
print(f"Collection count: {indexed_total} chunks")

In [None]:
# Run a sample query against the "small" index and print the top 5 hits.
from src.retrieval import search

query = "¿Cómo afecta la extorsión a las pequeñas empresas en América Latina?"
# The trailing blank line must be written as the "\n" escape: a raw line break
# inside a single-quoted f-string is a SyntaxError.
print(f"Query: {query}\n")

results = search(query, top_k=5, strategy="small")
for i, r in enumerate(results, 1):
    # Rank, score to three decimals, and the title truncated to 70 characters.
    print(f"{i}. [{r.score:.3f}] {r.title[:70]}")
    # Only the first two authors are listed.
    print(f"   Authors: {', '.join(r.authors[:2])} ({r.year})")
    # First 150 characters of the chunk text, stripped and shown via repr.
    print(f"   Preview: {r.text[:150].strip()!r}")
    print()