In [1]:
# dnAI Javier Castro 20/09/25
# ----------------------------------------------------------------
# Advanced RAG EVALs sentence window, auto merging, reranking. Contex precision/recall answer relevance and groundesness=faithfulness
# ----------------------------------------------------------------
import os, sys, json, logging, warnings, subprocess, time
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# 0) Bootstrap compatible deps
def _ensure(pkgs):
    to_install=[]
    for mod, spec in pkgs:
        try: __import__(mod)
        except Exception: to_install.append(spec)
    if to_install:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + to_install)

_ensure([
    ("llama_index", "llama-index>=0.10.0"),
    ("llama_index.llms.openai", "llama-index-llms-openai>=0.2.0"),
    ("llama_index.embeddings.openai", "llama-index-embeddings-openai>=0.2.0"),
    ("ragas", "ragas>=0.1.9"),
    ("pandas", "pandas>=2.0.0"),
    ("tabulate", "tabulate>=0.9.0"),
    ("openai", "openai>=1.40.0"),
])

try:
    import cohere
    _COHERE_AVAILABLE = True
except Exception:
    _COHERE_AVAILABLE = False

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.WARNING, force=True)
for _name in ("openai", "httpx", "httpcore", "ragas", "llama_index"):
    logging.getLogger(_name).setLevel(logging.WARNING)

# Keep RAGAS quiet & robust
os.environ.setdefault("RAGAS_DISABLE_ANALYTICS", "1")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# 1) Imports
import pandas as pd
from tabulate import tabulate
from openai import OpenAI as OpenAIClient

from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI as LlamaOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.retrievers import AutoMergingRetriever

from ragas import evaluate
import ragas.metrics as RM
from ragas.metrics import faithfulness as FAITH
try:
    from ragas.run_config import RunConfig
except Exception:
    RunConfig = None
from ragas.llms import llm_factory

# --- Markdown/HTML helpers (work in notebooks; graceful fallback in terminals) ---
_IS_NOTEBOOK = False
try:
    from IPython.display import display, Markdown
    _IS_NOTEBOOK = True
except Exception:
    pass

def md_line(text: str, small: bool = False, bold: bool = False):
    """Print a single markdown line; in notebooks render markdown, else print raw."""
    s = text
    if small:
        s = f"<small>{s}</small>"
    if bold:
        s = f"**{s}**"
    if _IS_NOTEBOOK:
        try:
            display(Markdown(s))
            return
        except Exception:
            pass
    print(s)

def md_table(markdown_table_str: str, title: Optional[str] = None, small: bool = False, bold: bool = False):
    """Render a full markdown table string nicely; in terminals falls back to print."""
    if title:
        md_line(title, bold=True)
    s = markdown_table_str
    if small:
        s = f"<small>\n{s}\n</small>"
    if bold:
        s = f"**\n{s}\n**"
    if _IS_NOTEBOOK:
        try:
            display(Markdown(s))
            return
        except Exception:
            pass
    print(s)

# 2) Config
CONFIG = {
    "data_path": "./data",
    "create_sample_if_empty": True,
    "eval_questions": [
        "What concrete steps does the document recommend for finding projects to work on?",
        "How should a newcomer prioritise learning versus shipping projects, according to the text?",
        "What pitfalls does the text warn about when building a portfolio?",
        "Summarise the recommended networking tactics."
    ],
    # If empty, we'll auto-generate short references from the corpus (needed for recall/precision)
    "eval_references": [],
    "embedding": {"model": "text-embedding-3-small"},
    "llm": {"model": "gpt-4o-mini", "temperature": 0.0},
    "experiments": [
        {"name": "baseline-vector",        "retriever": "vector",          "top_k": 6},
        {"name": "sentence-window",        "retriever": "sentence_window", "window_size": 3, "top_k": 6},
        {"name": "auto-merging",           "retriever": "auto_merging",    "top_k": 6},
        {"name": "sentence-window+rerank", "retriever": "sentence_window", "window_size": 3, "top_k": 50,
         "reranker": {"type": "cohere", "top_n": 8, "model": "rerank-english-v3.0"}},
        {"name": "auto-merging+rerank",    "retriever": "auto_merging",    "top_k": 50,
         "reranker": {"type": "cohere", "top_n": 8, "model": "rerank-english-v3.0"}},
    ],
    "ragas": {"llm_model": "gpt-4o-mini", "embed_model": "text-embedding-3-small"},
    "output_dir": "./outputs",
    "print_context_chars": 0,
}

# 3) Builders & utilities
def _ensure_dir(p: str): Path(p).mkdir(parents=True, exist_ok=True)

def _maybe_create_sample(data_path: str):
    d = Path(data_path); d.mkdir(parents=True, exist_ok=True)
    if not any(d.rglob("*")):
        (d / "sample.txt").write_text(
            "This is a tiny sample document about career building in AI. "
            "Ship small projects, learn in public, and network kindly. "
            "Avoid vague portfolios; show real, runnable things. "
            "Concrete steps: pick a small problem, scope it to a weekend, build an MVP, and share a demo write-up."
        )

def _load_corpus(data_path: str):
    docs = SimpleDirectoryReader(input_dir=data_path, recursive=True).load_data()
    if not docs:
        raise RuntimeError(f"No documents found under: {data_path}")
    return docs

def _build_embedder(cfg): return OpenAIEmbedding(model=cfg.get("model", "text-embedding-3-small"))
def _build_llm(cfg): return LlamaOpenAI(model=cfg.get("model", "gpt-4o-mini"), temperature=cfg.get("temperature", 0.0))

class OpenAIEmbeddingsAdapter:
    """Minimal adapter providing the methods RAGAS expects (embed_query/documents)."""
    def __init__(self, client: OpenAIClient, model: str):
        self.client = client; self.model = model
    def embed_query(self, text: str):
        return self.client.embeddings.create(model=self.model, input=text).data[0].embedding
    def embed_documents(self, texts: List[str]):
        return [self.embed_query(t) for t in texts]

def _build_vector_retriever(docs, embed_model, top_k):
    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
    return index.as_retriever(similarity_top_k=top_k), index

def _build_sentence_window_retriever(docs, embed_model, window_size, top_k):
    parser = SentenceWindowNodeParser.from_defaults(window_size=window_size)
    nodes = parser.get_nodes_from_documents(docs)
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    return index.as_retriever(similarity_top_k=top_k), index

def _build_auto_merging_retriever(docs, embed_model, top_k):
    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
    base = index.as_retriever(similarity_top_k=top_k)
    am = AutoMergingRetriever(base, index.storage_context)
    return am, index

def _cosine(u: List[float], v: List[float]) -> float:
    dot = sum(a*b for a,b in zip(u,v))
    nu = (sum(a*a for a in u)) ** 0.5 or 1.0
    nv = (sum(b*b for b in v)) ** 0.5 or 1.0
    return dot/(nu*nv)

def _cohere_rerank(question: str, texts: List[str], top_n: int, model: str) -> List[int]:
    api_key = os.environ.get("COHERE_API_KEY")
    if not (_COHERE_AVAILABLE and api_key):
        raise RuntimeError("Cohere not available")
    c = cohere.Client(api_key)
    res = c.rerank(model=model, query=question, documents=texts, top_n=min(top_n, len(texts)))
    return [r.index for r in res.results]

def _emb_rerank(question: str, texts: List[str], top_n: int, emb_adapter) -> List[int]:
    qv = emb_adapter.embed_query(question)
    dvs = emb_adapter.embed_documents(texts)
    order = sorted(range(len(texts)), key=lambda i: _cosine(qv, dvs[i]), reverse=True)
    return order[:min(top_n, len(order))]

def _llm_batch_rerank(question: str, texts: List[str], top_n: int, client: OpenAIClient, model="gpt-4o-mini") -> List[int]:
    if not texts: return []
    prompt = (
        "You are a reranker. Given a QUESTION and numbered PASSAGES, return JSON {\"top_indices\":[...]} "
        f"for the top {min(top_n, len(texts))} most relevant in descending order.\n\n"
        f"QUESTION:\n{question}\n\nPASSAGES:\n" +
        "\n".join([f"[{i}] {t[:1500]}" for i,t in enumerate(texts)])
    )
    out = client.chat.completions.create(model=model, messages=[{"role":"user","content":prompt}], temperature=0)
    txt = out.choices[0].message.content.strip()
    try:
        data = json.loads(txt)
        idxs = [int(i) for i in data.get("top_indices", []) if 0 <= int(i) < len(texts)]
        if idxs: return idxs[:min(top_n, len(idxs))]
    except Exception:
        pass
    return _emb_rerank(question, texts, top_n, OpenAIEmbeddingsAdapter(client, "text-embedding-3-small"))

def _apply_rerank(question: str, nodes, rer_cfg: Optional[Dict[str, Any]], client: OpenAIClient):
    if not rer_cfg or not nodes: return nodes
    texts=[]
    for sn in nodes:
        n = getattr(sn, "node", None) or sn
        t = getattr(n, "text", None) or getattr(n, "get_text", lambda: None)()
        texts.append(t or "")
    top_n = int(rer_cfg.get("top_n", 8))
    rtype  = (rer_cfg.get("type") or "llm").lower()
    try:
        if rtype == "cohere":
            idxs = _cohere_rerank(question, texts, top_n, rer_cfg.get("model", "rerank-english-v3.0"))
        else:
            idxs = _llm_batch_rerank(question, texts, top_n, client)
    except Exception:
        print("[WARN] COHERE_API_KEY not set or 'cohere' not installed → falling back to LLM/embedding reranker.")
        sys.stdout.flush()
        idxs = _llm_batch_rerank(question, texts, top_n, client)
    return [nodes[i] for i in idxs]

def _answer_extractive(question: str, contexts: List[str], client: OpenAIClient, model="gpt-4o-mini") -> Tuple[str, List[str]]:
    sys_msg = {"role": "system", "content": (
        "You are an extractive assistant. Answer ONLY using the provided snippets. "
        "Quote exact lines (with [#idx]) that support each claim. If insufficient info, say you don't know."
    )}
    user_msg = {"role": "user", "content": (
        f"QUESTION:\n{question}\n\n"
        "SNIPPETS (numbered):\n" + "\n".join([f"[{i}] {c}" for i,c in enumerate(contexts)]) +
        "\n\nReturn:\n- 1–3 sentence answer.\n- Bullet list of quoted evidence with [#idx].\n"
    )}
    out = client.chat.completions.create(model=model, messages=[sys_msg, user_msg], temperature=0)
    return out.choices[0].message.content.strip(), contexts

def _generate_references_from_corpus(docs, questions: List[str], client: OpenAIClient, model="gpt-4o-mini", max_chars=16000) -> List[str]:
    corpus = "\n\n".join(getattr(d, "text", "") for d in docs)[:max_chars]
    refs=[]
    for q in questions:
        out = client.chat.completions.create(
            model=model,
            messages=[
                {"role":"system","content":"Answer concisely (<=100 words) using ONLY the provided corpus."},
                {"role":"user","content":f"CORPUS:\n{corpus}\n\nQUESTION:\n{q}\n\nReturn a concise, factual reference answer grounded only in the corpus."}
            ],
            temperature=0.0,
        )
        refs.append(out.choices[0].message.content.strip())
    return refs

def _make_answer_relevance_metric(ragas_llm, ragas_emb):
    try:
        from ragas.metrics.answer_relevancy import AnswerRelevancy
        return AnswerRelevancy(llm=ragas_llm, embeddings=ragas_emb)
    except Exception: pass
    try:
        from ragas.metrics.answer_relevance import AnswerRelevance
        return AnswerRelevance(llm=ragas_llm, embeddings=ragas_emb)
    except Exception: pass
    for name in ("answer_relevancy", "answer_relevance"):
        m = getattr(RM, name, None)
        if m is not None:
            if hasattr(m, "llm"): setattr(m, "llm", ragas_llm)
            if hasattr(m, "embeddings"): setattr(m, "embeddings", ragas_emb)
            return m
    return None

def _bind_models_to_metrics(metrics, ragas_llm, ragas_emb):
    for m in metrics:
        if hasattr(m, "llm") and getattr(m, "llm", None) is None:
            try: setattr(m, "llm", ragas_llm)
            except Exception: pass
        if hasattr(m, "embeddings") and getattr(m, "embeddings", None) is None:
            try: setattr(m, "embeddings", ragas_emb)
            except Exception: pass
        if hasattr(m, "set_llm"):
            try: m.set_llm(ragas_llm)
            except Exception: pass
        if hasattr(m, "set_embeddings"):
            try: m.set_embeddings(ragas_emb)
            except Exception: pass
    return metrics

def _build_ragas_metrics(ragas_llm, ragas_emb):
    METRICS=[]
    ar = _make_answer_relevance_metric(ragas_llm, ragas_emb)
    if ar is not None: METRICS.append(ar)
    faith = FAITH
    if hasattr(faith, "llm"): setattr(faith, "llm", ragas_llm)
    METRICS.append(faith)
    if hasattr(RM, "context_recall"):    METRICS.append(RM.context_recall)
    if hasattr(RM, "context_precision"): METRICS.append(RM.context_precision)
    return _bind_models_to_metrics(METRICS, ragas_llm, ragas_emb)

def _to_ragas_dataset(df: pd.DataFrame):
    sdf=df.copy()
    sdf["contexts"] = sdf["contexts"].apply(lambda x: list(x) if isinstance(x, (list, tuple)) else ([] if x is None else [str(x)]))
    # Preferred: new ragas Dataset
    try:
        from ragas.dataset import Dataset as RagasDataset, SingleTurnSample
        params = set(__import__("inspect").signature(SingleTurnSample).parameters.keys())
        samples=[]
        for _, row in sdf.iterrows():
            kwargs = dict(question=row["question"], answer=row["answer"], contexts=list(row["contexts"]))
            if "reference" in row and "reference" in params:
                kwargs["reference"] = row["reference"]
            elif "reference" in row and "ground_truth" in params:
                kwargs["ground_truth"] = row["reference"]
            samples.append(SingleTurnSample(**kwargs))
        return RagasDataset(samples=samples)
    except Exception:
        pass
    # Fallback: HF Dataset or raw DataFrame
    try:
        from datasets import Dataset as HFDataset
        return HFDataset.from_pandas(sdf, preserve_index=False)
    except Exception:
        return sdf

def _evaluate_safe(dataset_in, metrics, ragas_llm, ragas_emb):
    rc = None
    if RunConfig is not None:
        try: rc = RunConfig(max_workers=1, timeout=180, max_retries=10, max_wait=60)
        except TypeError: rc = RunConfig()
    try:
        return evaluate(dataset_in, metrics=metrics, llm=ragas_llm, embeddings=ragas_emb,
                        show_progress=False, run_config=rc, batch_size=1)
    except TypeError:
        return evaluate(dataset_in, metrics=metrics, llm=ragas_llm, embeddings=ragas_emb,
                        show_progress=False)

def _scores_from_result(res) -> Dict[str, float]:
    try:
        df = res.to_pandas()
    except Exception:
        df = pd.DataFrame(res)
    if isinstance(df, pd.DataFrame) and not df.empty:
        nums = {c: float(df[c].iloc[0]) for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and len(df)==1}
        if nums: return nums
        if {"metric","score"}.issubset(df.columns):
            out = {}
            for k, v in zip(df["metric"], df["score"]):
                try: out[str(k)] = float(v)
                except Exception: pass
            if out: return out
    return {}

def _friendly_scores(d: Dict[str,float]) -> Dict[str,float]:
    rename = {
        "answer_relevancy": "Answer Relevance",
        "answer_relevance": "Answer Relevance",
        "faithfulness": "Faithfulness (≈ Groundedness)",
        "context_recall": "Context Recall",
        "context_precision": "Context Precision",
        "context_relevancy": "Context Relevance",
    }
    return {rename.get(k, k): v for k, v in d.items()}

def _has_gold_refs(cfg: Dict[str, Any]) -> bool:
    refs = cfg.get("eval_references", []); qs = cfg.get("eval_questions", [])
    return isinstance(refs, list) and len(refs) == len(qs) and all(isinstance(x, str) and x.strip() for x in refs)

# 4) Main
def main(CONFIG: Dict[str, Any]):
    assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY"
    outdir = Path(CONFIG["output_dir"]); outdir.mkdir(parents=True, exist_ok=True)
    if CONFIG.get("create_sample_if_empty", True): _maybe_create_sample(CONFIG["data_path"])
    docs = _load_corpus(CONFIG["data_path"])

    embed_model = _build_embedder(CONFIG["embedding"])
    llm = _build_llm(CONFIG["llm"])
    Settings.embed_model = embed_model
    Settings.llm = llm
    client = OpenAIClient()

    # Refs for recall/precision
    if not _has_gold_refs(CONFIG):
        print("[INFO] No references provided → auto-generating short references from corpus for recall/precision.")
        sys.stdout.flush()
        CONFIG["eval_references"] = _generate_references_from_corpus(docs, CONFIG["eval_questions"], client)

    # RAGAS backends
    try:
        ragas_llm = llm_factory(provider="openai", model=CONFIG["ragas"]["llm_model"], client=client)
    except TypeError:
        try: ragas_llm = llm_factory(model=CONFIG["ragas"]["llm_model"], client=client)
        except TypeError: ragas_llm = llm_factory(model=CONFIG["ragas"]["llm_model"])
    ragas_emb = OpenAIEmbeddingsAdapter(client=client, model=CONFIG["ragas"]["embed_model"])
    METRICS   = _build_ragas_metrics(ragas_llm, ragas_emb)

    # Config summary (Markdown)
    cfg_view = {
        "embedding": CONFIG["embedding"]["model"],
        "llm": CONFIG["llm"]["model"],
        "ragas_llm": CONFIG["ragas"]["llm_model"],
        "ragas_emb": CONFIG["ragas"]["embed_model"],
        "experiments": [f"{e['name']} ({e['retriever']})" + (" + rerank" if e.get('reranker') else "") for e in CONFIG["experiments"]],
        "questions": len(CONFIG["eval_questions"]),
        "references": "provided" if _has_gold_refs(CONFIG) else "auto-generated",
    }
    cfg_md = pd.DataFrame(cfg_view.items(), columns=["Key","Value"]).to_markdown(index=False)
    md_table(cfg_md, title="### CONFIG", small=False, bold=False)

    print("\n=== TRACES (Question ➜ Answer by experiment) ===")
    sys.stdout.flush()

    # Warn once if Cohere rerank is configured but unavailable
    for exp in CONFIG["experiments"]:
        if exp.get("reranker", {}).get("type","").lower() == "cohere":
            if not (_COHERE_AVAILABLE and os.environ.get("COHERE_API_KEY")):
                print("[WARN] COHERE_API_KEY not set or 'cohere' not installed → falling back to LLM/embedding reranker.")

    # Build experiments
    rows=[]  # for eval + saving
    for exp in CONFIG["experiments"]:
        name, retr, top_k = exp["name"], exp["retriever"], int(exp.get("top_k", 6))

        # retriever
        if retr == "vector":
            retriever, _ = _build_vector_retriever(docs, embed_model, top_k)
        elif retr == "sentence_window":
            retriever, _ = _build_sentence_window_retriever(docs, embed_model, window_size=int(exp.get("window_size", 3)), top_k=top_k)
        elif retr == "auto_merging":
            retriever, _ = _build_auto_merging_retriever(docs, embed_model, top_k)
        else:
            raise ValueError(f"Unknown retriever: {retr}")

        rer_cfg = exp.get("reranker")
        header = f"--- Experiment: {name}  | retriever={retr}  | rerank={'on' if rer_cfg else 'off'} ---"
        print(f"\n{header}")
        sys.stdout.flush()

        for i, (q, ref) in enumerate(zip(CONFIG["eval_questions"], CONFIG["eval_references"])):
            # retrieve + (optional) rerank
            nodes = retriever.retrieve(q)
            nodes2 = _apply_rerank(q, nodes, rer_cfg, client) if rer_cfg else nodes[:min(top_k, len(nodes))]
            ctx_texts=[]
            for sn in nodes2:
                n = getattr(sn, "node", None) or sn
                t = getattr(n, "text", None) or getattr(n, "get_text", lambda: None)()
                if t: ctx_texts.append(t)

            # answer (extractive)
            ans, ctxs = _answer_extractive(q, ctx_texts, client)

            # trace (plain prints, as requested earlier)
            print(f"\nQ{ i+1 }: {q}")
            print(f"A{ i+1 }: {ans}")
            print(f"Ref: {ref}")
            if CONFIG["print_context_chars"] > 0:
                maxc = CONFIG["print_context_chars"]
                for ci, c in enumerate(ctxs[:5]):
                    print(f"   ctx[{ci}]: {c[:maxc].replace('\\n',' ')}{'…' if len(c)>maxc else ''}")
            sys.stdout.flush()

            rows.append({
                "experiment": name,
                "retriever": retr,
                "question": q,
                "answer": ans,
                "contexts": ctxs,
                "reference": ref,
            })

    # ---- Incremental evaluation: one-sample-at-a-time (so you SEE progress) ----
    all_rows = pd.DataFrame(rows)
    scored_rows = []

    md_line("### EVALUATION (per-sample progress)", bold=True)
    sys.stdout.flush()

    for ridx, r in all_rows.iterrows():
        sub = pd.DataFrame([r[["question","answer","contexts","reference"]]])
        ds  = _to_ragas_dataset(sub)
        try:
            res = _evaluate_safe(ds, METRICS, ragas_llm, ragas_emb)
            scores = _friendly_scores(_scores_from_result(res))
        except Exception as e:
            scores = {}
            md_line(f"[WARN] Eval failed for '{r['experiment']}' / Q: {r['question'][:48]}… → {e}", small=True, bold=True)

        # per-line feedback in small + bold
        parts = [f"{k}={scores.get(k):.3f}" for k in ["Answer Relevance","Faithfulness (≈ Groundedness)","Context Recall","Context Precision"] if k in scores]
        line = f"• {r['experiment']}  |  {r['question'][:64]}…  |  " + (", ".join(parts) if parts else "no-metrics")
        md_line(line, small=True, bold=True)

        # store
        row_out = {"experiment": r["experiment"], "retriever": r["retriever"], "question": r["question"]}
        for k,v in scores.items(): row_out[k] = v
        scored_rows.append(row_out)

    scored_df = pd.DataFrame(scored_rows)

    # Leaderboard (means) — Markdown
    if not scored_df.empty:
        metric_cols = [c for c in ["Answer Relevance","Faithfulness (≈ Groundedness)","Context Recall","Context Precision"] if c in scored_df.columns]
        if metric_cols:
            leaderboard = (scored_df.groupby(["experiment","retriever"], as_index=False)[metric_cols]
                           .mean(numeric_only=True))
            md_table(leaderboard.to_markdown(index=False), title="### RAGAS Leaderboard (means)")
        else:
            leaderboard = scored_df[["experiment","retriever"]].drop_duplicates()
            md_table(leaderboard.to_markdown(index=False), title="### RAGAS Leaderboard (means)")
    else:
        md_line("### RAGAS Leaderboard (means)\n(no scores)", bold=True)

    # Per-question Scores — Markdown
    if not scored_df.empty:
        cols_show = ["experiment","retriever","question"] + [c for c in ["Answer Relevance","Faithfulness (≈ Groundedness)","Context Recall","Context Precision"] if c in scored_df.columns]
        md_table(scored_df[cols_show].to_markdown(index=False), title="### Per-question Scores (Markdown)")

    # --- END: Experiments summary table in Markdown (requested) ---
    exp_rows=[]
    for e in CONFIG["experiments"]:
        row = {
            "Experiment": e["name"],
            "Retriever": e["retriever"],
            "TopK": e.get("top_k", ""),
            "Rerank": "on" if e.get("reranker") else "off",
        }
        if e["retriever"] == "sentence_window":
            row["Window"] = e.get("window_size", "")
        if e.get("reranker"):
            row["Reranker Type"] = e["reranker"].get("type", "llm")
            row["Rerank Model"] = e["reranker"].get("model", "")
            row["Rerank TopN"]  = e["reranker"].get("top_n", "")
        exp_rows.append(row)
    exp_df = pd.DataFrame(exp_rows)
    md_table(exp_df.to_markdown(index=False), title="### Experiments Summary (Markdown)")

    # Save artifacts
    out = Path(CONFIG["output_dir"]); out.mkdir(parents=True, exist_ok=True)
    all_rows.to_csv(out / "traces_all.csv", index=False)
    (out / "traces").mkdir(exist_ok=True, parents=True)
    for exp_name, sub in all_rows.groupby("experiment"):
        sub[["experiment","retriever","question","answer","contexts","reference"]].to_csv(out / "traces" / f"{exp_name}.csv", index=False)
    if not scored_df.empty: scored_df.to_csv(out / "ragas_raw_scores.csv", index=False)
    try:
        if "leaderboard" in locals() and isinstance(leaderboard, pd.DataFrame) and not leaderboard.empty:
            leaderboard.to_csv(out / "ragas_leaderboard.csv", index=False)
    except Exception:
        pass
    print(f"\n[Saved] traces → {out/'traces_all.csv'}  |  raw scores → {out/'ragas_raw_scores.csv'}  |  leaderboard → {out/'ragas_leaderboard.csv'}")

# 5) Run
main(CONFIG)

[INFO] No references provided → auto-generating short references from corpus for recall/precision.


**### CONFIG**

| Key         | Value                                                                                                                                                                                                |
|:------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| embedding   | text-embedding-3-small                                                                                                                                                                               |
| llm         | gpt-4o-mini                                                                                                                                                                                          |
| ragas_llm   | gpt-4o-mini                                                                                                                                                                                          |
| ragas_emb   | text-embedding-3-small                                                                                                                                                                               |
| experiments | ['baseline-vector (vector)', 'sentence-window (sentence_window)', 'auto-merging (auto_merging)', 'sentence-window+rerank (sentence_window) + rerank', 'auto-merging+rerank (auto_merging) + rerank'] |
| questions   | 4                                                                                                                                                                                                    |
| references  | provided                                                                                                                                                                                             |


=== TRACES (Question ➜ Answer by experiment) ===
[WARN] COHERE_API_KEY not set or 'cohere' not installed → falling back to LLM/embedding reranker.
[WARN] COHERE_API_KEY not set or 'cohere' not installed → falling back to LLM/embedding reranker.

--- Experiment: baseline-vector  | retriever=vector  | rerank=off ---

Q1: What concrete steps does the document recommend for finding projects to work on?
A1: The document recommends shipping small projects, learning in public, and networking kindly as concrete steps for finding projects to work on. It emphasizes the importance of showcasing real, runnable things instead of vague portfolios.

- "Ship small projects, learn in public, and network kindly." [#0]
- "Avoid vague portfolios; show real, runnable things." [#0]
Ref: The document recommends shipping small projects, learning in public, and networking kindly as concrete steps for finding projects to work on.

Q2: How should a newcomer prioritise learning versus shipping projects, accordin

**### EVALUATION (per-sample progress)**

**<small>• baseline-vector  |  What concrete steps does the document recommend for finding proj…  |  Answer Relevance=1.000, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• baseline-vector  |  How should a newcomer prioritise learning versus shipping projec…  |  Answer Relevance=0.582, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• baseline-vector  |  What pitfalls does the text warn about when building a portfolio…  |  Answer Relevance=0.754, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• baseline-vector  |  Summarise the recommended networking tactics.…  |  Answer Relevance=0.288, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window  |  What concrete steps does the document recommend for finding proj…  |  Answer Relevance=1.000, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=0.333</small>**

**<small>• sentence-window  |  How should a newcomer prioritise learning versus shipping projec…  |  Answer Relevance=0.582, Faithfulness (≈ Groundedness)=0.714, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window  |  What pitfalls does the text warn about when building a portfolio…  |  Answer Relevance=0.769, Faithfulness (≈ Groundedness)=0.833, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window  |  Summarise the recommended networking tactics.…  |  Answer Relevance=0.589, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging  |  What concrete steps does the document recommend for finding proj…  |  Answer Relevance=1.000, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging  |  How should a newcomer prioritise learning versus shipping projec…  |  Answer Relevance=0.584, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging  |  What pitfalls does the text warn about when building a portfolio…  |  Answer Relevance=0.754, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging  |  Summarise the recommended networking tactics.…  |  Answer Relevance=0.288, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window+rerank  |  What concrete steps does the document recommend for finding proj…  |  Answer Relevance=1.000, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window+rerank  |  How should a newcomer prioritise learning versus shipping projec…  |  Answer Relevance=0.578, Faithfulness (≈ Groundedness)=0.714, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window+rerank  |  What pitfalls does the text warn about when building a portfolio…  |  Answer Relevance=0.752, Faithfulness (≈ Groundedness)=0.833, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• sentence-window+rerank  |  Summarise the recommended networking tactics.…  |  Answer Relevance=0.590, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging+rerank  |  What concrete steps does the document recommend for finding proj…  |  Answer Relevance=1.000, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging+rerank  |  How should a newcomer prioritise learning versus shipping projec…  |  Answer Relevance=0.602, Faithfulness (≈ Groundedness)=0.833, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging+rerank  |  What pitfalls does the text warn about when building a portfolio…  |  Answer Relevance=0.754, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**<small>• auto-merging+rerank  |  Summarise the recommended networking tactics.…  |  Answer Relevance=0.288, Faithfulness (≈ Groundedness)=1.000, Context Recall=1.000, Context Precision=1.000</small>**

**### RAGAS Leaderboard (means)**

| experiment             | retriever       |   Answer Relevance |   Faithfulness (≈ Groundedness) |   Context Recall |   Context Precision |
|:-----------------------|:----------------|-------------------:|--------------------------------:|-----------------:|--------------------:|
| auto-merging           | auto_merging    |           0.656498 |                        1        |                1 |            1        |
| auto-merging+rerank    | auto_merging    |           0.66113  |                        0.958333 |                1 |            1        |
| baseline-vector        | vector          |           0.656055 |                        1        |                1 |            1        |
| sentence-window        | sentence_window |           0.735091 |                        0.886905 |                1 |            0.833333 |
| sentence-window+rerank | sentence_window |           0.729676 |                        0.886905 |                1 |            1        |

**### Per-question Scores (Markdown)**

| experiment             | retriever       | question                                                                                   |   Answer Relevance |   Faithfulness (≈ Groundedness) |   Context Recall |   Context Precision |
|:-----------------------|:----------------|:-------------------------------------------------------------------------------------------|-------------------:|--------------------------------:|-----------------:|--------------------:|
| baseline-vector        | vector          | What concrete steps does the document recommend for finding projects to work on?           |           1        |                        1        |                1 |            1        |
| baseline-vector        | vector          | How should a newcomer prioritise learning versus shipping projects, according to the text? |           0.582144 |                        1        |                1 |            1        |
| baseline-vector        | vector          | What pitfalls does the text warn about when building a portfolio?                          |           0.754002 |                        1        |                1 |            1        |
| baseline-vector        | vector          | Summarise the recommended networking tactics.                                              |           0.288076 |                        1        |                1 |            1        |
| sentence-window        | sentence_window | What concrete steps does the document recommend for finding projects to work on?           |           1        |                        1        |                1 |            0.333333 |
| sentence-window        | sentence_window | How should a newcomer prioritise learning versus shipping projects, according to the text? |           0.581859 |                        0.714286 |                1 |            1        |
| sentence-window        | sentence_window | What pitfalls does the text warn about when building a portfolio?                          |           0.769015 |                        0.833333 |                1 |            1        |
| sentence-window        | sentence_window | Summarise the recommended networking tactics.                                              |           0.589488 |                        1        |                1 |            1        |
| auto-merging           | auto_merging    | What concrete steps does the document recommend for finding projects to work on?           |           1        |                        1        |                1 |            1        |
| auto-merging           | auto_merging    | How should a newcomer prioritise learning versus shipping projects, according to the text? |           0.583928 |                        1        |                1 |            1        |
| auto-merging           | auto_merging    | What pitfalls does the text warn about when building a portfolio?                          |           0.753991 |                        1        |                1 |            1        |
| auto-merging           | auto_merging    | Summarise the recommended networking tactics.                                              |           0.288073 |                        1        |                1 |            1        |
| sentence-window+rerank | sentence_window | What concrete steps does the document recommend for finding projects to work on?           |           1        |                        1        |                1 |            1        |
| sentence-window+rerank | sentence_window | How should a newcomer prioritise learning versus shipping projects, according to the text? |           0.577576 |                        0.714286 |                1 |            1        |
| sentence-window+rerank | sentence_window | What pitfalls does the text warn about when building a portfolio?                          |           0.751598 |                        0.833333 |                1 |            1        |
| sentence-window+rerank | sentence_window | Summarise the recommended networking tactics.                                              |           0.58953  |                        1        |                1 |            1        |
| auto-merging+rerank    | auto_merging    | What concrete steps does the document recommend for finding projects to work on?           |           0.999989 |                        1        |                1 |            1        |
| auto-merging+rerank    | auto_merging    | How should a newcomer prioritise learning versus shipping projects, according to the text? |           0.602485 |                        0.833333 |                1 |            1        |
| auto-merging+rerank    | auto_merging    | What pitfalls does the text warn about when building a portfolio?                          |           0.753991 |                        1        |                1 |            1        |
| auto-merging+rerank    | auto_merging    | Summarise the recommended networking tactics.                                              |           0.288055 |                        1        |                1 |            1        |

**### Experiments Summary (Markdown)**

| Experiment             | Retriever       |   TopK | Rerank   |   Window | Reranker Type   | Rerank Model        |   Rerank TopN |
|:-----------------------|:----------------|-------:|:---------|---------:|:----------------|:--------------------|--------------:|
| baseline-vector        | vector          |      6 | off      |      nan | nan             | nan                 |           nan |
| sentence-window        | sentence_window |      6 | off      |        3 | nan             | nan                 |           nan |
| auto-merging           | auto_merging    |      6 | off      |      nan | nan             | nan                 |           nan |
| sentence-window+rerank | sentence_window |     50 | on       |        3 | cohere          | rerank-english-v3.0 |             8 |
| auto-merging+rerank    | auto_merging    |     50 | on       |      nan | cohere          | rerank-english-v3.0 |             8 |


[Saved] traces → outputs/traces_all.csv  |  raw scores → outputs/ragas_raw_scores.csv  |  leaderboard → outputs/ragas_leaderboard.csv
