Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/metric-list.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,4 @@ These metrics need the model to generate an output. They are therefore slower.
- `llm_judge_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API.
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. It is used for multiturn tasks like mt-bench.
- `tvd_mi`: Corpus-level LLM-as-a-judge metric that estimates a lower bound on total variation mutual information using paired responses. It assumes each example has two responses and a binary label indicating whether they are from the same underlying item (`1`) or from different items (`0`), and computes `TPR + TNR - 1` from the judge’s binary decisions.
33 changes: 33 additions & 0 deletions examples/custom_tasks_templates/tvd_mi_synthetic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# TVD-MI synthetic example (paired-response)

This folder contains a tiny, synthetic paired-response dataset intended to demonstrate how to run the `tvd_mi` metric.

## Data format

The dataset is a `.jsonl` file where each line is a JSON object with:

- `response_a` (str): first response in the pair
- `response_b` (str): second response in the pair
- `pair_label` (int): `1` if the two responses come from the same underlying item/task/source, `0` otherwise

Example line:

```json
{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital of France.","pair_label":1}
```

## What this example is (and isn’t)

* ✅ A minimal, copyable example showing the expected fields for `tvd_mi`
* ✅ Useful as a template for building larger paired-response benchmarks
* ❌ Not intended to be a scientifically meaningful benchmark by itself

## Running

`tvd_mi` is an LLM-as-judge metric. To run with the OpenAI backend, set:

```bash
export OPENAI_API_KEY=...
```

You can then load this dataset as Docs and evaluate with `tvd_mi` (see the Python loader in `tvd_mi_synthetic.py`).
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital city of France.","pair_label":1}
{"response_a":"2 + 2 = 4.","response_b":"Four is the sum of two plus two.","pair_label":1}
{"response_a":"Water freezes at 0°C at standard atmospheric pressure.","response_b":"At 1 atm, water’s freezing point is 0 degrees Celsius.","pair_label":1}
{"response_a":"The capital of France is Paris.","response_b":"The mitochondria is the powerhouse of the cell.","pair_label":0}
{"response_a":"2 + 2 = 4.","response_b":"Photosynthesis converts light into chemical energy in plants.","pair_label":0}
121 changes: 121 additions & 0 deletions examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# MIT License
# Copyright (c) 2024 The HuggingFace Team

"""
Minimal loader for the TVD-MI paired-response synthetic example.

This module intentionally avoids tight coupling to task registries so it can be
used as a simple reference/template. It provides `read_jsonl()` and `build_docs()`
helpers to construct lighteval `Doc` objects with the fields expected by TVD-MI.

Expected JSONL schema per line:
- response_a: str
- response_b: str
- pair_label: int (1=same, 0=different)
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable


try:
    # lighteval Doc type (preferred if available)
    from lighteval.tasks.requests import Doc  # type: ignore
except Exception:
    # Fallback: minimal doc type for local testing / documentation purposes.
    # Only the fields actually touched by this example are declared.
    @dataclass
    class Doc:  # type: ignore
        """Minimal stand-in mirroring the lighteval `Doc` fields used by this loader."""

        # Prompt text; build_docs() below always sets this to "".
        query: str = ""
        # Answer choices; build_docs() sets an empty list.
        choices: list[str] | None = None
        # Index/indices of the gold answer(s) in `choices`; build_docs() sets 0.
        gold_index: int | list[int] | None = None
        # Name of the task this doc belongs to.
        task_name: str | None = None
        # Free-form per-doc payload; TVD-MI fields are always stored here.
        specific: dict[str, Any] | None = None


# Directory containing this module; used to locate the bundled example data.
HERE = Path(__file__).resolve().parent
# Default synthetic dataset shipped alongside this loader (schema in the README).
DEFAULT_DATA_PATH = HERE / "tvd_mi_synthetic.jsonl"


def read_jsonl(path: str | Path) -> list[dict[str, Any]]:
    """Parse a JSONL file into a list of dicts.

    Blank (whitespace-only) lines are skipped, but still counted for line
    numbering so error messages match the file as seen in an editor.

    Args:
        path: Path to the `.jsonl` file (string or `Path`).

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        ValueError: If any line is not valid JSON; the message includes the
            1-based line number and chains the original `JSONDecodeError`.
    """
    records: list[dict[str, Any]] = []
    text = Path(path).read_text(encoding="utf-8")
    for line_num, raw_line in enumerate(text.splitlines(), start=1):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            records.append(json.loads(stripped))
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {line_num} of {Path(path)}: {e}") from e
    return records


def _set_attr_if_possible(obj: Any, name: str, value: Any) -> None:
"""
Try to set `obj.name = value`. Some Doc implementations may forbid new attributes.
"""
try:
setattr(obj, name, value)
except Exception:
# It's fine if Doc is strict; we always store in `specific` too.
pass


def build_docs(rows: Iterable[dict[str, Any]], task_name: str = "tvd_mi_synthetic") -> list[Doc]:
    """Convert raw JSONL rows into lighteval `Doc` objects for the TVD-MI metric.

    Args:
        rows: Iterable of dicts, each with `response_a` (str), `response_b`
            (str) and `pair_label` (int, 1=same item / 0=different).
        task_name: Task name stamped onto every produced `Doc`.

    Returns:
        One `Doc` per row. The pair fields are stored in `doc.specific` and,
        when the Doc type permits it, mirrored as direct attributes for
        compatibility with `JudgeLLMTVDMI.compute` as currently implemented.

    Raises:
        ValueError: If a row is missing any required key.
    """
    docs: list[Doc] = []
    for idx, row in enumerate(rows):
        if any(key not in row for key in ("response_a", "response_b", "pair_label")):
            raise ValueError(
                f"Row {idx} missing required keys. Expected response_a/response_b/pair_label. Got keys={list(row.keys())}"
            )

        # Normalize field types up front; `pair_label` must be coercible to int.
        fields = {
            "response_a": str(row["response_a"]),
            "response_b": str(row["response_b"]),
            "pair_label": int(row["pair_label"]),
        }

        # Minimal Doc: many metrics/tests assume `query`/`choices` exist.
        doc = Doc(
            query="",
            choices=[],
            gold_index=0,
            task_name=task_name,
            specific=dict(fields),
        )

        # Mirror the fields as direct attributes where the Doc type allows it.
        for key, value in fields.items():
            _set_attr_if_possible(doc, key, value)

        docs.append(doc)

    return docs


def load_default_docs() -> list[Doc]:
    """Load the example JSONL shipped with this folder and return it as Docs.

    Equivalent to `build_docs(read_jsonl(DEFAULT_DATA_PATH))`.
    """
    return build_docs(read_jsonl(DEFAULT_DATA_PATH))


if __name__ == "__main__":
    # Smoke test: load the bundled example and report what was produced.
    docs = load_default_docs()
    first = docs[0]
    print(f"Loaded {len(docs)} docs from {DEFAULT_DATA_PATH}")
    # Each flag tells whether the mirror attribute could be set on this Doc type.
    attr_flags = [hasattr(first, name) for name in ("response_a", "response_b", "pair_label")]
    print("First doc has attrs:", *attr_flags)
    print("First doc specific keys:", list((first.specific or {}).keys()))
Loading