Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/metric-list.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,4 @@ These metrics need the model to generate an output. They are therefore slower.
- `llm_judge_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API.
- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench.
- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. It is used for multiturn tasks like mt-bench.
- `tvd_mi`: Corpus-level LLM-as-a-judge metric that estimates a lower bound on total variation mutual information using paired responses. It assumes each example has two responses and a binary label indicating whether they are from the same underlying item (`1`) or from different items (`0`), and computes `TPR + TNR - 1` from the judge’s binary decisions.
33 changes: 33 additions & 0 deletions examples/custom_tasks_templates/tvd_mi_synthetic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# TVD-MI synthetic example (paired-response)

This folder contains a tiny, synthetic paired-response dataset intended to demonstrate how to run the `tvd_mi` metric.

## Data format

The dataset is a `.jsonl` file where each line is a JSON object with:

- `response_a` (str): first response in the pair
- `response_b` (str): second response in the pair
- `pair_label` (int): `1` if the two responses come from the same underlying item/task/source, `0` otherwise

Example line:

```json
{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital of France.","pair_label":1}
```

## What this example is (and isn’t)

* ✅ A minimal, copyable example showing the expected fields for `tvd_mi`
* ✅ Useful as a template for building larger paired-response benchmarks
* ❌ Not intended to be a scientifically meaningful benchmark by itself

## Running

`tvd_mi` is an LLM-as-judge metric. To run with the OpenAI backend, set:

```bash
export OPENAI_API_KEY=...
```

You can then load this dataset as Docs and evaluate with `tvd_mi` (see the Python loader in `tvd_mi_synthetic.py`).
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"response_a":"The capital of France is Paris.","response_b":"Paris is the capital city of France.","pair_label":1}
{"response_a":"2 + 2 = 4.","response_b":"Four is the sum of two plus two.","pair_label":1}
{"response_a":"Water freezes at 0°C at standard atmospheric pressure.","response_b":"At 1 atm, water’s freezing point is 0 degrees Celsius.","pair_label":1}
{"response_a":"The capital of France is Paris.","response_b":"The mitochondria is the powerhouse of the cell.","pair_label":0}
{"response_a":"2 + 2 = 4.","response_b":"Photosynthesis converts light into chemical energy in plants.","pair_label":0}
121 changes: 121 additions & 0 deletions examples/custom_tasks_templates/tvd_mi_synthetic/tvd_mi_synthetic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# MIT License
# Copyright (c) 2024 The HuggingFace Team

"""
Minimal loader for the TVD-MI paired-response synthetic example.

This module intentionally avoids tight coupling to task registries so it can be
used as a simple reference/template. It provides `read_jsonl()` and `build_docs()`
helpers to construct lighteval `Doc` objects with the fields expected by TVD-MI.

Expected JSONL schema per line:
- response_a: str
- response_b: str
- pair_label: int (1=same, 0=different)
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable


try:
    # lighteval Doc type (preferred if available)
    from lighteval.tasks.requests import Doc  # type: ignore
except Exception:
    # Fallback: minimal doc type for local testing / documentation purposes.
    # Only the fields actually touched by this example are declared.
    @dataclass
    class Doc:  # type: ignore
        """Minimal stand-in mirroring the lighteval `Doc` fields used by this loader."""

        # Prompt text; build_docs() below always sets this to "".
        query: str = ""
        # Answer choices; build_docs() sets an empty list.
        choices: list[str] | None = None
        # Index/indices of the gold answer(s) in `choices`; build_docs() sets 0.
        gold_index: int | list[int] | None = None
        # Name of the task this doc belongs to.
        task_name: str | None = None
        # Free-form per-doc payload; TVD-MI fields are always stored here.
        specific: dict[str, Any] | None = None


# Directory containing this module; used to locate the bundled example data.
HERE = Path(__file__).resolve().parent
# Default synthetic dataset shipped alongside this loader (schema in the README).
DEFAULT_DATA_PATH = HERE / "tvd_mi_synthetic.jsonl"


def read_jsonl(path: str | Path) -> list[dict[str, Any]]:
    """Parse a JSONL file into a list of dicts.

    Blank (whitespace-only) lines are skipped, but still counted for line
    numbering so error messages match the file as seen in an editor.

    Args:
        path: Path to the `.jsonl` file (string or `Path`).

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        ValueError: If any line is not valid JSON; the message includes the
            1-based line number and chains the original `JSONDecodeError`.
    """
    records: list[dict[str, Any]] = []
    text = Path(path).read_text(encoding="utf-8")
    for line_num, raw_line in enumerate(text.splitlines(), start=1):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            records.append(json.loads(stripped))
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {line_num} of {Path(path)}: {e}") from e
    return records


def _set_attr_if_possible(obj: Any, name: str, value: Any) -> None:
"""
Try to set `obj.name = value`. Some Doc implementations may forbid new attributes.
"""
try:
setattr(obj, name, value)
except Exception:
# It's fine if Doc is strict; we always store in `specific` too.
pass


def build_docs(rows: Iterable[dict[str, Any]], task_name: str = "tvd_mi_synthetic") -> list[Doc]:
    """Convert raw JSONL rows into lighteval `Doc` objects for the TVD-MI metric.

    Args:
        rows: Iterable of dicts, each with `response_a` (str), `response_b`
            (str) and `pair_label` (int, 1=same item / 0=different).
        task_name: Task name stamped onto every produced `Doc`.

    Returns:
        One `Doc` per row. The pair fields are stored in `doc.specific` and,
        when the Doc type permits it, mirrored as direct attributes for
        compatibility with `JudgeLLMTVDMI.compute` as currently implemented.

    Raises:
        ValueError: If a row is missing any required key.
    """
    docs: list[Doc] = []
    for idx, row in enumerate(rows):
        if any(key not in row for key in ("response_a", "response_b", "pair_label")):
            raise ValueError(
                f"Row {idx} missing required keys. Expected response_a/response_b/pair_label. Got keys={list(row.keys())}"
            )

        # Normalize field types up front; `pair_label` must be coercible to int.
        fields = {
            "response_a": str(row["response_a"]),
            "response_b": str(row["response_b"]),
            "pair_label": int(row["pair_label"]),
        }

        # Minimal Doc: many metrics/tests assume `query`/`choices` exist.
        doc = Doc(
            query="",
            choices=[],
            gold_index=0,
            task_name=task_name,
            specific=dict(fields),
        )

        # Mirror the fields as direct attributes where the Doc type allows it.
        for key, value in fields.items():
            _set_attr_if_possible(doc, key, value)

        docs.append(doc)

    return docs


def load_default_docs() -> list[Doc]:
    """Load the example JSONL shipped with this folder and return it as Docs.

    Equivalent to `build_docs(read_jsonl(DEFAULT_DATA_PATH))`.
    """
    return build_docs(read_jsonl(DEFAULT_DATA_PATH))


if __name__ == "__main__":
    # Smoke test: load the bundled example and report what was produced.
    docs = load_default_docs()
    first = docs[0]
    print(f"Loaded {len(docs)} docs from {DEFAULT_DATA_PATH}")
    # Each flag tells whether the mirror attribute could be set on this Doc type.
    attr_flags = [hasattr(first, name) for name in ("response_a", "response_b", "pair_label")]
    print("First doc has attrs:", *attr_flags)
    print("First doc specific keys:", list((first.specific or {}).keys()))
Loading