From ed5e78093f17701ff81abe76df61d861b5ed9f61 Mon Sep 17 00:00:00 2001 From: Nancy Date: Tue, 24 Mar 2026 11:24:36 -0700 Subject: [PATCH 1/5] feat: port core SDLC graders to hud.native --- hud/native/__init__.py | 6 +- hud/native/graders.py | 215 +++++++++++++++++++++++++++++++ hud/native/tests/__init__.py | 1 + hud/native/tests/test_graders.py | 164 +++++++++++++++++++++++ 4 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 hud/native/graders.py create mode 100644 hud/native/tests/__init__.py create mode 100644 hud/native/tests/test_graders.py diff --git a/hud/native/__init__.py b/hud/native/__init__.py index f4901d9d9..52584c622 100644 --- a/hud/native/__init__.py +++ b/hud/native/__init__.py @@ -1 +1,5 @@ -"""Native environments bundled with the HUD SDK.""" +"""Native environments and helpers bundled with the HUD SDK.""" + +from hud.native.graders import BashGrader, Grade, Grader + +__all__ = ["BashGrader", "Grade", "Grader"] diff --git a/hud/native/graders.py b/hud/native/graders.py new file mode 100644 index 000000000..2cf5df39a --- /dev/null +++ b/hud/native/graders.py @@ -0,0 +1,215 @@ +"""Generic graders for native HUD evaluation.""" + +from __future__ import annotations + +import json +import logging +import subprocess +from typing import Any + +from hud.tools.types import EvaluationResult, SubScore + +logger = logging.getLogger(__name__) + +__all__ = ["BashGrader", "Grade", "Grader"] + + +def _safe_params(kwargs: dict[str, Any]) -> dict[str, Any]: + """Return a JSON-safe copy of grader parameters for metadata storage.""" + result: dict[str, Any] = {} + for key, value in kwargs.items(): + try: + json.dumps(value) + result[key] = value + except (TypeError, ValueError): + result[key] = f"<{type(value).__name__}: not serializable>" + return result + + +class Grade: + """Factory for building ``EvaluationResult`` objects from ``SubScore`` items.""" + + @staticmethod + def from_subscores(subscores: list[SubScore]) -> EvaluationResult: + """Combine subscores into a clipped reward and ready-to-yield result. + + Positive weights are normalized to sum to ``1.0`` so the returned + ``EvaluationResult`` lines up with the SDK's subscore semantics. + Negative weights are preserved as penalties. + """ + + if not subscores: + raise ValueError("subscores must not be empty") + + positive_weight_sum = sum(item.weight for item in subscores if item.weight > 0) + if positive_weight_sum <= 0: + raise ValueError("subscores must include at least one positive weight") + + name_counts: dict[str, int] = {} + for item in subscores: + name_counts[item.name] = name_counts.get(item.name, 0) + 1 + + name_usage: dict[str, int] = {} + normalized_subscores: list[SubScore] = [] + metadata: dict[str, Any] = {} + + for item in subscores: + if name_counts[item.name] == 1: + final_name = item.name + else: + name_usage[item.name] = name_usage.get(item.name, 0) + 1 + final_name = f"{item.name}-{name_usage[item.name]}" + + normalized_weight = ( + item.weight / positive_weight_sum if item.weight > 0 else item.weight + ) + normalized_subscores.append( + SubScore( + name=final_name, + weight=normalized_weight, + value=item.value, + metadata=item.metadata, + ) + ) + if item.metadata is not None: + metadata[final_name] = item.metadata + + reward = float( + min( + max(sum(item.value * item.weight for item in normalized_subscores), 0.0), + 1.0, + ) + ) + + return EvaluationResult( + reward=reward, + done=True, + subscores=normalized_subscores, + info=metadata, + ) + + +class Grader: + """Base class for reusable graders that emit ``SubScore`` objects.""" + + name: str = "BaseGrader" + + @classmethod + def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScore: + """Run the grader and package the result as a ``SubScore``.""" + result = cls.compute_score(**kwargs) + + if isinstance(result, tuple): + score, metadata = result + else: + score = result + metadata = {} + + return SubScore( + name=name or cls.name, + weight=weight, + value=float(score), + metadata={**metadata, "_parameters": _safe_params(kwargs)}, + ) + + @classmethod + def compute_score(cls, *args: Any, **kwargs: Any) -> float | tuple[float, dict[str, Any]]: + """Compute a score between ``0.0`` and ``1.0``.""" + raise NotImplementedError("Subclasses must implement compute_score") + + @classmethod + def any(cls, weight: float, subscores: list[SubScore]) -> SubScore: + """Return a subscore that passes if any input subscore passes.""" + if not subscores: + raise ValueError("subscores must not be empty") + + return SubScore( + name=f"{cls.name}_any", + value=max(subscore.value for subscore in subscores), + weight=weight, + metadata={ + "subscores": [subscore.name for subscore in subscores], + "subscore_metadata": { + subscore.name: subscore.metadata + for subscore in subscores + if subscore.metadata is not None + }, + }, + ) + + @classmethod + def all(cls, weight: float, subscores: list[SubScore]) -> SubScore: + """Return a subscore that passes only if all input subscores pass.""" + if not subscores: + raise ValueError("subscores must not be empty") + + return SubScore( + name=f"{cls.name}_all", + value=min(subscore.value for subscore in subscores), + weight=weight, + metadata={ + "subscores": [subscore.name for subscore in subscores], + "subscore_metadata": { + subscore.name: subscore.metadata + for subscore in subscores + if subscore.metadata is not None + }, + }, + ) + + +class BashGrader(Grader): + """Run a shell command and score it by exit code.""" + + name = "BashGrader" + + @classmethod + def compute_score( + cls, + command: str, + cwd: str | None = None, + timeout: int = 60, + **kwargs: Any, + ) -> tuple[float, dict[str, Any]]: + """Run ``command`` via ``bash -lc`` and return score plus execution metadata.""" + del kwargs + logger.info("Running grader command: %s (cwd=%s, timeout=%ss)", command, cwd, timeout) + try: + result = subprocess.run( + ["/bin/bash", "-lc", command], + cwd=cwd, + capture_output=True, + text=True, + timeout=timeout, + ) + except subprocess.TimeoutExpired as exc: + stdout = ( + (exc.stdout or b"").decode(errors="replace") + if isinstance(exc.stdout, bytes) + else (exc.stdout or "") + ) + stderr = ( + (exc.stderr or b"").decode(errors="replace") + if isinstance(exc.stderr, bytes) + else (exc.stderr or "") + ) + return ( + 0.0, + { + "exit_code": None, + "stdout": stdout, + "stderr": stderr, + "timed_out": True, + "timeout": timeout, + }, + ) + + score = 1.0 if result.returncode == 0 else 0.0 + return ( + score, + { + "exit_code": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, + }, + ) diff --git a/hud/native/tests/__init__.py b/hud/native/tests/__init__.py new file mode 100644 index 000000000..c14ccf20b --- /dev/null +++ b/hud/native/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for native HUD helpers.""" diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py new file mode 100644 index 000000000..5deff2f2b --- /dev/null +++ b/hud/native/tests/test_graders.py @@ -0,0 +1,164 @@ +"""Tests for first-party HUD native graders.""" + +from __future__ import annotations + +import pytest + +from hud.environment import Environment +from hud.native.graders import BashGrader, Grade, Grader +from hud.tools.types import EvaluationResult, SubScore + + +class TestGrade: + def test_from_subscores_returns_evaluation_result(self) -> None: + result = Grade.from_subscores([SubScore(name="alpha", value=1.0, weight=1.0)]) + assert isinstance(result, EvaluationResult) + assert result.reward == 1.0 + assert result.done is True + + def test_from_subscores_normalizes_positive_weights(self) -> None: + result = Grade.from_subscores( + [ + SubScore(name="alpha", value=1.0, weight=2.0), + SubScore(name="beta", value=0.0, weight=1.0), + ] + ) + assert result.reward == pytest.approx(2.0 / 3.0) + assert result.subscores is not None + by_name = {subscore.name: subscore for subscore in result.subscores} + assert by_name["alpha"].weight == pytest.approx(2.0 / 3.0) + assert by_name["beta"].weight == pytest.approx(1.0 / 3.0) + + def test_from_subscores_preserves_negative_penalties(self) -> None: + result = Grade.from_subscores( + [ + SubScore(name="correct", value=1.0, weight=1.0), + SubScore(name="penalty", value=1.0, weight=-0.2), + ] + ) + assert result.reward == pytest.approx(0.8) + assert result.subscores is not None + by_name = {subscore.name: subscore for subscore in result.subscores} + assert by_name["correct"].weight == pytest.approx(1.0) + assert by_name["penalty"].weight == pytest.approx(-0.2) + + def test_from_subscores_duplicate_names_are_deduped(self) -> None: + result = Grade.from_subscores( + [ + SubScore(name="same", value=1.0, weight=0.5), + SubScore(name="same", value=0.0, weight=0.5), + ] + ) + assert result.subscores is not None + assert [subscore.name for subscore in result.subscores] == ["same-1", "same-2"] + + def test_from_subscores_propagates_metadata(self) -> None: + metadata = {"stdout": "ok"} + result = Grade.from_subscores( + [SubScore(name="grader", value=1.0, weight=1.0, metadata=metadata)] + ) + assert result.info["grader"] == metadata + assert result.subscores is not None + assert result.subscores[0].metadata == metadata + + +class TestGrader: + def test_grade_returns_subscore_and_stores_parameters(self) -> None: + class DummyGrader(Grader): + name = "DummyGrader" + + @classmethod + def compute_score(cls, **kwargs: object) -> tuple[float, dict[str, object]]: + return 0.75, {"source": "dummy", "kwargs_seen": sorted(kwargs)} + + subscore = DummyGrader.grade(weight=0.4, marker="ok", payload=object()) + assert isinstance(subscore, SubScore) + assert subscore.name == "DummyGrader" + assert subscore.value == pytest.approx(0.75) + assert subscore.weight == pytest.approx(0.4) + assert subscore.metadata is not None + assert subscore.metadata["source"] == "dummy" + assert subscore.metadata["_parameters"]["marker"] == "ok" + assert subscore.metadata["_parameters"]["payload"] == "" + + +class TestGraderCombinators: + def test_any_picks_max(self) -> None: + combined = Grader.any( + weight=1.0, + subscores=[ + SubScore(name="a", value=1.0, weight=0.5), + SubScore(name="b", value=0.0, weight=0.5), + ], + ) + assert combined.name == "BaseGrader_any" + assert combined.value == 1.0 + + def test_all_picks_min(self) -> None: + combined = Grader.all( + weight=1.0, + subscores=[ + SubScore(name="a", value=1.0, weight=0.5), + SubScore(name="b", value=0.0, weight=0.5), + ], + ) + assert combined.name == "BaseGrader_all" + assert combined.value == 0.0 + + +class TestBashGrader: + def test_compute_score_for_passing_command(self) -> None: + score, metadata = BashGrader.compute_score(command="echo hello") + assert score == 1.0 + assert metadata["exit_code"] == 0 + assert "hello" in metadata["stdout"] + + def test_compute_score_for_failing_command(self) -> None: + score, metadata = BashGrader.compute_score(command="echo oops >&2 && false") + assert score == 0.0 + assert metadata["exit_code"] != 0 + assert "oops" in metadata["stderr"] + + def test_compute_score_timeout(self) -> None: + score, metadata = BashGrader.compute_score(command="sleep 2", timeout=1) + assert score == 0.0 + assert metadata["timed_out"] is True + assert metadata["timeout"] == 1 + + def test_compute_score_invalid_cwd_raises(self, tmp_path) -> None: + with pytest.raises(FileNotFoundError): + BashGrader.compute_score(command="true", cwd=str(tmp_path / "missing")) + + def test_grade_and_from_subscores_compose(self) -> None: + passing = BashGrader.grade(weight=0.5, command="true") + failing = BashGrader.grade(weight=0.5, command="false") + result = Grade.from_subscores([passing, failing]) + assert result.reward == pytest.approx(0.5) + assert result.info["BashGrader-1"]["exit_code"] == 0 + assert result.info["BashGrader-2"]["exit_code"] != 0 + + +class TestScenarioIntegration: + @pytest.mark.asyncio + async def test_scenario_can_yield_grade_from_subscores(self) -> None: + env = Environment("test-env") + + @env.scenario("bash-graded") + async def bash_graded_scenario(): + yield "Run the verification" + yield Grade.from_subscores( + [BashGrader.grade(weight=1.0, command="echo verified")] + ) + + prompt = await env.run_scenario_setup("bash-graded", {}) + assert prompt == "Run the verification" + + assert env._active_session is not None + env._active_session.answer = "done" + result = await env.run_scenario_evaluate("bash-graded") + + assert result is not None + assert result.reward == 1.0 + assert result.subscores is not None + assert result.subscores[0].name == "BashGrader" + assert "verified" in result.info["BashGrader"]["stdout"] From b84d7717abe920eb586aeb0c2b0b861953aad032 Mon Sep 17 00:00:00 2001 From: Nancy Date: Tue, 24 Mar 2026 11:39:32 -0700 Subject: [PATCH 2/5] style: format grader tests --- hud/native/tests/test_graders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py index 5deff2f2b..76032d2b1 100644 --- a/hud/native/tests/test_graders.py +++ b/hud/native/tests/test_graders.py @@ -146,9 +146,7 @@ async def test_scenario_can_yield_grade_from_subscores(self) -> None: @env.scenario("bash-graded") async def bash_graded_scenario(): yield "Run the verification" - yield Grade.from_subscores( - [BashGrader.grade(weight=1.0, command="echo verified")] - ) + yield Grade.from_subscores([BashGrader.grade(weight=1.0, command="echo verified")]) prompt = await env.run_scenario_setup("bash-graded", {}) assert prompt == "Run the verification" From 521b1fef331d7efff6ea83329a7c1fea1fa0538c Mon Sep 17 00:00:00 2001 From: Nancy Date: Tue, 24 Mar 2026 11:49:22 -0700 Subject: [PATCH 3/5] fix: handle grader name collisions and penalties --- hud/native/graders.py | 30 +++++++++++++++++---------- hud/native/tests/test_graders.py | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/hud/native/graders.py b/hud/native/graders.py index 2cf5df39a..5d7288825 100644 --- a/hud/native/graders.py +++ b/hud/native/graders.py @@ -31,11 +31,12 @@ class Grade: @staticmethod def from_subscores(subscores: list[SubScore]) -> EvaluationResult: - """Combine subscores into a clipped reward and ready-to-yield result. + """Combine subscores into a weighted reward and ready-to-yield result. Positive weights are normalized to sum to ``1.0`` so the returned ``EvaluationResult`` lines up with the SDK's subscore semantics. - Negative weights are preserved as penalties. + Negative weights are preserved as penalties, including when they drive + the final reward below zero. """ if not subscores: @@ -49,16 +50,28 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult: for item in subscores: name_counts[item.name] = name_counts.get(item.name, 0) + 1 + reserved_names = {item.name for item in subscores} name_usage: dict[str, int] = {} + used_names: set[str] = set() normalized_subscores: list[SubScore] = [] metadata: dict[str, Any] = {} for item in subscores: - if name_counts[item.name] == 1: + if name_counts[item.name] == 1 and item.name not in used_names: final_name = item.name else: - name_usage[item.name] = name_usage.get(item.name, 0) + 1 - final_name = f"{item.name}-{name_usage[item.name]}" + suffix = name_usage.get(item.name, 0) + while True: + suffix += 1 + candidate = f"{item.name}-{suffix}" + if candidate in used_names: + continue + if candidate in reserved_names: + continue + name_usage[item.name] = suffix + final_name = candidate + break + used_names.add(final_name) normalized_weight = ( item.weight / positive_weight_sum if item.weight > 0 else item.weight @@ -74,12 +87,7 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult: if item.metadata is not None: metadata[final_name] = item.metadata - reward = float( - min( - max(sum(item.value * item.weight for item in normalized_subscores), 0.0), - 1.0, - ) - ) + reward = float(sum(item.value * item.weight for item in normalized_subscores)) return EvaluationResult( reward=reward, diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py index 76032d2b1..806f2efd8 100644 --- a/hud/native/tests/test_graders.py +++ b/hud/native/tests/test_graders.py @@ -2,6 +2,8 @@ from __future__ import annotations +import warnings + import pytest from hud.environment import Environment @@ -52,6 +54,24 @@ def test_from_subscores_duplicate_names_are_deduped(self) -> None: assert result.subscores is not None assert [subscore.name for subscore in result.subscores] == ["same-1", "same-2"] + def test_from_subscores_duplicate_names_avoid_existing_suffix_collisions(self) -> None: + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = Grade.from_subscores( + [ + SubScore(name="x-1", value=1.0, weight=0.3), + SubScore(name="x", value=1.0, weight=0.4), + SubScore(name="x", value=0.0, weight=0.6), + ] + ) + + assert result.subscores is not None + assert [subscore.name for subscore in result.subscores] == ["x-1", "x-2", "x-3"] + assert set(result.info) == set() + assert not [ + warning for warning in caught if "Duplicate subscore names" in str(warning.message) + ] + def test_from_subscores_propagates_metadata(self) -> None: metadata = {"stdout": "ok"} result = Grade.from_subscores( @@ -61,6 +81,21 @@ def test_from_subscores_propagates_metadata(self) -> None: assert result.subscores is not None assert result.subscores[0].metadata == metadata + def test_from_subscores_preserves_negative_reward_without_validator_warning(self) -> None: + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = Grade.from_subscores( + [ + SubScore(name="correct", value=0.0, weight=1.0), + SubScore(name="penalty", value=1.0, weight=-0.2), + ] + ) + + assert result.reward == pytest.approx(-0.2) + assert not [ + warning for warning in caught if "Subscores don't match reward" in str(warning.message) + ] + class TestGrader: def test_grade_returns_subscore_and_stores_parameters(self) -> None: From 7edd5870567f0305aeeeec342a511d2ccc3042fd Mon Sep 17 00:00:00 2001 From: Nancy Date: Wed, 25 Mar 2026 19:16:38 -0700 Subject: [PATCH 4/5] refactor grader serialization and add docs --- docs/docs.json | 1 + docs/reference/native-graders.mdx | 118 ++++++++++++++++++++++++++ hud/native/graders.py | 16 +--- hud/telemetry/instrument.py | 7 +- hud/utils/serialization.py | 26 ++++++ hud/utils/tests/test_serialization.py | 31 +++++++ 6 files changed, 180 insertions(+), 19 deletions(-) create mode 100644 docs/reference/native-graders.mdx create mode 100644 hud/utils/serialization.py create mode 100644 hud/utils/tests/test_serialization.py diff --git a/docs/docs.json b/docs/docs.json index 9463d6981..586485faa 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -87,6 +87,7 @@ "pages": [ "reference/environments", "reference/tools", + "reference/native-graders", "reference/evals", "reference/agents", "reference/types" diff --git a/docs/reference/native-graders.mdx b/docs/reference/native-graders.mdx new file mode 100644 index 000000000..79f3a061f --- /dev/null +++ b/docs/reference/native-graders.mdx @@ -0,0 +1,118 @@ +--- +title: "Native Graders" +description: "SDK reference for reusable first-party evaluation helpers in hud.native" +icon: "scale-balanced" +--- + +`hud.native` includes reusable grader helpers for scenarios that want structured scoring without hand-building `EvaluationResult` objects each time. + +## Quick Example + +```python +from hud import Environment +from hud.native import BashGrader, Grade + +env = Environment("coding-env") + +@env.scenario("fix-tests") +async def fix_tests(): + yield "Make the checkout tests pass" + + yield Grade.from_subscores( + [ + BashGrader.grade( + weight=1.0, + command="pytest tests/test_checkout.py -q", + timeout=120, + ) + ] + ) +``` + +`Grade.from_subscores(...)` returns a normal `EvaluationResult`, so the result can be yielded directly from a scenario. + +## Grade + +`Grade.from_subscores(subscores)` combines `SubScore` values into a single `EvaluationResult`. + +Behavior: + +- Positive weights are normalized to sum to `1.0` +- Negative weights are preserved as penalties +- Duplicate subscore names are de-duplicated +- Per-subscore metadata is copied into `EvaluationResult.info` + +```python +from hud.native import Grade +from hud.tools.types import SubScore + +result = Grade.from_subscores( + [ + SubScore(name="tests", value=1.0, weight=0.8), + SubScore(name="style", value=0.5, weight=0.2), + ] +) +``` + +## Grader + +`Grader` is the base class for reusable scoring helpers. Subclasses implement `compute_score(...)`, and `grade(...)` packages the result as a `SubScore`. + +```python +from hud.native import Grader + +class MyGrader(Grader): + name = "MyGrader" + + @classmethod + def compute_score(cls, passed: bool) -> float: + return 1.0 if passed else 0.0 + +subscore = MyGrader.grade(weight=1.0, passed=True) +``` + +`grade(...)` also records JSON-safe copies of the grader parameters in subscore metadata under `_parameters`. + +## BashGrader + +`BashGrader` runs a command with `/bin/bash -lc` and scores it by exit code. + +```python +from hud.native import BashGrader + +subscore = BashGrader.grade( + weight=1.0, + command="pytest tests/test_checkout.py -q", + timeout=120, +) +``` + +Behavior: + +- exit code `0` -> score `1.0` +- non-zero exit code -> score `0.0` +- timeout -> score `0.0` with timeout metadata +- metadata includes `stdout`, `stderr`, and `exit_code` + +## Combinators + +`Grader.any(...)` and `Grader.all(...)` combine multiple subscores into a single summary subscore. + +```python +from hud.native import BashGrader, Grader + +tests = BashGrader.grade(weight=0.5, command="pytest -q") +lint = BashGrader.grade(weight=0.5, command="ruff check .") + +any_passes = Grader.any(weight=1.0, subscores=[tests, lint]) +all_pass = Grader.all(weight=1.0, subscores=[tests, lint]) +``` + +- `any(...)` uses the maximum input score +- `all(...)` uses the minimum input score + +## See Also + +- [Environments](/reference/environments) +- [Evals](/reference/evals) +- [Types](/reference/types) diff --git a/hud/native/graders.py b/hud/native/graders.py index 5d7288825..a88d210d8 100644 --- a/hud/native/graders.py +++ b/hud/native/graders.py @@ -2,30 +2,18 @@ from __future__ import annotations -import json import logging import subprocess from typing import Any from hud.tools.types import EvaluationResult, SubScore +from hud.utils.serialization import json_safe_dict logger = logging.getLogger(__name__) __all__ = ["BashGrader", "Grade", "Grader"] -def _safe_params(kwargs: dict[str, Any]) -> dict[str, Any]: - """Return a JSON-safe copy of grader parameters for metadata storage.""" - result: dict[str, Any] = {} - for key, value in kwargs.items(): - try: - json.dumps(value) - result[key] = value - except (TypeError, ValueError): - result[key] = f"<{type(value).__name__}: not serializable>" - return result - - class Grade: """Factory for building ``EvaluationResult`` objects from ``SubScore`` items.""" @@ -117,7 +105,7 @@ def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScor name=name or cls.name, weight=weight, value=float(score), - metadata={**metadata, "_parameters": _safe_params(kwargs)}, + metadata={**metadata, "_parameters": json_safe_dict(kwargs)}, ) @classmethod diff --git a/hud/telemetry/instrument.py b/hud/telemetry/instrument.py index 94a62828e..13394a49f 100644 --- a/hud/telemetry/instrument.py +++ b/hud/telemetry/instrument.py @@ -29,6 +29,7 @@ async def my_function(arg1, arg2): from hud.telemetry.exporter import queue_span from hud.types import MCPToolResult, TraceStep +from hud.utils.serialization import json_safe_value def _get_trace_id() -> str | None: @@ -75,11 +76,7 @@ def _serialize_value(value: Any, max_items: int = 10) -> Any: elif isinstance(value, dict) and len(value) > max_items: value = dict(list(value.items())[:max_items]) - try: - json_bytes = pydantic_core.to_json(value, fallback=str) - return json.loads(json_bytes) - except Exception: - return f"<{type(value).__name__}>" + return json_safe_value(value) def _now_iso() -> str: diff --git a/hud/utils/serialization.py b/hud/utils/serialization.py new file mode 100644 index 000000000..b71c15bf3 --- /dev/null +++ b/hud/utils/serialization.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import json +from typing import Any + +import pydantic_core + + +def _unserializable_placeholder(value: Any) -> str: + return f"<{type(value).__name__}: not serializable>" + + +def json_safe_value(value: Any) -> Any: + """Serialize a value into JSON-compatible data when possible.""" + if isinstance(value, str | int | float | bool | type(None)): + return value + + try: + return json.loads(pydantic_core.to_json(value, fallback=_unserializable_placeholder)) + except Exception: + return _unserializable_placeholder(value) + + +def json_safe_dict(values: dict[str, Any]) -> dict[str, Any]: + """Serialize a mapping into JSON-compatible data.""" + return {key: json_safe_value(value) for key, value in values.items()} diff --git a/hud/utils/tests/test_serialization.py b/hud/utils/tests/test_serialization.py new file mode 100644 index 000000000..1dfb92f25 --- /dev/null +++ b/hud/utils/tests/test_serialization.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from hud.utils.serialization import json_safe_dict, json_safe_value + + +def test_json_safe_value_serializes_dataclass() -> None: + @dataclass + class Demo: + name: str + count: int + + result = json_safe_value(Demo(name="test", count=2)) + assert result == {"name": "test", "count": 2} + + +def test_json_safe_value_falls_back_for_unserializable_object() -> None: + class Weird: + def __init__(self) -> None: + raise RuntimeError("boom") + + value = Weird.__new__(Weird) + result = json_safe_value(value) + assert isinstance(result, str) + assert "Weird" in result + + +def test_json_safe_dict_serializes_each_value() -> None: + data = {"number": 1, "items": [1, 2, 3]} + assert json_safe_dict(data) == data From f93def98aedd98c04349692a92186aa8f660de68 Mon Sep 17 00:00:00 2001 From: Nancy Date: Wed, 25 Mar 2026 19:34:38 -0700 Subject: [PATCH 5/5] fix: preserve duplicate grader metadata in combinators --- hud/native/graders.py | 71 ++++++++++++++++++-------------- hud/native/tests/test_graders.py | 32 ++++++++++++++ 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/hud/native/graders.py b/hud/native/graders.py index a88d210d8..30e1930d2 100644 --- a/hud/native/graders.py +++ b/hud/native/graders.py @@ -14,6 +14,38 @@ __all__ = ["BashGrader", "Grade", "Grader"] +def _dedupe_subscore_names(subscores: list[SubScore]) -> list[str]: + """Return stable, unique names for a sequence of subscores.""" + name_counts: dict[str, int] = {} + for item in subscores: + name_counts[item.name] = name_counts.get(item.name, 0) + 1 + + reserved_names = {item.name for item in subscores} + name_usage: dict[str, int] = {} + used_names: set[str] = set() + final_names: list[str] = [] + + for item in subscores: + if name_counts[item.name] == 1 and item.name not in used_names: + final_name = item.name + else: + suffix = name_usage.get(item.name, 0) + while True: + suffix += 1 + candidate = f"{item.name}-{suffix}" + if candidate in used_names: + continue + if candidate in reserved_names: + continue + name_usage[item.name] = suffix + final_name = candidate + break + used_names.add(final_name) + final_names.append(final_name) + + return final_names + + class Grade: """Factory for building ``EvaluationResult`` objects from ``SubScore`` items.""" @@ -34,33 +66,10 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult: if positive_weight_sum <= 0: raise ValueError("subscores must include at least one positive weight") - name_counts: dict[str, int] = {} - for item in subscores: - name_counts[item.name] = name_counts.get(item.name, 0) + 1 - - reserved_names = {item.name for item in subscores} - name_usage: dict[str, int] = {} - used_names: set[str] = set() normalized_subscores: list[SubScore] = [] metadata: dict[str, Any] = {} - for item in subscores: - if name_counts[item.name] == 1 and item.name not in used_names: - final_name = item.name - else: - suffix = name_usage.get(item.name, 0) - while True: - suffix += 1 - candidate = f"{item.name}-{suffix}" - if candidate in used_names: - continue - if candidate in reserved_names: - continue - name_usage[item.name] = suffix - final_name = candidate - break - used_names.add(final_name) - + for item, final_name in zip(subscores, _dedupe_subscore_names(subscores), strict=True): normalized_weight = ( item.weight / positive_weight_sum if item.weight > 0 else item.weight ) @@ -119,15 +128,16 @@ def any(cls, weight: float, subscores: list[SubScore]) -> SubScore: if not subscores: raise ValueError("subscores must not be empty") + unique_names = _dedupe_subscore_names(subscores) return SubScore( name=f"{cls.name}_any", value=max(subscore.value for subscore in subscores), weight=weight, metadata={ - "subscores": [subscore.name for subscore in subscores], + "subscores": unique_names, "subscore_metadata": { - subscore.name: subscore.metadata - for subscore in subscores + unique_name: subscore.metadata + for unique_name, subscore in zip(unique_names, subscores, strict=True) if subscore.metadata is not None }, }, @@ -139,15 +149,16 @@ def all(cls, weight: float, subscores: list[SubScore]) -> SubScore: if not subscores: raise ValueError("subscores must not be empty") + unique_names = _dedupe_subscore_names(subscores) return SubScore( name=f"{cls.name}_all", value=min(subscore.value for subscore in subscores), weight=weight, metadata={ - "subscores": [subscore.name for subscore in subscores], + "subscores": unique_names, "subscore_metadata": { - subscore.name: subscore.metadata - for subscore in subscores + unique_name: subscore.metadata + for unique_name, subscore in zip(unique_names, subscores, strict=True) if subscore.metadata is not None }, }, diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py index 806f2efd8..00ff2cec1 100644 --- a/hud/native/tests/test_graders.py +++ b/hud/native/tests/test_graders.py @@ -129,6 +129,22 @@ def test_any_picks_max(self) -> None: assert combined.name == "BaseGrader_any" assert combined.value == 1.0 + def test_any_preserves_metadata_for_duplicate_named_subscores(self) -> None: + combined = Grader.any( + weight=1.0, + subscores=[ + SubScore(name="BashGrader", value=1.0, weight=0.5, metadata={"exit_code": 0}), + SubScore(name="BashGrader", value=0.0, weight=0.5, metadata={"exit_code": 1}), + ], + ) + assert combined.metadata == { + "subscores": ["BashGrader-1", "BashGrader-2"], + "subscore_metadata": { + "BashGrader-1": {"exit_code": 0}, + "BashGrader-2": {"exit_code": 1}, + }, + } + def test_all_picks_min(self) -> None: combined = Grader.all( weight=1.0, @@ -140,6 +156,22 @@ def test_all_picks_min(self) -> None: assert combined.name == "BaseGrader_all" assert combined.value == 0.0 + def test_all_preserves_metadata_for_duplicate_named_subscores(self) -> None: + combined = Grader.all( + weight=1.0, + subscores=[ + SubScore(name="BashGrader", value=1.0, weight=0.5, metadata={"exit_code": 0}), + SubScore(name="BashGrader", value=0.0, weight=0.5, metadata={"exit_code": 1}), + ], + ) + assert combined.metadata == { + "subscores": ["BashGrader-1", "BashGrader-2"], + "subscore_metadata": { + "BashGrader-1": {"exit_code": 0}, + "BashGrader-2": {"exit_code": 1}, + }, + } + class TestBashGrader: def test_compute_score_for_passing_command(self) -> None: