From ed5e78093f17701ff81abe76df61d861b5ed9f61 Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Tue, 24 Mar 2026 11:24:36 -0700
Subject: [PATCH 1/5] feat: port core SDLC graders to hud.native

---
 hud/native/__init__.py           |   6 +-
 hud/native/graders.py            | 215 +++++++++++++++++++++++++++++++
 hud/native/tests/__init__.py     |   1 +
 hud/native/tests/test_graders.py | 164 +++++++++++++++++++++++
 4 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 hud/native/graders.py
 create mode 100644 hud/native/tests/__init__.py
 create mode 100644 hud/native/tests/test_graders.py

diff --git a/hud/native/__init__.py b/hud/native/__init__.py
index f4901d9d9..52584c622 100644
--- a/hud/native/__init__.py
+++ b/hud/native/__init__.py
@@ -1 +1,5 @@
-"""Native environments bundled with the HUD SDK."""
+"""Native environments and helpers bundled with the HUD SDK."""
+
+from hud.native.graders import BashGrader, Grade, Grader
+
+__all__ = ["BashGrader", "Grade", "Grader"]
diff --git a/hud/native/graders.py b/hud/native/graders.py
new file mode 100644
index 000000000..2cf5df39a
--- /dev/null
+++ b/hud/native/graders.py
@@ -0,0 +1,215 @@
+"""Generic graders for native HUD evaluation."""
+
+from __future__ import annotations
+
+import json
+import logging
+import subprocess
+from typing import Any
+
+from hud.tools.types import EvaluationResult, SubScore
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["BashGrader", "Grade", "Grader"]
+
+
+def _safe_params(kwargs: dict[str, Any]) -> dict[str, Any]:
+    """Return a JSON-safe copy of grader parameters for metadata storage."""
+    result: dict[str, Any] = {}
+    for key, value in kwargs.items():
+        try:
+            json.dumps(value)
+            result[key] = value
+        except (TypeError, ValueError):
+            result[key] = f"<{type(value).__name__}: not serializable>"
+    return result
+
+
+class Grade:
+    """Factory for building ``EvaluationResult`` objects from ``SubScore`` items."""
+
+    @staticmethod
+    def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
+        """Combine subscores into a clipped reward and ready-to-yield result.
+
+        Positive weights are normalized to sum to ``1.0`` so the returned
+        ``EvaluationResult`` lines up with the SDK's subscore semantics.
+        Negative weights are preserved as penalties.
+        """
+
+        if not subscores:
+            raise ValueError("subscores must not be empty")
+
+        positive_weight_sum = sum(item.weight for item in subscores if item.weight > 0)
+        if positive_weight_sum <= 0:
+            raise ValueError("subscores must include at least one positive weight")
+
+        name_counts: dict[str, int] = {}
+        for item in subscores:
+            name_counts[item.name] = name_counts.get(item.name, 0) + 1
+
+        name_usage: dict[str, int] = {}
+        normalized_subscores: list[SubScore] = []
+        metadata: dict[str, Any] = {}
+
+        for item in subscores:
+            if name_counts[item.name] == 1:
+                final_name = item.name
+            else:
+                name_usage[item.name] = name_usage.get(item.name, 0) + 1
+                final_name = f"{item.name}-{name_usage[item.name]}"
+
+            normalized_weight = (
+                item.weight / positive_weight_sum if item.weight > 0 else item.weight
+            )
+            normalized_subscores.append(
+                SubScore(
+                    name=final_name,
+                    weight=normalized_weight,
+                    value=item.value,
+                    metadata=item.metadata,
+                )
+            )
+            if item.metadata is not None:
+                metadata[final_name] = item.metadata
+
+        reward = float(
+            min(
+                max(sum(item.value * item.weight for item in normalized_subscores), 0.0),
+                1.0,
+            )
+        )
+
+        return EvaluationResult(
+            reward=reward,
+            done=True,
+            subscores=normalized_subscores,
+            info=metadata,
+        )
+
+
+class Grader:
+    """Base class for reusable graders that emit ``SubScore`` objects."""
+
+    name: str = "BaseGrader"
+
+    @classmethod
+    def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScore:
+        """Run the grader and package the result as a ``SubScore``."""
+        result = cls.compute_score(**kwargs)
+
+        if isinstance(result, tuple):
+            score, metadata = result
+        else:
+            score = result
+            metadata = {}
+
+        return SubScore(
+            name=name or cls.name,
+            weight=weight,
+            value=float(score),
+            metadata={**metadata, "_parameters": _safe_params(kwargs)},
+        )
+
+    @classmethod
+    def compute_score(cls, *args: Any, **kwargs: Any) -> float | tuple[float, dict[str, Any]]:
+        """Compute a score between ``0.0`` and ``1.0``."""
+        raise NotImplementedError("Subclasses must implement compute_score")
+
+    @classmethod
+    def any(cls, weight: float, subscores: list[SubScore]) -> SubScore:
+        """Return a subscore that passes if any input subscore passes."""
+        if not subscores:
+            raise ValueError("subscores must not be empty")
+
+        return SubScore(
+            name=f"{cls.name}_any",
+            value=max(subscore.value for subscore in subscores),
+            weight=weight,
+            metadata={
+                "subscores": [subscore.name for subscore in subscores],
+                "subscore_metadata": {
+                    subscore.name: subscore.metadata
+                    for subscore in subscores
+                    if subscore.metadata is not None
+                },
+            },
+        )
+
+    @classmethod
+    def all(cls, weight: float, subscores: list[SubScore]) -> SubScore:
+        """Return a subscore that passes only if all input subscores pass."""
+        if not subscores:
+            raise ValueError("subscores must not be empty")
+
+        return SubScore(
+            name=f"{cls.name}_all",
+            value=min(subscore.value for subscore in subscores),
+            weight=weight,
+            metadata={
+                "subscores": [subscore.name for subscore in subscores],
+                "subscore_metadata": {
+                    subscore.name: subscore.metadata
+                    for subscore in subscores
+                    if subscore.metadata is not None
+                },
+            },
+        )
+
+
+class BashGrader(Grader):
+    """Run a shell command and score it by exit code."""
+
+    name = "BashGrader"
+
+    @classmethod
+    def compute_score(
+        cls,
+        command: str,
+        cwd: str | None = None,
+        timeout: int = 60,
+        **kwargs: Any,
+    ) -> tuple[float, dict[str, Any]]:
+        """Run ``command`` via ``bash -lc`` and return score plus execution metadata."""
+        del kwargs
+        logger.info("Running grader command: %s (cwd=%s, timeout=%ss)", command, cwd, timeout)
+        try:
+            result = subprocess.run(
+                ["/bin/bash", "-lc", command],
+                cwd=cwd,
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as exc:
+            stdout = (
+                (exc.stdout or b"").decode(errors="replace")
+                if isinstance(exc.stdout, bytes)
+                else (exc.stdout or "")
+            )
+            stderr = (
+                (exc.stderr or b"").decode(errors="replace")
+                if isinstance(exc.stderr, bytes)
+                else (exc.stderr or "")
+            )
+            return (
+                0.0,
+                {
+                    "exit_code": None,
+                    "stdout": stdout,
+                    "stderr": stderr,
+                    "timed_out": True,
+                    "timeout": timeout,
+                },
+            )
+
+        score = 1.0 if result.returncode == 0 else 0.0
+        return (
+            score,
+            {
+                "exit_code": result.returncode,
+                "stdout": result.stdout,
+                "stderr": result.stderr,
+            },
+        )
diff --git a/hud/native/tests/__init__.py b/hud/native/tests/__init__.py
new file mode 100644
index 000000000..c14ccf20b
--- /dev/null
+++ b/hud/native/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for native HUD helpers."""
diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py
new file mode 100644
index 000000000..5deff2f2b
--- /dev/null
+++ b/hud/native/tests/test_graders.py
@@ -0,0 +1,164 @@
+"""Tests for first-party HUD native graders."""
+
+from __future__ import annotations
+
+import pytest
+
+from hud.environment import Environment
+from hud.native.graders import BashGrader, Grade, Grader
+from hud.tools.types import EvaluationResult, SubScore
+
+
+class TestGrade:
+    def test_from_subscores_returns_evaluation_result(self) -> None:
+        result = Grade.from_subscores([SubScore(name="alpha", value=1.0, weight=1.0)])
+        assert isinstance(result, EvaluationResult)
+        assert result.reward == 1.0
+        assert result.done is True
+
+    def test_from_subscores_normalizes_positive_weights(self) -> None:
+        result = Grade.from_subscores(
+            [
+                SubScore(name="alpha", value=1.0, weight=2.0),
+                SubScore(name="beta", value=0.0, weight=1.0),
+            ]
+        )
+        assert result.reward == pytest.approx(2.0 / 3.0)
+        assert result.subscores is not None
+        by_name = {subscore.name: subscore for subscore in result.subscores}
+        assert by_name["alpha"].weight == pytest.approx(2.0 / 3.0)
+        assert by_name["beta"].weight == pytest.approx(1.0 / 3.0)
+
+    def test_from_subscores_preserves_negative_penalties(self) -> None:
+        result = Grade.from_subscores(
+            [
+                SubScore(name="correct", value=1.0, weight=1.0),
+                SubScore(name="penalty", value=1.0, weight=-0.2),
+            ]
+        )
+        assert result.reward == pytest.approx(0.8)
+        assert result.subscores is not None
+        by_name = {subscore.name: subscore for subscore in result.subscores}
+        assert by_name["correct"].weight == pytest.approx(1.0)
+        assert by_name["penalty"].weight == pytest.approx(-0.2)
+
+    def test_from_subscores_duplicate_names_are_deduped(self) -> None:
+        result = Grade.from_subscores(
+            [
+                SubScore(name="same", value=1.0, weight=0.5),
+                SubScore(name="same", value=0.0, weight=0.5),
+            ]
+        )
+        assert result.subscores is not None
+        assert [subscore.name for subscore in result.subscores] == ["same-1", "same-2"]
+
+    def test_from_subscores_propagates_metadata(self) -> None:
+        metadata = {"stdout": "ok"}
+        result = Grade.from_subscores(
+            [SubScore(name="grader", value=1.0, weight=1.0, metadata=metadata)]
+        )
+        assert result.info["grader"] == metadata
+        assert result.subscores is not None
+        assert result.subscores[0].metadata == metadata
+
+
+class TestGrader:
+    def test_grade_returns_subscore_and_stores_parameters(self) -> None:
+        class DummyGrader(Grader):
+            name = "DummyGrader"
+
+            @classmethod
+            def compute_score(cls, **kwargs: object) -> tuple[float, dict[str, object]]:
+                return 0.75, {"source": "dummy", "kwargs_seen": sorted(kwargs)}
+
+        subscore = DummyGrader.grade(weight=0.4, marker="ok", payload=object())
+        assert isinstance(subscore, SubScore)
+        assert subscore.name == "DummyGrader"
+        assert subscore.value == pytest.approx(0.75)
+        assert subscore.weight == pytest.approx(0.4)
+        assert subscore.metadata is not None
+        assert subscore.metadata["source"] == "dummy"
+        assert subscore.metadata["_parameters"]["marker"] == "ok"
+        assert subscore.metadata["_parameters"]["payload"] == "<object: not serializable>"
+
+
+class TestGraderCombinators:
+    def test_any_picks_max(self) -> None:
+        combined = Grader.any(
+            weight=1.0,
+            subscores=[
+                SubScore(name="a", value=1.0, weight=0.5),
+                SubScore(name="b", value=0.0, weight=0.5),
+            ],
+        )
+        assert combined.name == "BaseGrader_any"
+        assert combined.value == 1.0
+
+    def test_all_picks_min(self) -> None:
+        combined = Grader.all(
+            weight=1.0,
+            subscores=[
+                SubScore(name="a", value=1.0, weight=0.5),
+                SubScore(name="b", value=0.0, weight=0.5),
+            ],
+        )
+        assert combined.name == "BaseGrader_all"
+        assert combined.value == 0.0
+
+
+class TestBashGrader:
+    def test_compute_score_for_passing_command(self) -> None:
+        score, metadata = BashGrader.compute_score(command="echo hello")
+        assert score == 1.0
+        assert metadata["exit_code"] == 0
+        assert "hello" in metadata["stdout"]
+
+    def test_compute_score_for_failing_command(self) -> None:
+        score, metadata = BashGrader.compute_score(command="echo oops >&2 && false")
+        assert score == 0.0
+        assert metadata["exit_code"] != 0
+        assert "oops" in metadata["stderr"]
+
+    def test_compute_score_timeout(self) -> None:
+        score, metadata = BashGrader.compute_score(command="sleep 2", timeout=1)
+        assert score == 0.0
+        assert metadata["timed_out"] is True
+        assert metadata["timeout"] == 1
+
+    def test_compute_score_invalid_cwd_raises(self, tmp_path) -> None:
+        with pytest.raises(FileNotFoundError):
+            BashGrader.compute_score(command="true", cwd=str(tmp_path / "missing"))
+
+    def test_grade_and_from_subscores_compose(self) -> None:
+        passing = BashGrader.grade(weight=0.5, command="true")
+        failing = BashGrader.grade(weight=0.5, command="false")
+        result = Grade.from_subscores([passing, failing])
+        assert result.reward == pytest.approx(0.5)
+        assert result.info["BashGrader-1"]["exit_code"] == 0
+        assert result.info["BashGrader-2"]["exit_code"] != 0
+
+
+class TestScenarioIntegration:
+    @pytest.mark.asyncio
+    async def test_scenario_can_yield_grade_from_subscores(self) -> None:
+        env = Environment("test-env")
+
+        @env.scenario("bash-graded")
+        async def bash_graded_scenario():
+            yield "Run the verification"
+            yield Grade.from_subscores(
+                [BashGrader.grade(weight=1.0, command="echo verified")]
+            )
+
+        prompt = await env.run_scenario_setup("bash-graded", {})
+        assert prompt == "Run the verification"
+
+        assert env._active_session is not None
+        env._active_session.answer = "done"
+        result = await env.run_scenario_evaluate("bash-graded")
+
+        assert result is not None
+        assert result.reward == 1.0
+        assert result.subscores is not None
+        assert result.subscores[0].name == "BashGrader"
+        assert "verified" in result.info["BashGrader"]["stdout"]

From b84d7717abe920eb586aeb0c2b0b861953aad032 Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Tue, 24 Mar 2026 11:39:32 -0700
Subject: [PATCH 2/5] style: format grader tests

---
 hud/native/tests/test_graders.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py
index 5deff2f2b..76032d2b1 100644
--- a/hud/native/tests/test_graders.py
+++ b/hud/native/tests/test_graders.py
@@ -146,9 +146,7 @@ async def test_scenario_can_yield_grade_from_subscores(self) -> None:
         @env.scenario("bash-graded")
         async def bash_graded_scenario():
             yield "Run the verification"
-            yield Grade.from_subscores(
-                [BashGrader.grade(weight=1.0, command="echo verified")]
-            )
+            yield Grade.from_subscores([BashGrader.grade(weight=1.0, command="echo verified")])
 
         prompt = await env.run_scenario_setup("bash-graded", {})
         assert prompt == "Run the verification"

From 521b1fef331d7efff6ea83329a7c1fea1fa0538c Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Tue, 24 Mar 2026 11:49:22 -0700
Subject: [PATCH 3/5] fix: handle grader name collisions and penalties

---
 hud/native/graders.py            | 30 +++++++++++++++++----------
 hud/native/tests/test_graders.py | 35 ++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/hud/native/graders.py b/hud/native/graders.py
index 2cf5df39a..5d7288825 100644
--- a/hud/native/graders.py
+++ b/hud/native/graders.py
@@ -31,11 +31,12 @@ class Grade:
 
     @staticmethod
     def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
-        """Combine subscores into a clipped reward and ready-to-yield result.
+        """Combine subscores into a weighted reward and ready-to-yield result.
 
         Positive weights are normalized to sum to ``1.0`` so the returned
         ``EvaluationResult`` lines up with the SDK's subscore semantics.
-        Negative weights are preserved as penalties.
+        Negative weights are preserved as penalties, including when they drive
+        the final reward below zero.
         """
 
         if not subscores:
@@ -49,16 +50,28 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
         for item in subscores:
             name_counts[item.name] = name_counts.get(item.name, 0) + 1
 
+        reserved_names = {item.name for item in subscores}
         name_usage: dict[str, int] = {}
+        used_names: set[str] = set()
         normalized_subscores: list[SubScore] = []
         metadata: dict[str, Any] = {}
 
         for item in subscores:
-            if name_counts[item.name] == 1:
+            if name_counts[item.name] == 1 and item.name not in used_names:
                 final_name = item.name
             else:
-                name_usage[item.name] = name_usage.get(item.name, 0) + 1
-                final_name = f"{item.name}-{name_usage[item.name]}"
+                suffix = name_usage.get(item.name, 0)
+                while True:
+                    suffix += 1
+                    candidate = f"{item.name}-{suffix}"
+                    if candidate in used_names:
+                        continue
+                    if candidate in reserved_names:
+                        continue
+                    name_usage[item.name] = suffix
+                    final_name = candidate
+                    break
+            used_names.add(final_name)
 
             normalized_weight = (
                 item.weight / positive_weight_sum if item.weight > 0 else item.weight
@@ -74,12 +87,7 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
             if item.metadata is not None:
                 metadata[final_name] = item.metadata
 
-        reward = float(
-            min(
-                max(sum(item.value * item.weight for item in normalized_subscores), 0.0),
-                1.0,
-            )
-        )
+        reward = float(sum(item.value * item.weight for item in normalized_subscores))
 
         return EvaluationResult(
             reward=reward,
diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py
index 76032d2b1..806f2efd8 100644
--- a/hud/native/tests/test_graders.py
+++ b/hud/native/tests/test_graders.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import warnings
+
 import pytest
 
 from hud.environment import Environment
@@ -52,6 +54,24 @@ def test_from_subscores_duplicate_names_are_deduped(self) -> None:
         assert result.subscores is not None
         assert [subscore.name for subscore in result.subscores] == ["same-1", "same-2"]
 
+    def test_from_subscores_duplicate_names_avoid_existing_suffix_collisions(self) -> None:
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            result = Grade.from_subscores(
+                [
+                    SubScore(name="x-1", value=1.0, weight=0.3),
+                    SubScore(name="x", value=1.0, weight=0.4),
+                    SubScore(name="x", value=0.0, weight=0.6),
+                ]
+            )
+
+        assert result.subscores is not None
+        assert [subscore.name for subscore in result.subscores] == ["x-1", "x-2", "x-3"]
+        assert set(result.info) == set()
+        assert not [
+            warning for warning in caught if "Duplicate subscore names" in str(warning.message)
+        ]
+
     def test_from_subscores_propagates_metadata(self) -> None:
         metadata = {"stdout": "ok"}
         result = Grade.from_subscores(
@@ -61,6 +81,21 @@ def test_from_subscores_propagates_metadata(self) -> None:
         assert result.subscores is not None
         assert result.subscores[0].metadata == metadata
 
+    def test_from_subscores_preserves_negative_reward_without_validator_warning(self) -> None:
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            result = Grade.from_subscores(
+                [
+                    SubScore(name="correct", value=0.0, weight=1.0),
+                    SubScore(name="penalty", value=1.0, weight=-0.2),
+                ]
+            )
+
+        assert result.reward == pytest.approx(-0.2)
+        assert not [
+            warning for warning in caught if "Subscores don't match reward" in str(warning.message)
+        ]
+
 
 class TestGrader:
     def test_grade_returns_subscore_and_stores_parameters(self) -> None:

From 7edd5870567f0305aeeeec342a511d2ccc3042fd Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Wed, 25 Mar 2026 19:16:38 -0700
Subject: [PATCH 4/5] refactor grader serialization and add docs

---
 docs/docs.json                        |   1 +
 docs/reference/native-graders.mdx     | 118 ++++++++++++++++++++++++++
 hud/native/graders.py                 |  16 +---
 hud/telemetry/instrument.py           |   7 +-
 hud/utils/serialization.py            |  26 ++++++
 hud/utils/tests/test_serialization.py |  31 +++++++
 6 files changed, 180 insertions(+), 19 deletions(-)
 create mode 100644 docs/reference/native-graders.mdx
 create mode 100644 hud/utils/serialization.py
 create mode 100644 hud/utils/tests/test_serialization.py

diff --git a/docs/docs.json b/docs/docs.json
index 9463d6981..586485faa 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -87,6 +87,7 @@
             "pages": [
               "reference/environments",
               "reference/tools",
+              "reference/native-graders",
               "reference/evals",
               "reference/agents",
               "reference/types"
diff --git a/docs/reference/native-graders.mdx b/docs/reference/native-graders.mdx
new file mode 100644
index 000000000..79f3a061f
--- /dev/null
+++ b/docs/reference/native-graders.mdx
@@ -0,0 +1,118 @@
+---
+title: "Native Graders"
+description: "SDK reference for reusable first-party evaluation helpers in hud.native"
+icon: "scale-balanced"
+---
+
+`hud.native` includes reusable grader helpers for scenarios that want structured scoring without hand-building `EvaluationResult` objects each time.
+
+## Quick Example
+
+```python
+from hud import Environment
+from hud.native import BashGrader, Grade
+
+env = Environment("coding-env")
+
+@env.scenario("fix-tests")
+async def fix_tests():
+    yield "Make the checkout tests pass"
+
+    yield Grade.from_subscores(
+        [
+            BashGrader.grade(
+                weight=1.0,
+                command="pytest tests/test_checkout.py -q",
+                timeout=120,
+            )
+        ]
+    )
+```
+
+`Grade.from_subscores(...)` returns a normal `EvaluationResult`, so the result can be yielded directly from a scenario.
+
+## Grade
+
+`Grade.from_subscores(subscores)` combines `SubScore` values into a single `EvaluationResult`.
+
+Behavior:
+
+- Positive weights are normalized to sum to `1.0`
+- Negative weights are preserved as penalties; the reward is not clipped, so penalties can push it below `0.0`
+- Duplicate subscore names are made unique by appending a numeric suffix (for example `same-1`, `same-2`)
+- Per-subscore metadata is copied into `EvaluationResult.info`, keyed by the final (de-duplicated) subscore name
+
+```python
+from hud.native import Grade
+from hud.tools.types import SubScore
+
+result = Grade.from_subscores(
+    [
+        SubScore(name="tests", value=1.0, weight=0.8),
+        SubScore(name="style", value=0.5, weight=0.2),
+    ]
+)
+```
+
+## Grader
+
+`Grader` is the base class for reusable scoring helpers. Subclasses implement `compute_score(...)`, and `grade(...)` packages the result as a `SubScore`.
+
+```python
+from hud.native import Grader
+
+class MyGrader(Grader):
+    name = "MyGrader"
+
+    @classmethod
+    def compute_score(cls, passed: bool) -> float:
+        return 1.0 if passed else 0.0
+
+subscore = MyGrader.grade(weight=1.0, passed=True)
+```
+
+`grade(...)` also records JSON-safe copies of the grader parameters in subscore metadata under `_parameters`.
+
+## BashGrader
+
+`BashGrader` runs a command with `/bin/bash -lc` and scores it by exit code.
+
+```python
+from hud.native import BashGrader
+
+subscore = BashGrader.grade(
+    weight=1.0,
+    command="pytest tests/test_checkout.py -q",
+    timeout=120,
+)
+```
+
+Behavior:
+
+- exit code `0` -> score `1.0`
+- non-zero exit code -> score `0.0`
+- timeout -> score `0.0`; metadata sets `exit_code` to `None` and adds `timed_out` and `timeout`
+- metadata includes `stdout`, `stderr`, and `exit_code`
+
+## Combinators
+
+`Grader.any(...)` and `Grader.all(...)` combine multiple subscores into a single summary subscore that carries the `weight` you pass in; the weights of the input subscores are not used.
+
+```python
+from hud.native import BashGrader, Grader
+
+tests = BashGrader.grade(weight=0.5, command="pytest -q")
+lint = BashGrader.grade(weight=0.5, command="ruff check .")
+
+any_passes = Grader.any(weight=1.0, subscores=[tests, lint])
+all_pass = Grader.all(weight=1.0, subscores=[tests, lint])
+```
+
+- `any(...)` uses the maximum input score
+- `all(...)` uses the minimum input score
+
+## See Also
+
+- [Environments](/reference/environments)
+- [Evals](/reference/evals)
+- [Types](/reference/types)
diff --git a/hud/native/graders.py b/hud/native/graders.py
index 5d7288825..a88d210d8 100644
--- a/hud/native/graders.py
+++ b/hud/native/graders.py
@@ -2,30 +2,18 @@
 
 from __future__ import annotations
 
-import json
 import logging
 import subprocess
 from typing import Any
 
 from hud.tools.types import EvaluationResult, SubScore
+from hud.utils.serialization import json_safe_dict
 
 logger = logging.getLogger(__name__)
 
 __all__ = ["BashGrader", "Grade", "Grader"]
 
 
-def _safe_params(kwargs: dict[str, Any]) -> dict[str, Any]:
-    """Return a JSON-safe copy of grader parameters for metadata storage."""
-    result: dict[str, Any] = {}
-    for key, value in kwargs.items():
-        try:
-            json.dumps(value)
-            result[key] = value
-        except (TypeError, ValueError):
-            result[key] = f"<{type(value).__name__}: not serializable>"
-    return result
-
-
 class Grade:
     """Factory for building ``EvaluationResult`` objects from ``SubScore`` items."""
 
@@ -117,7 +105,7 @@ def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScor
             name=name or cls.name,
             weight=weight,
             value=float(score),
-            metadata={**metadata, "_parameters": _safe_params(kwargs)},
+            metadata={**metadata, "_parameters": json_safe_dict(kwargs)},
         )
 
     @classmethod
diff --git a/hud/telemetry/instrument.py b/hud/telemetry/instrument.py
index 94a62828e..13394a49f 100644
--- a/hud/telemetry/instrument.py
+++ b/hud/telemetry/instrument.py
@@ -29,6 +29,7 @@ async def my_function(arg1, arg2):
 
 from hud.telemetry.exporter import queue_span
 from hud.types import MCPToolResult, TraceStep
+from hud.utils.serialization import json_safe_value
 
 
 def _get_trace_id() -> str | None:
@@ -75,11 +76,7 @@ def _serialize_value(value: Any, max_items: int = 10) -> Any:
     elif isinstance(value, dict) and len(value) > max_items:
         value = dict(list(value.items())[:max_items])
 
-    try:
-        json_bytes = pydantic_core.to_json(value, fallback=str)
-        return json.loads(json_bytes)
-    except Exception:
-        return f"<{type(value).__name__}>"
+    return json_safe_value(value)
 
 
 def _now_iso() -> str:
diff --git a/hud/utils/serialization.py b/hud/utils/serialization.py
new file mode 100644
index 000000000..b71c15bf3
--- /dev/null
+++ b/hud/utils/serialization.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+import pydantic_core
+
+
+def _unserializable_placeholder(value: Any) -> str:
+    return f"<{type(value).__name__}: not serializable>"
+
+
+def json_safe_value(value: Any) -> Any:
+    """Serialize a value into JSON-compatible data when possible."""
+    if isinstance(value, str | int | float | bool | type(None)):
+        return value
+
+    try:
+        return json.loads(pydantic_core.to_json(value, fallback=_unserializable_placeholder))
+    except Exception:
+        return _unserializable_placeholder(value)
+
+
+def json_safe_dict(values: dict[str, Any]) -> dict[str, Any]:
+    """Serialize a mapping into JSON-compatible data."""
+    return {key: json_safe_value(value) for key, value in values.items()}
diff --git a/hud/utils/tests/test_serialization.py b/hud/utils/tests/test_serialization.py
new file mode 100644
index 000000000..1dfb92f25
--- /dev/null
+++ b/hud/utils/tests/test_serialization.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from hud.utils.serialization import json_safe_dict, json_safe_value
+
+
+def test_json_safe_value_serializes_dataclass() -> None:
+    @dataclass
+    class Demo:
+        name: str
+        count: int
+
+    result = json_safe_value(Demo(name="test", count=2))
+    assert result == {"name": "test", "count": 2}
+
+
+def test_json_safe_value_falls_back_for_unserializable_object() -> None:
+    class Weird:
+        def __init__(self) -> None:
+            raise RuntimeError("boom")
+
+    value = Weird.__new__(Weird)
+    result = json_safe_value(value)
+    assert isinstance(result, str)
+    assert "Weird" in result
+
+
+def test_json_safe_dict_serializes_each_value() -> None:
+    data = {"number": 1, "items": [1, 2, 3]}
+    assert json_safe_dict(data) == data

From f93def98aedd98c04349692a92186aa8f660de68 Mon Sep 17 00:00:00 2001
From: Nancy <najilau@ucsc.edu>
Date: Wed, 25 Mar 2026 19:34:38 -0700
Subject: [PATCH 5/5] fix: preserve duplicate grader metadata in combinators

---
 hud/native/graders.py            | 71 ++++++++++++++++++--------------
 hud/native/tests/test_graders.py | 32 ++++++++++++++
 2 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/hud/native/graders.py b/hud/native/graders.py
index a88d210d8..30e1930d2 100644
--- a/hud/native/graders.py
+++ b/hud/native/graders.py
@@ -14,6 +14,38 @@
 __all__ = ["BashGrader", "Grade", "Grader"]
 
 
+def _dedupe_subscore_names(subscores: list[SubScore]) -> list[str]:
+    """Return stable, unique names for a sequence of subscores."""
+    name_counts: dict[str, int] = {}
+    for item in subscores:
+        name_counts[item.name] = name_counts.get(item.name, 0) + 1
+
+    reserved_names = {item.name for item in subscores}
+    name_usage: dict[str, int] = {}
+    used_names: set[str] = set()
+    final_names: list[str] = []
+
+    for item in subscores:
+        if name_counts[item.name] == 1 and item.name not in used_names:
+            final_name = item.name
+        else:
+            suffix = name_usage.get(item.name, 0)
+            while True:
+                suffix += 1
+                candidate = f"{item.name}-{suffix}"
+                if candidate in used_names:
+                    continue
+                if candidate in reserved_names:
+                    continue
+                name_usage[item.name] = suffix
+                final_name = candidate
+                break
+        used_names.add(final_name)
+        final_names.append(final_name)
+
+    return final_names
+
+
 class Grade:
     """Factory for building ``EvaluationResult`` objects from ``SubScore`` items."""
 
@@ -34,33 +66,10 @@ def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
         if positive_weight_sum <= 0:
             raise ValueError("subscores must include at least one positive weight")
 
-        name_counts: dict[str, int] = {}
-        for item in subscores:
-            name_counts[item.name] = name_counts.get(item.name, 0) + 1
-
-        reserved_names = {item.name for item in subscores}
-        name_usage: dict[str, int] = {}
-        used_names: set[str] = set()
         normalized_subscores: list[SubScore] = []
         metadata: dict[str, Any] = {}
 
-        for item in subscores:
-            if name_counts[item.name] == 1 and item.name not in used_names:
-                final_name = item.name
-            else:
-                suffix = name_usage.get(item.name, 0)
-                while True:
-                    suffix += 1
-                    candidate = f"{item.name}-{suffix}"
-                    if candidate in used_names:
-                        continue
-                    if candidate in reserved_names:
-                        continue
-                    name_usage[item.name] = suffix
-                    final_name = candidate
-                    break
-            used_names.add(final_name)
-
+        for item, final_name in zip(subscores, _dedupe_subscore_names(subscores), strict=True):
             normalized_weight = (
                 item.weight / positive_weight_sum if item.weight > 0 else item.weight
             )
@@ -119,15 +128,16 @@ def any(cls, weight: float, subscores: list[SubScore]) -> SubScore:
         if not subscores:
             raise ValueError("subscores must not be empty")
 
+        unique_names = _dedupe_subscore_names(subscores)
         return SubScore(
             name=f"{cls.name}_any",
             value=max(subscore.value for subscore in subscores),
             weight=weight,
             metadata={
-                "subscores": [subscore.name for subscore in subscores],
+                "subscores": unique_names,
                 "subscore_metadata": {
-                    subscore.name: subscore.metadata
-                    for subscore in subscores
+                    unique_name: subscore.metadata
+                    for unique_name, subscore in zip(unique_names, subscores, strict=True)
                     if subscore.metadata is not None
                 },
             },
@@ -139,15 +149,16 @@ def all(cls, weight: float, subscores: list[SubScore]) -> SubScore:
         if not subscores:
             raise ValueError("subscores must not be empty")
 
+        unique_names = _dedupe_subscore_names(subscores)
         return SubScore(
             name=f"{cls.name}_all",
             value=min(subscore.value for subscore in subscores),
             weight=weight,
             metadata={
-                "subscores": [subscore.name for subscore in subscores],
+                "subscores": unique_names,
                 "subscore_metadata": {
-                    subscore.name: subscore.metadata
-                    for subscore in subscores
+                    unique_name: subscore.metadata
+                    for unique_name, subscore in zip(unique_names, subscores, strict=True)
                     if subscore.metadata is not None
                 },
             },
diff --git a/hud/native/tests/test_graders.py b/hud/native/tests/test_graders.py
index 806f2efd8..00ff2cec1 100644
--- a/hud/native/tests/test_graders.py
+++ b/hud/native/tests/test_graders.py
@@ -129,6 +129,22 @@ def test_any_picks_max(self) -> None:
         assert combined.name == "BaseGrader_any"
         assert combined.value == 1.0
 
+    def test_any_preserves_metadata_for_duplicate_named_subscores(self) -> None:
+        combined = Grader.any(
+            weight=1.0,
+            subscores=[  # same name twice, different metadata
+                SubScore(name="BashGrader", value=1.0, weight=0.5, metadata={"exit_code": 0}),
+                SubScore(name="BashGrader", value=0.0, weight=0.5, metadata={"exit_code": 1}),
+            ],
+        )
+        assert combined.metadata == {  # both entries must survive under distinct keys
+            "subscores": ["BashGrader-1", "BashGrader-2"],
+            "subscore_metadata": {
+                "BashGrader-1": {"exit_code": 0},
+                "BashGrader-2": {"exit_code": 1},
+            },
+        }
+
     def test_all_picks_min(self) -> None:
         combined = Grader.all(
             weight=1.0,
@@ -140,6 +156,22 @@ def test_all_picks_min(self) -> None:
         assert combined.name == "BaseGrader_all"
         assert combined.value == 0.0
 
+    def test_all_preserves_metadata_for_duplicate_named_subscores(self) -> None:
+        combined = Grader.all(
+            weight=1.0,
+            subscores=[  # same name twice, different metadata
+                SubScore(name="BashGrader", value=1.0, weight=0.5, metadata={"exit_code": 0}),
+                SubScore(name="BashGrader", value=0.0, weight=0.5, metadata={"exit_code": 1}),
+            ],
+        )
+        assert combined.metadata == {  # both entries must survive under distinct keys
+            "subscores": ["BashGrader-1", "BashGrader-2"],
+            "subscore_metadata": {
+                "BashGrader-1": {"exit_code": 0},
+                "BashGrader-2": {"exit_code": 1},
+            },
+        }
+
 
 class TestBashGrader:
     def test_compute_score_for_passing_command(self) -> None: