hud-evals · lorenss-m · Mar 27, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/docs/docs.json b/docs/docs.json
@@ -87,6 +87,7 @@
             "pages": [
               "reference/environments",
               "reference/tools",
+              "reference/native-graders",
               "reference/evals",
               "reference/agents",
               "reference/types"

diff --git a/docs/reference/native-graders.mdx b/docs/reference/native-graders.mdx
@@ -0,0 +1,118 @@
+---
+title: "Native Graders"
+description: "SDK reference for reusable first-party evaluation helpers in hud.native"
+icon: "scale-balanced"
+---
+
+`hud.native` includes reusable grader helpers for scenarios that want structured scoring without hand-building `EvaluationResult` objects each time.
+
+## Quick Example
+
+```python
+from hud import Environment
+from hud.native import BashGrader, Grade
+
+env = Environment("coding-env")
+
+@env.scenario("fix-tests")
+async def fix_tests():
+    yield "Make the checkout tests pass"
+
+    yield Grade.from_subscores(
+        [
+            BashGrader.grade(
+                weight=1.0,
+                command="pytest tests/test_checkout.py -q",
+                timeout=120,
+            )
+        ]
+    )
+```
+
+`Grade.from_subscores(...)` returns a normal `EvaluationResult`, so the result can be yielded directly from a scenario.
+
+## Grade
+
+`Grade.from_subscores(subscores)` combines `SubScore` values into a single `EvaluationResult`.
+
+Behavior:
+
+- Positive weights are normalized to sum to `1.0`
+- Negative weights are preserved as penalties
+- Duplicate subscore names are made unique with numeric suffixes (two `tests` entries become `tests-1` and `tests-2`)
+- Per-subscore metadata is copied into `EvaluationResult.info`
+
+```python
+from hud.native import Grade
+from hud.tools.types import SubScore
+
+result = Grade.from_subscores(
+    [
+        SubScore(name="tests", value=1.0, weight=0.8),
+        SubScore(name="style", value=0.5, weight=0.2),
+    ]
+)
+```
+
+## Grader
+
+`Grader` is the base class for reusable scoring helpers. Subclasses implement `compute_score(...)`, which returns either a score or a `(score, metadata)` tuple, and `grade(...)` packages the result as a `SubScore`.
+
+```python
+from hud.native import Grader
+
+class MyGrader(Grader):
+    name = "MyGrader"
+
+    @classmethod
+    def compute_score(cls, passed: bool) -> float:
+        return 1.0 if passed else 0.0
+
+subscore = MyGrader.grade(weight=1.0, passed=True)
+```
+
+`grade(...)` also records JSON-safe copies of the grader parameters in subscore metadata under `_parameters`.
+
+## BashGrader
+
+`BashGrader` runs a command with `/bin/bash -lc` and scores it by exit code.
+
+```python
+from hud.native import BashGrader
+
+subscore = BashGrader.grade(
+    weight=1.0,
+    command="pytest tests/test_checkout.py -q",
+    timeout=120,
+)
+```
+
+Behavior:
+
+- exit code `0` -> score `1.0`
+- non-zero exit code -> score `0.0`
+- timeout -> score `0.0`; metadata gains `timed_out: True` and `timeout`, and `exit_code` is `None`
+- metadata includes `stdout`, `stderr`, and `exit_code`
+
+## Combinators
+
+`Grader.any(...)` and `Grader.all(...)` combine multiple subscores into a single summary subscore.
+
+```python
+from hud.native import BashGrader, Grader
+
+tests = BashGrader.grade(weight=0.5, command="pytest -q")
+lint = BashGrader.grade(weight=0.5, command="ruff check .")
+
+any_passes = Grader.any(weight=1.0, subscores=[tests, lint])
+all_pass = Grader.all(weight=1.0, subscores=[tests, lint])
+```
+
+- `any(...)` uses the maximum input score; the inputs' own weights are ignored
+- `all(...)` uses the minimum input score; the inputs' own weights are ignored
+
+## See Also
+
+- [Environments](/reference/environments)
+- [Evals](/reference/evals)
+- [Types](/reference/types)
diff --git a/hud/native/__init__.py b/hud/native/__init__.py
@@ -1 +1,5 @@
-"""Native environments bundled with the HUD SDK."""
+"""Native environments and helpers bundled with the HUD SDK."""
+
+from hud.native.graders import BashGrader, Grade, Grader
+
+__all__ = ["BashGrader", "Grade", "Grader"]
diff --git a/hud/native/graders.py b/hud/native/graders.py
@@ -0,0 +1,222 @@
+"""Generic graders for native HUD evaluation."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from typing import Any
+
+from hud.tools.types import EvaluationResult, SubScore
+from hud.utils.serialization import json_safe_dict
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["BashGrader", "Grade", "Grader"]
+
+
+def _dedupe_subscore_names(subscores: list[SubScore]) -> list[str]:
+    """Assign each subscore a unique name, preserving input order.
+
+    A name that occurs once is kept verbatim. Every occurrence of a repeated
+    name becomes ``<name>-<k>``, with ``k`` counting up from 1 and skipping any
+    value that would collide with an input name or an already-assigned name.
+    """
+    originals = [entry.name for entry in subscores]
+    occurrences: dict[str, int] = {}
+    for label in originals:
+        occurrences[label] = occurrences.get(label, 0) + 1
+
+    taken: set[str] = set()  # names handed out so far
+    blocked = set(originals)  # a suffixed name must never shadow an input name
+    last_suffix: dict[str, int] = {}  # numbering resumes here for repeated names
+    resolved: list[str] = []
+
+    for label in originals:
+        chosen = label
+        # Repeated names always take a suffix, including their first occurrence.
+        if occurrences[label] != 1 or label in taken:
+            index = last_suffix.get(label, 0) + 1
+            while f"{label}-{index}" in taken or f"{label}-{index}" in blocked:
+                index += 1
+            last_suffix[label] = index
+            chosen = f"{label}-{index}"
+        taken.add(chosen)
+        resolved.append(chosen)
+
+    return resolved
+
+
+class Grade:
+    """Namespace for turning ``SubScore`` lists into ``EvaluationResult`` objects."""
+
+    @staticmethod
+    def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
+        """Build a finished ``EvaluationResult`` from weighted subscores.
+
+        Each positive weight is divided by the sum of all positive weights, so
+        the positive weights of the result add up to ``1.0``. Other weights
+        pass through unchanged, which lets negative weights act as penalties
+        that can drive the reward below zero. Names are made unique, and each
+        non-``None`` subscore metadata is exposed in ``info`` under that name.
+
+        Raises:
+            ValueError: If ``subscores`` is empty or has no positive weight.
+        """
+        if not subscores:
+            raise ValueError("subscores must not be empty")
+
+        total_positive = sum(entry.weight for entry in subscores if entry.weight > 0)
+        if total_positive <= 0:
+            raise ValueError("subscores must include at least one positive weight")
+
+        unique_names = _dedupe_subscore_names(subscores)
+        # Only positive weights take part in normalization.
+        scaled = [
+            SubScore(
+                name=label,
+                weight=entry.weight / total_positive if entry.weight > 0 else entry.weight,
+                value=entry.value,
+                metadata=entry.metadata,
+            )
+            for entry, label in zip(subscores, unique_names, strict=True)
+        ]
+        info = {
+            label: entry.metadata
+            for entry, label in zip(subscores, unique_names, strict=True)
+            if entry.metadata is not None
+        }
+
+        return EvaluationResult(
+            reward=float(sum(entry.value * entry.weight for entry in scaled)),
+            done=True,
+            subscores=scaled,
+            info=info,
+        )
+
+
+class Grader:
+    """Base class for reusable graders that report results as ``SubScore`` objects.
+
+    Subclasses override ``compute_score``; callers invoke ``grade``.
+    """
+
+    name: str = "BaseGrader"
+
+    @classmethod
+    def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScore:
+        """Score via ``compute_score(**kwargs)`` and wrap the outcome in a ``SubScore``.
+
+        ``compute_score`` may return a bare score or a ``(score, metadata)``
+        tuple. The subscore is named ``name`` when that is truthy, otherwise
+        ``cls.name``. Its metadata always carries ``json_safe_dict(kwargs)``
+        under ``"_parameters"``, replacing any such key the grader returned.
+        """
+        outcome = cls.compute_score(**kwargs)
+        raw_score, details = outcome if isinstance(outcome, tuple) else (outcome, {})
+
+        return SubScore(
+            name=name or cls.name,
+            weight=weight,
+            value=float(raw_score),
+            metadata={**details, "_parameters": json_safe_dict(kwargs)},
+        )
+
+    @classmethod
+    def compute_score(cls, *args: Any, **kwargs: Any) -> float | tuple[float, dict[str, Any]]:
+        """Return a score in ``[0.0, 1.0]``, optionally paired with a metadata dict.
+
+        The base implementation always raises ``NotImplementedError``.
+        """
+        raise NotImplementedError("Subclasses must implement compute_score")
+
+    @classmethod
+    def any(cls, weight: float, subscores: list[SubScore]) -> SubScore:
+        """Return a summary subscore valued at the maximum of the input values."""
+        return cls._combine(f"{cls.name}_any", max, weight, subscores)
+
+    @classmethod
+    def all(cls, weight: float, subscores: list[SubScore]) -> SubScore:
+        """Return a summary subscore valued at the minimum of the input values."""
+        return cls._combine(f"{cls.name}_all", min, weight, subscores)
+
+    @classmethod
+    def _combine(
+        cls, name: str, reducer: Any, weight: float, subscores: list[SubScore]
+    ) -> SubScore:
+        """Build the summary subscore shared by ``any`` and ``all``.
+
+        ``reducer`` picks the value; input weights are ignored. Raises
+        ``ValueError`` when ``subscores`` is empty.
+        """
+        if not subscores:
+            raise ValueError("subscores must not be empty")
+
+        labels = _dedupe_subscore_names(subscores)
+        attached = {
+            label: entry.metadata
+            for label, entry in zip(labels, subscores, strict=True)
+            if entry.metadata is not None
+        }
+        return SubScore(
+            name=name,
+            value=reducer(entry.value for entry in subscores),
+            weight=weight,
+            metadata={"subscores": labels, "subscore_metadata": attached},
+        )
+
+
+class BashGrader(Grader):
+    """Run a shell command and score it by exit code."""
+
+    name = "BashGrader"
+
+    @staticmethod
+    def _as_text(stream: str | bytes | None) -> str:
+        """Return captured output as ``str``; ``None`` becomes an empty string."""
+        return stream.decode(errors="replace") if isinstance(stream, bytes) else (stream or "")
+
+    @classmethod
+    def compute_score(
+        cls,
+        command: str,
+        cwd: str | None = None,
+        timeout: int = 60,
+        **kwargs: Any,
+    ) -> tuple[float, dict[str, Any]]:
+        """Run ``command`` via ``/bin/bash -lc`` and return ``(score, metadata)``.
+
+        ``score`` is ``1.0`` for exit code ``0`` and ``0.0`` otherwise, including
+        when ``timeout`` seconds elapse. ``metadata`` holds ``exit_code``,
+        ``stdout`` and ``stderr``; on timeout ``exit_code`` is ``None`` and
+        ``timed_out``/``timeout`` are added. Extra ``kwargs`` are ignored.
+        """
+        del kwargs
+        logger.info("Running grader command: %s (cwd=%s, timeout=%ss)", command, cwd, timeout)
+        try:
+            result = subprocess.run(
+                ["/bin/bash", "-lc", command],
+                cwd=cwd,
+                capture_output=True,
+                text=True,
+                # Undecodable output must not abort grading with a
+                # UnicodeDecodeError; the timeout path uses the same policy.
+                errors="replace",
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as exc:
+            # exc.stdout / exc.stderr may be bytes, str, or None; normalize to str.
+            return (
+                0.0,
+                {
+                    "exit_code": None,
+                    "stdout": cls._as_text(exc.stdout),
+                    "stderr": cls._as_text(exc.stderr),
+                    "timed_out": True,
+                    "timeout": timeout,
+                },
+            )
+
+        return (
+            1.0 if result.returncode == 0 else 0.0,
+            {"exit_code": result.returncode, "stdout": result.stdout, "stderr": result.stderr},
+        )
diff --git a/hud/native/tests/__init__.py b/hud/native/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for native HUD helpers."""