Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/docs.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
"pages": [
"reference/environments",
"reference/tools",
"reference/native-graders",
"reference/evals",
"reference/agents",
"reference/types"
Expand Down
118 changes: 118 additions & 0 deletions docs/reference/native-graders.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
---
title: "Native Graders"
description: "SDK reference for reusable first-party evaluation helpers in hud.native"
icon: "scale-balanced"
---

`hud.native` includes reusable grader helpers for scenarios that want structured scoring without hand-building `EvaluationResult` objects each time.

## Quick Example

```python
from hud import Environment
from hud.native import BashGrader, Grade

env = Environment("coding-env")

@env.scenario("fix-tests")
async def fix_tests():
yield "Make the checkout tests pass"

yield Grade.from_subscores(
[
BashGrader.grade(
weight=1.0,
command="pytest tests/test_checkout.py -q",
timeout=120,
)
]
)
```

`Grade.from_subscores(...)` returns a normal `EvaluationResult`, so the result can be yielded directly from a scenario.

## Grade

`Grade.from_subscores(subscores)` combines `SubScore` values into a single `EvaluationResult`.

Behavior:

- Positive weights are normalized to sum to `1.0`
- Negative weights are left unnormalized and act as penalties, so the final reward can drop below zero
- Duplicate subscore names are kept and made unique by appending a numeric suffix (for example `tests-1`, `tests-2`)
- Per-subscore metadata is copied into `EvaluationResult.info`

```python
from hud.native import Grade
from hud.tools.types import SubScore

result = Grade.from_subscores(
[
SubScore(name="tests", value=1.0, weight=0.8),
SubScore(name="style", value=0.5, weight=0.2),
]
)
```

## Grader

`Grader` is the base class for reusable scoring helpers. Subclasses implement `compute_score(...)`, and `grade(...)` packages the result as a `SubScore`.

```python
from hud.native import Grader

class MyGrader(Grader):
name = "MyGrader"

@classmethod
def compute_score(cls, passed: bool) -> float:
return 1.0 if passed else 0.0

subscore = MyGrader.grade(weight=1.0, passed=True)
```

`grade(...)` also records JSON-safe copies of the grader parameters in subscore metadata under `_parameters`.

## BashGrader

`BashGrader` runs a command with `/bin/bash -lc` and scores it by exit code.

```python
from hud.native import BashGrader

subscore = BashGrader.grade(
weight=1.0,
command="pytest tests/test_checkout.py -q",
timeout=120,
)
```

Behavior:

- exit code `0` -> score `1.0`
- non-zero exit code -> score `0.0`
- timeout -> score `0.0`; metadata sets `timed_out` to `True`, records the `timeout` value, and reports `exit_code` as `None`
- metadata includes `stdout`, `stderr`, and `exit_code`

## Combinators

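`Grader.any(...)` and `Grader.all(...)` combine multiple subscores into a single summary subscore. Only the input values are used: the weights of the input subscores are ignored, and the summary carries the `weight` passed to the combinator.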

```python
from hud.native import BashGrader, Grader

tests = BashGrader.grade(weight=0.5, command="pytest -q")
lint = BashGrader.grade(weight=0.5, command="ruff check .")

any_passes = Grader.any(weight=1.0, subscores=[tests, lint])
all_pass = Grader.all(weight=1.0, subscores=[tests, lint])
```

- `any(...)` uses the maximum input score
- `all(...)` uses the minimum input score

## See Also

- [Environments](/reference/environments)
- [Evals](/reference/evals)
- [Types](/reference/types)
6 changes: 5 additions & 1 deletion hud/native/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
"""Native environments bundled with the HUD SDK."""
"""Native environments and helpers bundled with the HUD SDK."""

from hud.native.graders import BashGrader, Grade, Grader

__all__ = ["BashGrader", "Grade", "Grader"]
222 changes: 222 additions & 0 deletions hud/native/graders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
"""Generic graders for native HUD evaluation."""

from __future__ import annotations

import logging
import subprocess
from typing import Any

from hud.tools.types import EvaluationResult, SubScore
from hud.utils.serialization import json_safe_dict

# Module-level logger named after this module; BashGrader logs each command it runs here.
logger = logging.getLogger(__name__)

# Public names exported by this module.
__all__ = ["BashGrader", "Grade", "Grader"]


def _dedupe_subscore_names(subscores: list[SubScore]) -> list[str]:
    """Assign every subscore a distinct name, preserving input order.

    A name that occurs exactly once in the input is kept unchanged. Each
    occurrence of a repeated name becomes ``"<name>-<k>"``, where ``k`` counts
    up from 1 and skips any value whose result would equal a name that is
    already present in the input or has already been handed out.

    Returns:
        One name per input subscore, in the same order as ``subscores``.
    """
    originals = [entry.name for entry in subscores]

    # How many times each original name appears.
    occurrences: dict[str, int] = {}
    for label in originals:
        occurrences[label] = occurrences.get(label, 0) + 1

    taken_by_input = set(originals)
    last_suffix: dict[str, int] = {}
    assigned: set[str] = set()
    resolved: list[str] = []

    for label in originals:
        if occurrences[label] == 1 and label not in assigned:
            chosen = label
        else:
            # Resume after the last suffix used for this name so repeated
            # names are numbered in order of appearance.
            counter = last_suffix.get(label, 0) + 1
            chosen = f"{label}-{counter}"
            while chosen in assigned or chosen in taken_by_input:
                counter += 1
                chosen = f"{label}-{counter}"
            last_suffix[label] = counter
        assigned.add(chosen)
        resolved.append(chosen)

    return resolved


class Grade:
    """Namespace for turning ``SubScore`` lists into ``EvaluationResult`` objects."""

    @staticmethod
    def from_subscores(subscores: list[SubScore]) -> EvaluationResult:
        """Build a finished ``EvaluationResult`` from weighted subscores.

        Positive weights are rescaled so that together they sum to ``1.0``.
        Zero and negative weights are passed through unchanged, so a negative
        weight acts as a penalty and may push the reward below zero.
        Subscore names are made unique before the result is assembled, and
        each subscore's metadata (when not ``None``) is copied into ``info``
        under its unique name.

        Args:
            subscores: Subscores to combine; at least one must have a
                positive weight.

        Returns:
            An ``EvaluationResult`` with ``done=True``, the weighted reward,
            the rescaled subscores, and the collected metadata as ``info``.

        Raises:
            ValueError: If ``subscores`` is empty or has no positive weight.
        """
        if not subscores:
            raise ValueError("subscores must not be empty")

        total_positive = sum(entry.weight for entry in subscores if entry.weight > 0)
        if total_positive <= 0:
            raise ValueError("subscores must include at least one positive weight")

        unique_names = _dedupe_subscore_names(subscores)
        rescaled: list[SubScore] = []
        info: dict[str, Any] = {}

        for unique_name, entry in zip(unique_names, subscores, strict=True):
            if entry.weight > 0:
                scaled_weight = entry.weight / total_positive
            else:
                # Penalties (and zero weights) are not part of the normalization.
                scaled_weight = entry.weight

            rescaled.append(
                SubScore(
                    name=unique_name,
                    weight=scaled_weight,
                    value=entry.value,
                    metadata=entry.metadata,
                )
            )
            if entry.metadata is not None:
                info[unique_name] = entry.metadata

        total = sum(scored.value * scored.weight for scored in rescaled)

        return EvaluationResult(
            reward=float(total),
            done=True,
            subscores=rescaled,
            info=info,
        )


class Grader:
    """Base class for reusable graders that emit ``SubScore`` objects.

    Subclasses override :meth:`compute_score`; :meth:`grade` runs it and wraps
    the outcome in a ``SubScore``. :meth:`any` and :meth:`all` fold several
    existing subscores into one summary subscore.
    """

    # Default subscore name; also the prefix of the combinator subscore names.
    name: str = "BaseGrader"

    @classmethod
    def grade(cls, weight: float, name: str | None = None, **kwargs: Any) -> SubScore:
        """Run the grader and package the result as a ``SubScore``.

        Args:
            weight: Weight stored on the returned subscore.
            name: Subscore name; falls back to the class-level ``name`` when
                omitted or empty.
            **kwargs: Forwarded unchanged to :meth:`compute_score`.

        Returns:
            A ``SubScore`` whose metadata holds whatever ``compute_score``
            returned as metadata plus a ``"_parameters"`` entry containing
            ``kwargs`` passed through ``json_safe_dict``.
        """
        result = cls.compute_score(**kwargs)

        # compute_score may return a bare score or a (score, metadata) pair.
        if isinstance(result, tuple):
            score, metadata = result
        else:
            score, metadata = result, {}

        return SubScore(
            name=name or cls.name,
            weight=weight,
            value=float(score),
            metadata={**metadata, "_parameters": json_safe_dict(kwargs)},
        )

    @classmethod
    def compute_score(cls, *args: Any, **kwargs: Any) -> float | tuple[float, dict[str, Any]]:
        """Compute a score between ``0.0`` and ``1.0``.

        Returns:
            Either the score alone or a ``(score, metadata)`` tuple.

        Raises:
            NotImplementedError: Always, on the base class; subclasses must
                override this method.
        """
        raise NotImplementedError("Subclasses must implement compute_score")

    @classmethod
    def _summarize(
        cls,
        suffix: str,
        value: float,
        weight: float,
        subscores: list[SubScore],
    ) -> SubScore:
        """Build the summary subscore shared by :meth:`any` and :meth:`all`."""
        unique_names = _dedupe_subscore_names(subscores)
        return SubScore(
            name=f"{cls.name}_{suffix}",
            value=value,
            weight=weight,
            metadata={
                "subscores": unique_names,
                # Inputs without metadata are listed above but omitted here.
                "subscore_metadata": {
                    unique_name: subscore.metadata
                    for unique_name, subscore in zip(unique_names, subscores, strict=True)
                    if subscore.metadata is not None
                },
            },
        )

    @classmethod
    def any(cls, weight: float, subscores: list[SubScore]) -> SubScore:
        """Return a subscore that passes if any input subscore passes.

        The value is the maximum of the input values; the weights of the
        inputs are not used.

        Raises:
            ValueError: If ``subscores`` is empty.
        """
        if not subscores:
            raise ValueError("subscores must not be empty")
        best = max(subscore.value for subscore in subscores)
        return cls._summarize("any", best, weight, subscores)

    @classmethod
    def all(cls, weight: float, subscores: list[SubScore]) -> SubScore:
        """Return a subscore that passes only if all input subscores pass.

        The value is the minimum of the input values; the weights of the
        inputs are not used.

        Raises:
            ValueError: If ``subscores`` is empty.
        """
        if not subscores:
            raise ValueError("subscores must not be empty")
        worst = min(subscore.value for subscore in subscores)
        return cls._summarize("all", worst, weight, subscores)


class BashGrader(Grader):
    """Grader that executes a shell command and maps its exit status to a score."""

    name = "BashGrader"

    @classmethod
    def compute_score(
        cls,
        command: str,
        cwd: str | None = None,
        timeout: int = 60,
        **kwargs: Any,
    ) -> tuple[float, dict[str, Any]]:
        """Execute ``command`` with ``/bin/bash -lc`` and score the outcome.

        Args:
            command: Shell command line handed to bash.
            cwd: Working directory for the command, or ``None`` to leave it
                unset.
            timeout: Seconds to wait before the run is abandoned.
            **kwargs: Accepted and discarded.

        Returns:
            ``(score, metadata)``. The score is ``1.0`` for exit code 0 and
            ``0.0`` otherwise, including on timeout. Metadata always has
            ``exit_code``, ``stdout`` and ``stderr``; on timeout ``exit_code``
            is ``None`` and ``timed_out`` / ``timeout`` are added.
        """
        del kwargs

        def _as_text(captured: Any) -> str:
            # Output attached to TimeoutExpired may be bytes, str or None.
            if isinstance(captured, bytes):
                return captured.decode(errors="replace")
            return captured or ""

        logger.info("Running grader command: %s (cwd=%s, timeout=%ss)", command, cwd, timeout)
        argv = ["/bin/bash", "-lc", command]
        try:
            completed = subprocess.run(
                argv,
                cwd=cwd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired as expired:
            partial_out = _as_text(expired.stdout)
            partial_err = _as_text(expired.stderr)
            timeout_info: dict[str, Any] = {
                "exit_code": None,
                "stdout": partial_out,
                "stderr": partial_err,
                "timed_out": True,
                "timeout": timeout,
            }
            return 0.0, timeout_info

        run_info: dict[str, Any] = {
            "exit_code": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }
        if completed.returncode == 0:
            return 1.0, run_info
        return 0.0, run_info
1 change: 1 addition & 0 deletions hud/native/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for native HUD helpers."""
Loading
Loading