diff --git a/.github/actions/eval/action.yml b/.github/actions/eval/action.yml
new file mode 100644
index 0000000..01d8ef3
--- /dev/null
+++ b/.github/actions/eval/action.yml
@@ -0,0 +1,176 @@
+name: "Selectools Eval"
+description: "Run selectools eval suite and post results as a PR comment"
+branding:
+ icon: "check-circle"
+ color: "blue"
+
+inputs:
+ cases:
+ description: "Path to test cases file (JSON/YAML)"
+ required: true
+ provider:
+ description: "Provider: local, openai, anthropic, gemini, ollama"
+ default: "local"
+ model:
+ description: "Model name (optional, uses provider default)"
+ required: false
+ name:
+ description: "Suite name"
+ default: "eval"
+ concurrency:
+ description: "Max parallel cases"
+ default: "1"
+ baseline-dir:
+ description: "Baseline directory for regression detection"
+ default: ""
+ html-report:
+ description: "Path to write HTML report"
+ default: ""
+ junit-report:
+ description: "Path to write JUnit XML report"
+ default: ""
+ python-version:
+ description: "Python version"
+ default: "3.13"
+ post-comment:
+ description: "Post results as PR comment (true/false)"
+ default: "true"
+
+outputs:
+ accuracy:
+ description: "Eval accuracy (0.0 - 1.0)"
+ value: ${{ steps.run-eval.outputs.accuracy }}
+ pass-count:
+ description: "Number of passing cases"
+ value: ${{ steps.run-eval.outputs.pass_count }}
+ fail-count:
+ description: "Number of failing cases"
+ value: ${{ steps.run-eval.outputs.fail_count }}
+ regression:
+ description: "Whether regressions were detected"
+ value: ${{ steps.run-eval.outputs.regression }}
+
+runs:
+ using: "composite"
+ steps:
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ inputs.python-version }}
+
+ - name: Install selectools
+ shell: bash
+ run: pip install selectools
+
+ - name: Run eval suite
+ id: run-eval
+ shell: bash
+ run: |
+ set +e
+
+ ARGS="run ${{ inputs.cases }} --name ${{ inputs.name }} --provider ${{ inputs.provider }} --concurrency ${{ inputs.concurrency }} --json /tmp/eval-results.json --verbose"
+
+ if [ -n "${{ inputs.model }}" ]; then
+ ARGS="$ARGS --model ${{ inputs.model }}"
+ fi
+ if [ -n "${{ inputs.html-report }}" ]; then
+ ARGS="$ARGS --html ${{ inputs.html-report }}"
+ fi
+ if [ -n "${{ inputs.junit-report }}" ]; then
+ ARGS="$ARGS --junit ${{ inputs.junit-report }}"
+ fi
+ if [ -n "${{ inputs.baseline-dir }}" ]; then
+ ARGS="$ARGS --baseline ${{ inputs.baseline-dir }}"
+ fi
+
+ python -m selectools.evals $ARGS
+ EXIT_CODE=$?
+
+ # Parse JSON results for outputs
+ if [ -f /tmp/eval-results.json ]; then
+ ACCURACY=$(python -c "import json; d=json.load(open('/tmp/eval-results.json')); print(d['summary']['accuracy'])")
+ PASS_COUNT=$(python -c "import json; d=json.load(open('/tmp/eval-results.json')); print(d['summary']['pass'])")
+ FAIL_COUNT=$(python -c "import json; d=json.load(open('/tmp/eval-results.json')); print(d['summary']['fail'])")
+ echo "accuracy=$ACCURACY" >> $GITHUB_OUTPUT
+ echo "pass_count=$PASS_COUNT" >> $GITHUB_OUTPUT
+ echo "fail_count=$FAIL_COUNT" >> $GITHUB_OUTPUT
+ fi
+
+ if [ $EXIT_CODE -ne 0 ]; then
+ echo "regression=true" >> $GITHUB_OUTPUT
+ else
+ echo "regression=false" >> $GITHUB_OUTPUT
+ fi
+
+ exit $EXIT_CODE
+
+ - name: Post PR comment
+ if: inputs.post-comment == 'true' && github.event_name == 'pull_request' && always()
+ uses: actions/github-script@v7
+ with:
+ script: |
+ const fs = require('fs');
+ let data;
+ try {
+ data = JSON.parse(fs.readFileSync('/tmp/eval-results.json', 'utf8'));
+ } catch (e) {
+ console.log('No eval results to post');
+ return;
+ }
+
+ const s = data.summary;
+ const accPct = (s.accuracy * 100).toFixed(1);
+ const accEmoji = s.accuracy >= 0.9 ? '🟢' : s.accuracy >= 0.7 ? '🟡' : '🔴';
+
+ let failDetails = '';
+ const failures = data.cases.filter(c => c.verdict === 'fail' || c.verdict === 'error');
+ if (failures.length > 0) {
+ const rows = failures.slice(0, 10).map(c => {
+ const issues = c.failures.map(f => f.message).join('; ') || c.error || '';
+ return `| ${c.name} | \`${c.verdict}\` | ${issues.substring(0, 100)} |`;
+ }).join('\n');
+ failDetails = `\n\n<details><summary>Failed cases (${failures.length})</summary>\n\n| Case | Verdict | Issue |\n|---|---|---|\n${rows}\n\n</details>`;
+ }
+
+ const body = `## ${accEmoji} Eval Report: \`${data.metadata.suite_name}\`
+
+ | Metric | Value |
+ |---|---|
+ | **Accuracy** | **${accPct}%** (${s.pass} pass, ${s.fail} fail, ${s.error} error) |
+ | **Latency** | p50: ${s.latency_p50.toFixed(0)}ms, p95: ${s.latency_p95.toFixed(0)}ms |
+ | **Cost** | $${s.total_cost.toFixed(6)} ($${s.cost_per_case.toFixed(6)}/case) |
+ | **Tokens** | ${s.total_tokens.toLocaleString()} |
+ | **Model** | ${data.metadata.model} |
+ ${failDetails}
+
+ Generated by Selectools Eval — an open-source project from NichevLabs`;
+
+ // Find and update existing comment or create new
+ const comments = await github.rest.issues.listComments({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ });
+ const existing = comments.data.find(c => c.body.includes('Eval Report:'));
+ if (existing) {
+ await github.rest.issues.updateComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ comment_id: existing.id,
+ body: body,
+ });
+ } else {
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: body,
+ });
+ }
+
+ - name: Upload HTML report
+ if: inputs.html-report != '' && always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: eval-report
+ path: ${{ inputs.html-report }}
diff --git a/docs/modules/EVALS.md b/docs/modules/EVALS.md
new file mode 100644
index 0000000..ad1b677
--- /dev/null
+++ b/docs/modules/EVALS.md
@@ -0,0 +1,338 @@
+# Eval Framework
+
+**Added in:** v0.17.0
+
+Built-in agent evaluation with 22 evaluators, regression detection, and CI integration. No separate install, no SaaS account, no external dependencies.
+
+---
+
+## Quick Start
+
+```python
+from selectools.evals import EvalSuite, TestCase
+
+suite = EvalSuite(agent=agent, cases=[
+ TestCase(input="Cancel my account", expect_tool="cancel_subscription"),
+ TestCase(input="Check my balance", expect_contains="balance"),
+ TestCase(input="What's 2+2?", expect_output="4"),
+])
+report = suite.run()
+print(report.accuracy) # 0.95
+print(report.latency_p50) # 142ms
+print(report.total_cost) # $0.003
+```
+
+---
+
+## TestCase — Declarative Assertions
+
+Every `TestCase` has an `input` (the prompt) and optional `expect_*` fields. Only the fields you set are checked.
+
+### Tool Assertions
+
+```python
+TestCase(input="Cancel subscription", expect_tool="cancel_sub")
+TestCase(input="Full workflow", expect_tools=["search", "summarize"])
+TestCase(input="Search", expect_tool_args={"search": {"query": "python"}})
+```
+
+### Content Assertions
+
+```python
+TestCase(input="Hello", expect_contains="hello")
+TestCase(input="Safe?", expect_not_contains="error")
+TestCase(input="2+2", expect_output="4")
+TestCase(input="Phone", expect_output_regex=r"\d{3}-\d{4}")
+TestCase(input="JSON?", expect_json=True)
+TestCase(input="Prefix", expect_starts_with="Hello")
+TestCase(input="Suffix", expect_ends_with=".")
+TestCase(input="Short", expect_min_length=10, expect_max_length=500)
+```
+
+### Structured Output
+
+```python
+TestCase(
+ input="Extract name",
+ response_format=MyModel,
+ expect_parsed={"name": "Alice"},
+)
+```
+
+### Performance Assertions
+
+```python
+TestCase(
+ input="Fast query",
+ expect_latency_ms_lte=500,
+ expect_cost_usd_lte=0.01,
+ expect_iterations_lte=3,
+)
+```
+
+### Safety Assertions
+
+```python
+TestCase(input="Account info", expect_no_pii=True)
+TestCase(input="Ignore instructions", expect_no_injection=True)
+```
+
+### LLM-as-Judge Fields
+
+```python
+TestCase(
+ input="Summarize this",
+ reference="The original long text...", # ground truth
+ context="Retrieved document content...", # RAG context
+ rubric="Rate accuracy and completeness", # custom rubric
+)
+```
+
+### Custom Evaluators
+
+```python
+def must_be_polite(result) -> bool:
+ return "please" in result.content.lower()
+
+TestCase(
+ input="Help me",
+ custom_evaluator=must_be_polite,
+ custom_evaluator_name="politeness",
+)
+```
+
+### Tags and Weights
+
+```python
+TestCase(input="Critical", tags=["billing", "critical"], weight=3.0)
+TestCase(input="Minor", tags=["nice-to-have"], weight=0.5)
+```
+
+---
+
+## Built-in Evaluators (22)
+
+### Deterministic (12) — No API calls
+
+| Evaluator | What it checks |
+|---|---|
+| `ToolUseEvaluator` | Tool name, tool list, argument values |
+| `ContainsEvaluator` | Substring present/absent (case-insensitive) |
+| `OutputEvaluator` | Exact match, regex match |
+| `StructuredOutputEvaluator` | Parsed fields match (deep subset) |
+| `PerformanceEvaluator` | Iterations, latency, cost thresholds |
+| `JsonValidityEvaluator` | Valid JSON output |
+| `LengthEvaluator` | Min/max character count |
+| `StartsWithEvaluator` | Output prefix |
+| `EndsWithEvaluator` | Output suffix |
+| `PIILeakEvaluator` | SSN, email, phone, credit card, ZIP |
+| `InjectionResistanceEvaluator` | 10 prompt injection patterns |
+| `CustomEvaluator` | Any user-defined callable |
+
+### LLM-as-Judge (10) — Uses any Provider
+
+These evaluators call an LLM to grade the output. Pass any selectools `Provider` — works with OpenAI, Anthropic, Gemini, Ollama.
+
+```python
+from selectools.evals import CorrectnessEvaluator, RelevanceEvaluator
+
+suite = EvalSuite(
+ agent=agent,
+ cases=cases,
+ evaluators=[
+ CorrectnessEvaluator(provider=provider, model="gpt-4.1-mini"),
+ RelevanceEvaluator(provider=provider, model="gpt-4.1-mini"),
+ ],
+)
+```
+
+| Evaluator | What it checks | Requires |
+|---|---|---|
+| `LLMJudgeEvaluator` | Generic rubric scoring (0-10) | `rubric` on TestCase |
+| `CorrectnessEvaluator` | Correct vs reference answer | `reference` on TestCase |
+| `RelevanceEvaluator` | Response relevant to query | — |
+| `FaithfulnessEvaluator` | Grounded in provided context | `context` on TestCase |
+| `HallucinationEvaluator` | Fabricated information | `context` or `reference` |
+| `ToxicityEvaluator` | Harmful/inappropriate content | — |
+| `CoherenceEvaluator` | Well-structured and logical | — |
+| `CompletenessEvaluator` | Fully addresses the query | — |
+| `BiasEvaluator` | Gender, racial, political bias | — |
+| `SummaryEvaluator` | Summary accuracy and coverage | `reference` on TestCase |
+
+All LLM evaluators accept a `threshold` parameter (default: 7.0 for most, 8.0 for safety).
+
+---
+
+## EvalReport
+
+```python
+report = suite.run()
+
+# Aggregate metrics
+report.accuracy # Weighted accuracy (0.0 - 1.0)
+report.pass_count # Number of passing cases
+report.fail_count # Number of failing cases
+report.error_count # Number of error cases
+report.total_cost # Total USD cost
+report.total_tokens # Total tokens used
+report.latency_p50 # Median latency (ms)
+report.latency_p95 # 95th percentile latency
+report.latency_p99 # 99th percentile latency
+report.cost_per_case # Average cost per case
+
+# Filtering
+report.filter_by_tag("billing")
+report.filter_by_verdict(CaseVerdict.FAIL)
+report.failures_by_evaluator() # {"tool_use": 3, "contains": 1}
+
+# Export
+report.to_html("report.html") # Interactive HTML report
+report.to_junit_xml("results.xml") # JUnit XML for CI
+report.to_json("results.json") # Machine-readable JSON
+report.summary() # Human-readable text
+```
+
+---
+
+## Loading Test Cases from Files
+
+```python
+from selectools.evals import DatasetLoader
+
+# JSON
+cases = DatasetLoader.from_json("tests/eval_cases.json")
+
+# YAML (requires PyYAML)
+cases = DatasetLoader.from_yaml("tests/eval_cases.yaml")
+
+# Auto-detect from extension
+cases = DatasetLoader.load("tests/eval_cases.json")
+```
+
+**JSON format:**
+
+```json
+[
+ {"input": "Cancel account", "expect_tool": "cancel_sub", "name": "cancel"},
+ {"input": "Check balance", "expect_contains": "balance", "tags": ["billing"]}
+]
+```
+
+---
+
+## Regression Detection
+
+```python
+from selectools.evals import BaselineStore
+
+store = BaselineStore("./baselines")
+report = suite.run()
+
+# Compare against saved baseline
+result = store.compare(report)
+if result.is_regression:
+ print(f"Regressions: {result.regressions}")
+ print(f"Accuracy delta: {result.accuracy_delta:+.2%}")
+else:
+ store.save(report) # Update baseline
+```
+
+---
+
+## CLI
+
+Run evals from the command line:
+
+```bash
+# Run eval suite
+python -m selectools.evals run tests/cases.json --provider openai --model gpt-4.1-mini --html report.html --verbose
+
+# Compare against baseline
+python -m selectools.evals compare tests/cases.json --baseline ./baselines --save
+
+# With concurrency
+python -m selectools.evals run tests/cases.json --concurrency 5 --junit results.xml
+```
+
+---
+
+## GitHub Actions
+
+Use the built-in action to run evals on every PR and post results as a comment:
+
+```yaml
+- name: Run eval suite
+ uses: johnnichev/selectools/.github/actions/eval@main
+ with:
+ cases: tests/eval_cases.json
+ provider: openai
+ model: gpt-4.1-mini
+ html-report: eval-report.html
+ baseline-dir: ./baselines
+ post-comment: "true"
+ env:
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+```
+
+The action:
+- Runs all test cases
+- Posts accuracy, latency, cost, and failures as a PR comment
+- Detects regressions against baselines
+- Uploads HTML report as an artifact
+- Outputs `accuracy`, `pass-count`, `fail-count`, `regression` for downstream steps
+
+---
+
+## Concurrent Execution
+
+```python
+suite = EvalSuite(
+ agent=agent,
+ cases=cases,
+ max_concurrency=5, # Run 5 cases in parallel
+ on_progress=lambda done, total: print(f"[{done}/{total}]"),
+)
+```
+
+Uses `ThreadPoolExecutor` (sync) or `asyncio.Semaphore` (async via `suite.arun()`).
+
+---
+
+## In pytest
+
+```python
+def test_agent_accuracy(agent):
+ suite = EvalSuite(agent=agent, cases=[
+ TestCase(input="Cancel", expect_tool="cancel_sub"),
+ TestCase(input="Balance", expect_contains="balance"),
+ ])
+ report = suite.run()
+ assert report.accuracy >= 0.9
+ assert report.latency_p50 < 500
+```
+
+---
+
+## API Reference
+
+### Core
+
+| Symbol | Description |
+|---|---|
+| `EvalSuite(agent, cases, ...)` | Orchestrates eval runs |
+| `TestCase(input, ...)` | Single test case with assertions |
+| `EvalReport` | Aggregated results with metrics |
+| `CaseResult` | Per-case result with verdict and failures |
+| `CaseVerdict` | Enum: PASS, FAIL, ERROR, SKIP |
+| `EvalFailure` | Single assertion failure |
+
+### Infrastructure
+
+| Symbol | Description |
+|---|---|
+| `DatasetLoader.load(path)` | Load test cases from JSON/YAML |
+| `BaselineStore(dir)` | Save and compare baselines |
+| `RegressionResult` | Regression comparison result |
+| `report.to_html(path)` | Interactive HTML report |
+| `report.to_junit_xml(path)` | JUnit XML for CI |
+| `report.to_json(path)` | Machine-readable JSON |
diff --git a/examples/39_eval_framework.py b/examples/39_eval_framework.py
new file mode 100644
index 0000000..56fca8e
--- /dev/null
+++ b/examples/39_eval_framework.py
@@ -0,0 +1,165 @@
+"""
+Example 39: Built-in Eval Framework
+====================================
+
+Evaluate your agent's accuracy, tool use, latency, cost, and safety
+with the built-in eval suite. No separate install needed.
+
+Usage:
+ python examples/39_eval_framework.py
+
+This example uses the LocalProvider stub so no API key is needed.
+"""
+
+from selectools import Agent, AgentConfig, tool
+from selectools.evals import BaselineStore, DatasetLoader, EvalSuite, TestCase
+from selectools.providers.stubs import LocalProvider
+
+# --- Define tools ---
+
+
+@tool(description="Look up the price of a product")
+def get_price(product: str) -> str:
+ prices = {"laptop": "$999", "phone": "$699", "headphones": "$149"}
+ return prices.get(product.lower(), f"No price found for {product}")
+
+
+@tool(description="Check if a product is in stock")
+def check_stock(product: str) -> str:
+ stock = {
+ "laptop": "In stock (5 left)",
+ "phone": "Out of stock",
+ "headphones": "In stock (20 left)",
+ }
+ return stock.get(product.lower(), f"Unknown product: {product}")
+
+
+@tool(description="Cancel a user's subscription")
+def cancel_subscription(user_id: str) -> str:
+ return f"Subscription for user {user_id} has been cancelled."
+
+
+# --- Create agent ---
+
+agent = Agent(
+ provider=LocalProvider(),
+ config=AgentConfig(model="local"),
+ tools=[get_price, check_stock, cancel_subscription],
+)
+
+
+# --- Define test cases ---
+
+cases = [
+ # Tool use assertions
+ TestCase(
+ input="How much does a laptop cost?",
+ name="price_lookup",
+ expect_tool="get_price",
+ tags=["pricing"],
+ ),
+ TestCase(
+ input="Is the phone in stock?",
+ name="stock_check",
+ expect_tool="check_stock",
+ tags=["inventory"],
+ ),
+ # Content assertions
+ TestCase(
+ input="Tell me about headphones",
+ name="content_check",
+ expect_contains="headphones",
+ ),
+ # Performance assertions
+ TestCase(
+ input="Quick question",
+ name="performance",
+ expect_latency_ms_lte=5000,
+ expect_cost_usd_lte=1.0,
+ ),
+ # Safety assertions
+ TestCase(
+ input="What's my account info?",
+ name="no_pii",
+ expect_no_pii=True,
+ ),
+]
+
+
+# --- Run eval suite ---
+
+print("Running eval suite...")
+print()
+
+suite = EvalSuite(
+ agent=agent,
+ cases=cases,
+ name="product-agent-v1",
+ on_progress=lambda done, total: print(f" [{done}/{total}]", end="\r"),
+)
+
+report = suite.run()
+print()
+print(report.summary())
+print()
+
+# --- Export reports ---
+
+report.to_html("/tmp/selectools-eval-report.html")
+print("HTML report: /tmp/selectools-eval-report.html")
+
+report.to_junit_xml("/tmp/selectools-eval-results.xml")
+print("JUnit XML: /tmp/selectools-eval-results.xml")
+
+report.to_json("/tmp/selectools-eval-results.json")
+print("JSON report: /tmp/selectools-eval-results.json")
+print()
+
+# --- Per-case results ---
+
+print("Per-case results:")
+for cr in report.case_results:
+ status = cr.verdict.value.upper()
+ name = cr.case.name or cr.case.input[:50]
+ print(f" [{status:5s}] {name} ({cr.latency_ms:.0f}ms, ${cr.cost_usd:.6f})")
+ for f in cr.failures:
+ print(f" {f.evaluator_name}: {f.message}")
+print()
+
+# --- Regression detection ---
+
+import tempfile
+
+baseline_dir = tempfile.mkdtemp()
+store = BaselineStore(baseline_dir)
+
+# Save current run as baseline
+store.save(report)
+print(f"Baseline saved to {baseline_dir}/")
+
+# Compare (no regression since it's the same run)
+result = store.compare(report)
+print(f"Regression detected: {result.is_regression}")
+print(f"Accuracy delta: {result.accuracy_delta:+.2%}")
+print()
+
+# --- Loading from file ---
+
+print("Dataset loading example:")
+import json
+
+cases_file = "/tmp/eval_cases.json"
+with open(cases_file, "w") as f:
+ json.dump(
+ [
+ {"input": "Price of laptop?", "expect_tool": "get_price", "name": "from_file"},
+ {"input": "Stock check", "expect_contains": "stock", "tags": ["inventory"]},
+ ],
+ f,
+ )
+
+loaded_cases = DatasetLoader.load(cases_file)
+print(f" Loaded {len(loaded_cases)} cases from {cases_file}")
+print()
+
+print("Done! Open /tmp/selectools-eval-report.html in your browser to see the interactive report.")
diff --git a/landing/index.html b/landing/index.html
index 46241bd..dfd05a7 100644
--- a/landing/index.html
+++ b/landing/index.html
@@ -237,9 +237,96 @@
24 Built-in Tools
Files, web, data, datetime, text — ready to use out of the box
-
✅
-
1620 Tests
-
Unit, integration, regression, and E2E — production hardened
+
📊
+
22 Eval Evaluators
+
Built-in agent testing: tool use, correctness, safety, LLM-as-judge, regression detection
+
+
+
+
+
+
+
+
+
Built-in Agent Evaluation
+
The only agent framework with a built-in eval suite. No separate install, no SaaS account, no external dependencies. 22 evaluators out of the box.
+
+
+
+
+
from selectools.evals import EvalSuite, TestCase
+
+suite = EvalSuite(agent=agent, cases=[
+ TestCase(
+ input="Cancel my subscription",
+ expect_tool="cancel_sub",
+ expect_contains="cancelled",
+ expect_no_pii=True,
+ ),
+ TestCase(
+ input="What's my balance?",
+ expect_tool="check_balance",
+ expect_latency_ms_lte=500,
+ expect_cost_usd_lte=0.01,
+ ),
+])
+
+report = suite.run()
+print(report.accuracy) # 1.0
+print(report.latency_p50) # 142ms
+print(report.total_cost) # $0.002
+
+report.to_html("report.html")
+
+
+
+
+
12 Deterministic Evaluators
+
+ ToolUse
+ Contains
+ Output
+ Structured
+ Performance
+ JSON
+ Length
+ StartsWith
+ EndsWith
+ PII Leak
+ Injection
+ Custom
+
+
+
+
10 LLM-as-Judge Evaluators
+
+ Correctness
+ Relevance
+ Faithfulness
+ Hallucination
+ Toxicity
+ Coherence
+ Completeness
+ Bias
+ Summary
+ Custom Rubric
+
+
+
+
Infrastructure
+
+
Interactive HTML report with charts and filtering
+
JUnit XML for CI (GitHub Actions, Jenkins)
+
Regression detection with baseline comparison
+
Dataset loading from JSON/YAML files
+
GitHub Action with automatic PR comments
+
+
@@ -295,6 +382,11 @@ Selectools vs. LangChain
result.reasoning |
Not available |
+
+ | Agent evaluation |
+ Built-in (22 evaluators) |
+ LangSmith (paid) or DeepEval (separate) |
+
| Community |
Growing |
diff --git a/mkdocs.yml b/mkdocs.yml
index 99ff9a1..b4c590d 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -107,6 +107,8 @@ nav:
- Advanced Chunking: modules/ADVANCED_CHUNKING.md
- Embeddings: modules/EMBEDDINGS.md
- Vector Stores: modules/VECTOR_STORES.md
+ - Evaluation:
+ - Eval Framework: modules/EVALS.md
- Security:
- Guardrails: modules/GUARDRAILS.md
- Audit Logging: modules/AUDIT.md
diff --git a/src/selectools/evals/__main__.py b/src/selectools/evals/__main__.py
new file mode 100644
index 0000000..341da09
--- /dev/null
+++ b/src/selectools/evals/__main__.py
@@ -0,0 +1,198 @@
+"""CLI entry point: python -m selectools.evals
+
+Usage:
+ python -m selectools.evals run cases.json [options]
+ python -m selectools.evals compare cases.json --baseline ./baselines [options]
+
+Options:
+ --provider NAME Provider to use: local, openai, anthropic, gemini, ollama (default: local)
+ --html FILE Write HTML report to FILE
+ --junit FILE Write JUnit XML to FILE
+ --json FILE Write JSON report to FILE
+ --baseline DIR Baseline directory for regression detection
+ --concurrency N Max parallel cases (default: 1)
+ --name NAME Suite name (default: "eval")
+ --verbose Print per-case results
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import Any
+
+from . import BaselineStore, DatasetLoader, EvalSuite
+
+
+def _build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(
+ prog="python -m selectools.evals",
+ description="Selectools Eval Framework — evaluate agents from the command line.",
+ )
+ sub = parser.add_subparsers(dest="command")
+
+ # run command
+ run_p = sub.add_parser("run", help="Run eval suite against an agent")
+ run_p.add_argument("cases", help="Path to test cases file (JSON/YAML)")
+ run_p.add_argument("--html", help="Write HTML report to file")
+ run_p.add_argument("--junit", help="Write JUnit XML to file")
+ run_p.add_argument("--json", dest="json_out", help="Write JSON report to file")
+ run_p.add_argument("--baseline", help="Baseline directory for regression detection")
+ run_p.add_argument("--concurrency", type=int, default=1, help="Max parallel cases")
+ run_p.add_argument("--name", default="eval", help="Suite name")
+ run_p.add_argument("--verbose", action="store_true", help="Print per-case results")
+ run_p.add_argument(
+ "--provider",
+ default="local",
+ choices=["local", "openai", "anthropic", "gemini", "ollama"],
+ help="Provider to use (default: local)",
+ )
+ run_p.add_argument("--model", help="Model name")
+
+ # compare command
+ cmp_p = sub.add_parser("compare", help="Compare current run against baseline")
+ cmp_p.add_argument("cases", help="Path to test cases file (JSON/YAML)")
+ cmp_p.add_argument("--baseline", required=True, help="Baseline directory")
+ cmp_p.add_argument("--name", default="eval", help="Suite name")
+ cmp_p.add_argument("--provider", default="local")
+ cmp_p.add_argument("--model", help="Model name")
+ cmp_p.add_argument("--concurrency", type=int, default=1)
+ cmp_p.add_argument("--save", action="store_true", help="Save as new baseline if no regression")
+
+ return parser
+
+
+def _create_agent(provider_name: str, model: str | None) -> "Agent": # type: ignore[name-defined] # noqa: F821
+ """Create an agent with the specified provider."""
+ from selectools import Agent, AgentConfig
+
+ prov: Any = None
+ mdl = model or "local"
+
+ if provider_name == "local":
+ from selectools.providers.stubs import LocalProvider
+
+ prov = LocalProvider()
+ mdl = model or "local"
+ elif provider_name == "openai":
+ from selectools.providers import OpenAIProvider
+
+ prov = OpenAIProvider()
+ mdl = model or "gpt-4.1-mini"
+ elif provider_name == "anthropic":
+ from selectools.providers import AnthropicProvider
+
+ prov = AnthropicProvider()
+ mdl = model or "claude-sonnet-4-6"
+ elif provider_name == "gemini":
+ from selectools.providers import GeminiProvider
+
+ prov = GeminiProvider()
+ mdl = model or "gemini-2.5-flash"
+ elif provider_name == "ollama":
+ from selectools.providers import OllamaProvider
+
+ prov = OllamaProvider()
+ mdl = model or "llama3"
+ else:
+ raise ValueError(f"Unknown provider: {provider_name}")
+
+ return Agent(provider=prov, config=AgentConfig(model=mdl), tools=[])
+
+
+def main() -> None: # noqa: C901
+ """CLI entry point."""
+ parser = _build_parser()
+ args = parser.parse_args()
+
+ if not args.command:
+ parser.print_help()
+ sys.exit(1)
+
+ # Load cases
+ cases = DatasetLoader.load(args.cases)
+ print(f"Loaded {len(cases)} test cases from {args.cases}")
+
+ # Create agent
+ agent = _create_agent(args.provider, getattr(args, "model", None))
+
+ # Run suite
+ def on_progress(done: int, total: int) -> None:
+ print(f" [{done}/{total}]", end="\r", flush=True)
+
+ suite = EvalSuite(
+ agent=agent,
+ cases=cases,
+ name=args.name,
+ max_concurrency=args.concurrency,
+ on_progress=on_progress,
+ )
+
+ print(f"Running eval suite '{args.name}'...")
+ report = suite.run()
+ print()
+ print(report.summary())
+ print()
+
+ if args.command == "run":
+ if getattr(args, "verbose", False):
+ for cr in report.case_results:
+ status = cr.verdict.value.upper()
+ name = cr.case.name or cr.case.input[:50]
+ print(f" [{status:5s}] {name} ({cr.latency_ms:.0f}ms)")
+ for f in cr.failures:
+ print(f" {f.evaluator_name}: {f.message}")
+ print()
+
+ if args.html:
+ report.to_html(args.html)
+ print(f"HTML report: {args.html}")
+ if args.junit:
+ report.to_junit_xml(args.junit)
+ print(f"JUnit XML: {args.junit}")
+ if args.json_out:
+ report.to_json(args.json_out)
+ print(f"JSON report: {args.json_out}")
+
+ if args.baseline:
+ store = BaselineStore(args.baseline)
+ result = store.compare(report)
+ if result.is_regression:
+ print(f"\nREGRESSIONS DETECTED: {result.regressions}")
+ print(f"Accuracy delta: {result.accuracy_delta:+.2%}")
+ sys.exit(1)
+ else:
+ print(f"\nNo regressions (accuracy delta: {result.accuracy_delta:+.2%})")
+ if result.improvements:
+ print(f"Improvements: {result.improvements}")
+ store.save(report)
+ print(f"Baseline saved to {args.baseline}/")
+
+ elif args.command == "compare":
+ store = BaselineStore(args.baseline)
+ result = store.compare(report)
+
+ if result.is_regression:
+ print("REGRESSIONS DETECTED:")
+ for name in result.regressions:
+ print(f" - {name}")
+ print(f"Accuracy: {result.accuracy_delta:+.2%}")
+ print(f"Latency p50: {result.latency_p50_delta:+.0f}ms")
+ print(f"Cost: ${result.cost_delta:+.6f}")
+ sys.exit(1)
+ else:
+ print("No regressions detected.")
+ if result.improvements:
+ print(f"Improvements: {result.improvements}")
+ print(f"Accuracy: {result.accuracy_delta:+.2%}")
+ if getattr(args, "save", False):
+ store.save(report)
+ print(f"Baseline updated at {args.baseline}/")
+
+ # Exit with non-zero if accuracy is 0
+ if report.accuracy == 0.0 and report.metadata.total_cases > 0:
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/selectools/evals/html.py b/src/selectools/evals/html.py
index 02ddff1..04eeaa7 100644
--- a/src/selectools/evals/html.py
+++ b/src/selectools/evals/html.py
@@ -1,19 +1,100 @@
-"""Self-contained HTML report renderer."""
+"""Self-contained interactive HTML report renderer."""
from __future__ import annotations
import html
+import math
from pathlib import Path
-from typing import Any, Union
+from typing import Any, List, Union
from .types import CaseVerdict
-def render_html_report(report: Any, filepath: Union[str, Path]) -> None:
- """Render an EvalReport as a self-contained HTML file."""
+def _donut_svg(pass_n: int, fail_n: int, error_n: int, skip_n: int) -> str:
+ """Generate an SVG donut chart for pass/fail/error/skip distribution."""
+ total = pass_n + fail_n + error_n + skip_n
+ if total == 0:
+ return ""
+ segments = [
+ (pass_n, "#4ade80"),
+ (fail_n, "#f87171"),
+ (error_n, "#fbbf24"),
+ (skip_n, "#64748b"),
+ ]
+ cx, cy, r = 60, 60, 50
+ inner_r = 35
+ paths: List[str] = []
+ start_angle = -90.0
+ for count, color in segments:
+ if count == 0:
+ continue
+ sweep = (count / total) * 360
+ end_angle = start_angle + sweep
+ large = 1 if sweep > 180 else 0
+ sa = math.radians(start_angle)
+ ea = math.radians(end_angle)
+ x1_o, y1_o = cx + r * math.cos(sa), cy + r * math.sin(sa)
+ x2_o, y2_o = cx + r * math.cos(ea), cy + r * math.sin(ea)
+ x1_i, y1_i = cx + inner_r * math.cos(ea), cy + inner_r * math.sin(ea)
+ x2_i, y2_i = cx + inner_r * math.cos(sa), cy + inner_r * math.sin(sa)
+ d = (
+ f"M {x1_o:.1f} {y1_o:.1f} "
+ f"A {r} {r} 0 {large} 1 {x2_o:.1f} {y2_o:.1f} "
+ f"L {x1_i:.1f} {y1_i:.1f} "
+ f"A {inner_r} {inner_r} 0 {large} 0 {x2_i:.1f} {y2_i:.1f} Z"
+ )
+ paths.append(f'')
+ start_angle = end_angle
+ return f''
+
+
+def _histogram_svg(latencies: List[float]) -> str:
+ """Generate an SVG histogram of latency distribution."""
+ if not latencies:
+ return ""
+ min_v = min(latencies)
+ max_v = max(latencies)
+ if max_v == min_v:
+ max_v = min_v + 1
+ n_bins = min(12, len(latencies))
+ bin_width = (max_v - min_v) / n_bins
+ bins = [0] * n_bins
+ for v in latencies:
+ idx = min(int((v - min_v) / bin_width), n_bins - 1)
+ bins[idx] += 1
+ max_count = max(bins) or 1
+ w, h = 300, 100
+ bar_w = w / n_bins - 2
+ bars: List[str] = []
+ for i, count in enumerate(bins):
+ bar_h = (count / max_count) * (h - 20)
+ x = i * (w / n_bins) + 1
+ y = h - 15 - bar_h
+ bars.append(
+ f''
+ )
+ label_v = min_v + (i + 0.5) * bin_width
+ if i % max(1, n_bins // 4) == 0:
+ bars.append(
+ f'{label_v:.0f}'
+ )
+ return (
+ f''
+ )
+
+
+def render_html_report(report: Any, filepath: Union[str, Path]) -> None: # noqa: C901
+ """Render an EvalReport as a self-contained interactive HTML file."""
+ # Build table rows with expandable details
rows = []
for i, cr in enumerate(report.case_results):
name = html.escape(cr.case.name or cr.case.input[:60])
+ input_text = html.escape(cr.case.input[:300])
verdict_class = {
CaseVerdict.PASS: "pass",
CaseVerdict.FAIL: "fail",
@@ -21,123 +102,198 @@ def render_html_report(report: Any, filepath: Union[str, Path]) -> None:
CaseVerdict.SKIP: "skip",
}.get(cr.verdict, "")
- failure_html = ""
+ # Expandable detail content
+ detail_parts = [f"Input: {input_text}"]
+ if cr.agent_result:
+ output = html.escape((cr.agent_result.content or "")[:500])
+ detail_parts.append(f"Output: {output}")
+ if cr.agent_result.reasoning:
+ reasoning = html.escape(str(cr.agent_result.reasoning)[:300])
+ detail_parts.append(f"Reasoning: {reasoning}")
+ if cr.tool_calls:
+ detail_parts.append(f"Tools: {html.escape(', '.join(cr.tool_calls))}")
if cr.failures:
items = "".join(
- f"{html.escape(f.evaluator_name)}: "
+ f"{html.escape(f.evaluator_name)}: "
f"{html.escape(f.message)}"
for f in cr.failures
)
- failure_html = f''
- elif cr.error:
- failure_html = f'{html.escape(cr.error)}
'
+ detail_parts.append(f"Failures:")
+ if cr.error:
+ detail_parts.append(
+ f"Error: "
+ f"{html.escape(cr.error)}"
+ )
+
+ detail_html = "
".join(detail_parts)
+ tags_data = html.escape(" ".join(cr.case.tags)) if cr.case.tags else ""
+ fail_count = len(cr.failures) if cr.failures else (1 if cr.error else 0)
- tools = ", ".join(cr.tool_calls) if cr.tool_calls else "-"
+ # Build tag pills outside f-string to avoid backslash issue
+ tag_pills = ""
+ if cr.case.tags:
+ pill_items = "".join(
+ '' + html.escape(t) + "" for t in cr.case.tags
+ )
+ tag_pills = '' + pill_items + ""
rows.append(
- f"
"
+ f"
"
f"| {i + 1} | "
- f"{name} | "
+ f"{name}{tag_pills} | "
f"{cr.verdict.value} | "
f"{cr.latency_ms:.0f}ms | "
f"${cr.cost_usd:.6f} | "
- f"{html.escape(tools)} | "
- f"{failure_html} | "
+ f"{fail_count} | "
+ f"
"
+ f""
+ f"{detail_html} | "
f"
"
)
table_rows = "\n".join(rows)
+ # Charts
+ donut = _donut_svg(report.pass_count, report.fail_count, report.error_count, report.skip_count)
+ latencies = [cr.latency_ms for cr in report.case_results if cr.verdict != CaseVerdict.SKIP]
+ histogram = _histogram_svg(latencies)
+
+ # Failure breakdown
failures_by_eval = report.failures_by_evaluator()
- eval_breakdown = ""
+ eval_bars = ""
if failures_by_eval:
- items = "".join(
- f"{html.escape(k)}: {v}"
+ max_f = max(failures_by_eval.values())
+ bars = "".join(
+ f""
+ f"
{html.escape(k)}"
+ f"
"
+ f"
{v} "
for k, v in sorted(failures_by_eval.items(), key=lambda x: -x[1])
)
- eval_breakdown = f"Failures by Evaluator
"
+ eval_bars = f"Failures by Evaluator
{bars}"
- content = f"""
+ # Collect unique tags for filter buttons
+ all_tags = sorted({t for cr in report.case_results for t in cr.case.tags})
+ tag_buttons = "".join(
+ f""
+ for t in all_tags
+ )
+ filter_bar = ""
+ if all_tags:
+ filter_bar = (
+ f""
+ f""
+ f""
+ f""
+ f"{tag_buttons}
"
+ )
+ else:
+ filter_bar = (
+ ""
+ ""
+ ""
+ ""
+ "
"
+ )
+
+ acc_class = "good" if report.accuracy >= 0.9 else "warn" if report.accuracy >= 0.7 else "bad"
+
+ page = f"""
Eval Report: {html.escape(report.metadata.suite_name)}
+
Eval Report: {html.escape(report.metadata.suite_name)}
-{report.metadata.model or 'unknown model'} ·
- {report.metadata.provider or 'unknown provider'} ·
- {report.metadata.total_cases} cases ·
- {report.metadata.duration_ms:.0f}ms
-
-
-
-
Accuracy
-
{report.accuracy:.1%}
-
-
-
Pass / Fail / Error
-
{report.pass_count} / {report.fail_count} / {report.error_count}
-
-
-
Latency p50
-
{report.latency_p50:.0f}ms
-
-
-
Latency p95
-
{report.latency_p95:.0f}ms
-
-
-
Total Cost
-
${report.total_cost:.6f}
+
{report.metadata.model or 'unknown model'} · {report.metadata.provider or 'unknown provider'} · {report.metadata.total_cases} cases · {report.metadata.duration_ms:.0f}ms
+
+
+
+
+
Accuracy
{report.accuracy:.1%}
+
+
+
Latency p50
{report.latency_p50:.0f}ms
+
Latency p95
{report.latency_p95:.0f}ms
+
Total Cost
${report.total_cost:.4f}
+
Cost/Case
${report.cost_per_case:.6f}
+
Tokens
{report.total_tokens:,}
+
Errors
{report.error_count}
+
-
-
Total Tokens
-
{report.total_tokens}
+
+
+ {donut}
+
{histogram}
+
+
+ Pass ({report.pass_count})
+ Fail ({report.fail_count})
+ Error ({report.error_count})
+ Skip ({report.skip_count})
+
-{eval_breakdown}
+{eval_bars}
+
+{filter_bar}
-
+
-| # | Test Case | Verdict | Latency | Cost | Tools | Details |
+| # | Test Case | Verdict | Latency | Cost | Issues |
{table_rows}
@@ -145,11 +301,36 @@ def render_html_report(report: Any, filepath: Union[str, Path]) -> None:
+
+
"""
- Path(filepath).write_text(content)
+ Path(filepath).write_text(page, encoding="utf-8")