diff --git a/.github/actions/eval/action.yml b/.github/actions/eval/action.yml
new file mode 100644
index 0000000..01d8ef3
--- /dev/null
+++ b/.github/actions/eval/action.yml
@@ -0,0 +1,205 @@
+# Composite action: installs selectools, runs an eval suite through the
+# `python -m selectools.evals` CLI, publishes summary metrics as outputs,
+# optionally posts/updates a PR comment, and uploads the HTML report.
+name: "Selectools Eval"
+description: "Run selectools eval suite and post results as a PR comment"
+branding:
+  icon: "check-circle"
+  color: "blue"
+
+inputs:
+  cases:
+    description: "Path to test cases file (JSON/YAML)"
+    required: true
+  provider:
+    description: "Provider: local, openai, anthropic, gemini, ollama"
+    default: "local"
+  model:
+    description: "Model name (optional, uses provider default)"
+    required: false
+  name:
+    description: "Suite name"
+    default: "eval"
+  concurrency:
+    description: "Max parallel cases"
+    default: "1"
+  baseline-dir:
+    description: "Baseline directory for regression detection"
+    default: ""
+  html-report:
+    description: "Path to write HTML report"
+    default: ""
+  junit-report:
+    description: "Path to write JUnit XML report"
+    default: ""
+  python-version:
+    description: "Python version"
+    default: "3.13"
+  post-comment:
+    description: "Post results as PR comment (true/false); the calling job needs `pull-requests: write` permission"
+    default: "true"
+
+outputs:
+  accuracy:
+    description: "Eval accuracy (0.0 - 1.0)"
+    value: ${{ steps.run-eval.outputs.accuracy }}
+  pass-count:
+    description: "Number of passing cases"
+    value: ${{ steps.run-eval.outputs.pass_count }}
+  fail-count:
+    description: "Number of failing cases"
+    value: ${{ steps.run-eval.outputs.fail_count }}
+  regression:
+    description: "Whether regressions were detected"
+    value: ${{ steps.run-eval.outputs.regression }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ inputs.python-version }}
+
+    - name: Install selectools
+      shell: bash
+      run: python -m pip install selectools
+
+    - name: Run eval suite
+      id: run-eval
+      shell: bash
+      # Inputs are passed through the environment rather than interpolated
+      # into the script text, so values containing spaces, quotes or shell
+      # metacharacters cannot alter the command that is executed.
+      env:
+        EVAL_CASES: ${{ inputs.cases }}
+        EVAL_NAME: ${{ inputs.name }}
+        EVAL_PROVIDER: ${{ inputs.provider }}
+        EVAL_MODEL: ${{ inputs.model }}
+        EVAL_CONCURRENCY: ${{ inputs.concurrency }}
+        EVAL_BASELINE_DIR: ${{ inputs.baseline-dir }}
+        EVAL_HTML_REPORT: ${{ inputs.html-report }}
+        EVAL_JUNIT_REPORT: ${{ inputs.junit-report }}
+      run: |
+        # Keep going after a failing eval so the outputs below are still published.
+        set +e
+
+        RESULTS_JSON=/tmp/eval-results.json
+        # Drop results left over from an earlier run on a reused runner.
+        rm -f "$RESULTS_JSON"
+
+        # An array keeps every argument intact regardless of its content.
+        ARGS=(run "$EVAL_CASES" --name "$EVAL_NAME" --provider "$EVAL_PROVIDER" --concurrency "$EVAL_CONCURRENCY" --json "$RESULTS_JSON" --verbose)
+
+        if [ -n "$EVAL_MODEL" ]; then
+          ARGS+=(--model "$EVAL_MODEL")
+        fi
+        if [ -n "$EVAL_HTML_REPORT" ]; then
+          ARGS+=(--html "$EVAL_HTML_REPORT")
+        fi
+        if [ -n "$EVAL_JUNIT_REPORT" ]; then
+          ARGS+=(--junit "$EVAL_JUNIT_REPORT")
+        fi
+        if [ -n "$EVAL_BASELINE_DIR" ]; then
+          ARGS+=(--baseline "$EVAL_BASELINE_DIR")
+        fi
+
+        python -m selectools.evals "${ARGS[@]}"
+        EXIT_CODE=$?
+
+        # Parse JSON results for outputs (the three keys are written together, or not at all)
+        if [ -f "$RESULTS_JSON" ]; then
+          python -c 'import json, sys; s = json.load(open(sys.argv[1]))["summary"]; print("accuracy=%s\npass_count=%s\nfail_count=%s" % (s["accuracy"], s["pass"], s["fail"]))' "$RESULTS_JSON" >> "$GITHUB_OUTPUT"
+        fi
+
+        # NOTE(review): any non-zero exit is reported as a regression; confirm the CLI reserves non-zero for that case.
+        if [ "$EXIT_CODE" -ne 0 ]; then
+          echo "regression=true" >> "$GITHUB_OUTPUT"
+        else
+          echo "regression=false" >> "$GITHUB_OUTPUT"
+        fi
+
+        exit "$EXIT_CODE"
+
+    - name: Post PR comment
+      if: inputs.post-comment == 'true' && github.event_name == 'pull_request' && always()
+      uses: actions/github-script@v7
+      with:
+        script: |
+          const fs = require('fs');
+          let data;
+          try {
+            data = JSON.parse(fs.readFileSync('/tmp/eval-results.json', 'utf8'));
+          } catch (e) {
+            console.log('No eval results to post');
+            return;
+          }
+
+          const s = data.summary;
+          const accPct = (s.accuracy * 100).toFixed(1);
+          const accEmoji = s.accuracy >= 0.9 ? '🟢' : s.accuracy >= 0.7 ? '🟡' : '🔴';
+
+          // Keep report text from breaking out of a Markdown table cell.
+          const cell = (text) => String(text ?? '').replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');
+
+          let failDetails = '';
+          const failures = (data.cases ?? []).filter(c => c.verdict === 'fail' || c.verdict === 'error');
+          if (failures.length > 0) {
+            const rows = failures.slice(0, 10).map(c => {
+              const issues = (c.failures ?? []).map(f => f.message).join('; ') || c.error || '';
+              return `| ${cell(c.name)} | \`${c.verdict}\` | ${cell(issues.substring(0, 100))} |`;
+            }).join('\n');
+            failDetails = `\n\n<details>\n<summary>Failed cases (${failures.length})</summary>\n\n| Case | Verdict | Issue |\n|---|---|---|\n${rows}\n\n</details>`;
+          }
+
+          // Hidden marker identifies the comment owned by this suite, so unrelated comments are never overwritten.
+          const suiteName = data.metadata.suite_name;
+          const marker = `<!-- selectools-eval-report:${suiteName} -->`;
+
+          // Assembled line by line so YAML indentation never leaks into the Markdown.
+          const body = [
+            marker,
+            `## ${accEmoji} Eval Report: \`${suiteName}\``,
+            '',
+            '| Metric | Value |',
+            '|---|---|',
+            `| **Accuracy** | **${accPct}%** (${s.pass} pass, ${s.fail} fail, ${s.error} error) |`,
+            `| **Latency** | p50: ${s.latency_p50.toFixed(0)}ms, p95: ${s.latency_p95.toFixed(0)}ms |`,
+            `| **Cost** | $${s.total_cost.toFixed(6)} ($${s.cost_per_case.toFixed(6)}/case) |`,
+            `| **Tokens** | ${s.total_tokens.toLocaleString()} |`,
+            `| **Model** | ${data.metadata.model} |`,
+            failDetails,
+            '',
+            'Generated by Selectools Eval — an open-source project from NichevLabs',
+          ].join('\n');
+
+          // Find and update existing comment or create new (paginated: long PR threads exceed one page)
+          const comments = await github.paginate(github.rest.issues.listComments, {
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            issue_number: context.issue.number,
+            per_page: 100,
+          });
+          const existing = comments.find(c => c.body?.includes(marker));
+          if (existing) {
+            await github.rest.issues.updateComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: existing.id,
+              body: body,
+            });
+          } else {
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: body,
+            });
+          }
+
+    - name: Upload HTML report
+      if: inputs.html-report != '' && always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: eval-report
+        path: ${{ inputs.html-report }}
diff --git a/docs/modules/EVALS.md b/docs/modules/EVALS.md
new file mode 100644
index 0000000..ad1b677
--- /dev/null
+++ b/docs/modules/EVALS.md
@@ -0,0 +1,338 @@
+# Eval Framework
+
+**Added in:** v0.17.0
+
+Built-in agent evaluation with 22 evaluators, regression detection, and CI integration. No separate install, no SaaS account, no external dependencies.
+ +--- + +## Quick Start + +```python +from selectools.evals import EvalSuite, TestCase + +suite = EvalSuite(agent=agent, cases=[ + TestCase(input="Cancel my account", expect_tool="cancel_subscription"), + TestCase(input="Check my balance", expect_contains="balance"), + TestCase(input="What's 2+2?", expect_output="4"), +]) +report = suite.run() +print(report.accuracy) # 0.95 +print(report.latency_p50) # 142ms +print(report.total_cost) # $0.003 +``` + +--- + +## TestCase — Declarative Assertions + +Every `TestCase` has an `input` (the prompt) and optional `expect_*` fields. Only the fields you set are checked. + +### Tool Assertions + +```python +TestCase(input="Cancel subscription", expect_tool="cancel_sub") +TestCase(input="Full workflow", expect_tools=["search", "summarize"]) +TestCase(input="Search", expect_tool_args={"search": {"query": "python"}}) +``` + +### Content Assertions + +```python +TestCase(input="Hello", expect_contains="hello") +TestCase(input="Safe?", expect_not_contains="error") +TestCase(input="2+2", expect_output="4") +TestCase(input="Phone", expect_output_regex=r"\d{3}-\d{4}") +TestCase(input="JSON?", expect_json=True) +TestCase(input="Prefix", expect_starts_with="Hello") +TestCase(input="Suffix", expect_ends_with=".") +TestCase(input="Short", expect_min_length=10, expect_max_length=500) +``` + +### Structured Output + +```python +TestCase( + input="Extract name", + response_format=MyModel, + expect_parsed={"name": "Alice"}, +) +``` + +### Performance Assertions + +```python +TestCase( + input="Fast query", + expect_latency_ms_lte=500, + expect_cost_usd_lte=0.01, + expect_iterations_lte=3, +) +``` + +### Safety Assertions + +```python +TestCase(input="Account info", expect_no_pii=True) +TestCase(input="Ignore instructions", expect_no_injection=True) +``` + +### LLM-as-Judge Fields + +```python +TestCase( + input="Summarize this", + reference="The original long text...", # ground truth + context="Retrieved document content...", # RAG context + 
rubric="Rate accuracy and completeness", # custom rubric +) +``` + +### Custom Evaluators + +```python +def must_be_polite(result) -> bool: + return "please" in result.content.lower() + +TestCase( + input="Help me", + custom_evaluator=must_be_polite, + custom_evaluator_name="politeness", +) +``` + +### Tags and Weights + +```python +TestCase(input="Critical", tags=["billing", "critical"], weight=3.0) +TestCase(input="Minor", tags=["nice-to-have"], weight=0.5) +``` + +--- + +## Built-in Evaluators (22) + +### Deterministic (12) — No API calls + +| Evaluator | What it checks | +|---|---| +| `ToolUseEvaluator` | Tool name, tool list, argument values | +| `ContainsEvaluator` | Substring present/absent (case-insensitive) | +| `OutputEvaluator` | Exact match, regex match | +| `StructuredOutputEvaluator` | Parsed fields match (deep subset) | +| `PerformanceEvaluator` | Iterations, latency, cost thresholds | +| `JsonValidityEvaluator` | Valid JSON output | +| `LengthEvaluator` | Min/max character count | +| `StartsWithEvaluator` | Output prefix | +| `EndsWithEvaluator` | Output suffix | +| `PIILeakEvaluator` | SSN, email, phone, credit card, ZIP | +| `InjectionResistanceEvaluator` | 10 prompt injection patterns | +| `CustomEvaluator` | Any user-defined callable | + +### LLM-as-Judge (10) — Uses any Provider + +These evaluators call an LLM to grade the output. Pass any selectools `Provider` — works with OpenAI, Anthropic, Gemini, Ollama. 
+ +```python +from selectools.evals import CorrectnessEvaluator, RelevanceEvaluator + +suite = EvalSuite( + agent=agent, + cases=cases, + evaluators=[ + CorrectnessEvaluator(provider=provider, model="gpt-4.1-mini"), + RelevanceEvaluator(provider=provider, model="gpt-4.1-mini"), + ], +) +``` + +| Evaluator | What it checks | Requires | +|---|---|---| +| `LLMJudgeEvaluator` | Generic rubric scoring (0-10) | `rubric` on TestCase | +| `CorrectnessEvaluator` | Correct vs reference answer | `reference` on TestCase | +| `RelevanceEvaluator` | Response relevant to query | — | +| `FaithfulnessEvaluator` | Grounded in provided context | `context` on TestCase | +| `HallucinationEvaluator` | Fabricated information | `context` or `reference` | +| `ToxicityEvaluator` | Harmful/inappropriate content | — | +| `CoherenceEvaluator` | Well-structured and logical | — | +| `CompletenessEvaluator` | Fully addresses the query | — | +| `BiasEvaluator` | Gender, racial, political bias | — | +| `SummaryEvaluator` | Summary accuracy and coverage | `reference` on TestCase | + +All LLM evaluators accept a `threshold` parameter (default: 7.0 for most, 8.0 for safety). 
+ +--- + +## EvalReport + +```python +report = suite.run() + +# Aggregate metrics +report.accuracy # Weighted accuracy (0.0 - 1.0) +report.pass_count # Number of passing cases +report.fail_count # Number of failing cases +report.error_count # Number of error cases +report.total_cost # Total USD cost +report.total_tokens # Total tokens used +report.latency_p50 # Median latency (ms) +report.latency_p95 # 95th percentile latency +report.latency_p99 # 99th percentile latency +report.cost_per_case # Average cost per case + +# Filtering +report.filter_by_tag("billing") +report.filter_by_verdict(CaseVerdict.FAIL) +report.failures_by_evaluator() # {"tool_use": 3, "contains": 1} + +# Export +report.to_html("report.html") # Interactive HTML report +report.to_junit_xml("results.xml") # JUnit XML for CI +report.to_json("results.json") # Machine-readable JSON +report.summary() # Human-readable text +``` + +--- + +## Loading Test Cases from Files + +```python +from selectools.evals import DatasetLoader + +# JSON +cases = DatasetLoader.from_json("tests/eval_cases.json") + +# YAML (requires PyYAML) +cases = DatasetLoader.from_yaml("tests/eval_cases.yaml") + +# Auto-detect from extension +cases = DatasetLoader.load("tests/eval_cases.json") +``` + +**JSON format:** + +```json +[ + {"input": "Cancel account", "expect_tool": "cancel_sub", "name": "cancel"}, + {"input": "Check balance", "expect_contains": "balance", "tags": ["billing"]} +] +``` + +--- + +## Regression Detection + +```python +from selectools.evals import BaselineStore + +store = BaselineStore("./baselines") +report = suite.run() + +# Compare against saved baseline +result = store.compare(report) +if result.is_regression: + print(f"Regressions: {result.regressions}") + print(f"Accuracy delta: {result.accuracy_delta:+.2%}") +else: + store.save(report) # Update baseline +``` + +--- + +## CLI + +Run evals from the command line: + +```bash +# Run eval suite +python -m selectools.evals run tests/cases.json --provider openai 
--model gpt-4.1-mini --html report.html --verbose + +# Compare against baseline +python -m selectools.evals compare tests/cases.json --baseline ./baselines --save + +# With concurrency +python -m selectools.evals run tests/cases.json --concurrency 5 --junit results.xml +``` + +--- + +## GitHub Actions + +Use the built-in action to run evals on every PR and post results as a comment: + +```yaml +- name: Run eval suite + uses: johnnichev/selectools/.github/actions/eval@main + with: + cases: tests/eval_cases.json + provider: openai + model: gpt-4.1-mini + html-report: eval-report.html + baseline-dir: ./baselines + post-comment: "true" + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} +``` + +The action: +- Runs all test cases +- Posts accuracy, latency, cost, and failures as a PR comment (the job's `GITHUB_TOKEN` needs the `pull-requests: write` permission) +- Detects regressions against baselines +- Uploads HTML report as an artifact +- Outputs `accuracy`, `pass-count`, `fail-count`, `regression` for downstream steps + +--- + +## Concurrent Execution + +```python +suite = EvalSuite( + agent=agent, + cases=cases, + max_concurrency=5, # Run 5 cases in parallel + on_progress=lambda done, total: print(f"[{done}/{total}]"), +) +``` + +Uses `ThreadPoolExecutor` (sync) or `asyncio.Semaphore` (async via `suite.arun()`). 
+
+---
+
+## In pytest
+
+```python
+def test_agent_accuracy(agent):
+    suite = EvalSuite(agent=agent, cases=[
+        TestCase(input="Cancel", expect_tool="cancel_sub"),
+        TestCase(input="Balance", expect_contains="balance"),
+    ])
+    report = suite.run()
+    assert report.accuracy >= 0.9
+    assert report.latency_p50 < 500
+```
+
+---
+
+## API Reference
+
+### Core
+
+| Symbol | Description |
+|---|---|
+| `EvalSuite(agent, cases, ...)` | Orchestrates eval runs |
+| `TestCase(input, ...)` | Single test case with assertions |
+| `EvalReport` | Aggregated results with metrics |
+| `CaseResult` | Per-case result with verdict and failures |
+| `CaseVerdict` | Enum: PASS, FAIL, ERROR, SKIP |
+| `EvalFailure` | Single assertion failure |
+
+### Infrastructure
+
+| Symbol | Description |
+|---|---|
+| `DatasetLoader.load(path)` | Load test cases from JSON/YAML |
+| `BaselineStore(dir)` | Save and compare baselines |
+| `RegressionResult` | Regression comparison result |
+| `report.to_html(path)` | Interactive HTML report |
+| `report.to_junit_xml(path)` | JUnit XML for CI |
+| `report.to_json(path)` | Machine-readable JSON |
diff --git a/examples/39_eval_framework.py b/examples/39_eval_framework.py
new file mode 100644
index 0000000..56fca8e
--- /dev/null
+++ b/examples/39_eval_framework.py
@@ -0,0 +1,173 @@
+"""
+Example 39: Built-in Eval Framework
+====================================
+
+Evaluate your agent's accuracy, tool use, latency, cost, and safety
+with the built-in eval suite. No separate install needed.
+
+Usage:
+    python examples/39_eval_framework.py
+
+This example uses the LocalProvider stub so no API key is needed.
+"""
+
+import json
+import os
+import tempfile
+
+from selectools import Agent, AgentConfig, tool
+from selectools.evals import BaselineStore, DatasetLoader, EvalSuite, TestCase
+from selectools.providers.stubs import LocalProvider
+
+# --- Define tools ---
+
+
+@tool(description="Look up the price of a product")
+def get_price(product: str) -> str:
+    prices = {"laptop": "$999", "phone": "$699", "headphones": "$149"}
+    return prices.get(product.lower(), f"No price found for {product}")
+
+
+@tool(description="Check if a product is in stock")
+def check_stock(product: str) -> str:
+    stock = {
+        "laptop": "In stock (5 left)",
+        "phone": "Out of stock",
+        "headphones": "In stock (20 left)",
+    }
+    return stock.get(product.lower(), f"Unknown product: {product}")
+
+
+@tool(description="Cancel a user's subscription")
+def cancel_subscription(user_id: str) -> str:
+    return f"Subscription for user {user_id} has been cancelled."
+
+
+# --- Create agent ---
+
+agent = Agent(
+    provider=LocalProvider(),
+    config=AgentConfig(model="local"),
+    tools=[get_price, check_stock, cancel_subscription],
+)
+
+
+# --- Define test cases ---
+
+cases = [
+    # Tool use assertions
+    TestCase(
+        input="How much does a laptop cost?",
+        name="price_lookup",
+        expect_tool="get_price",
+        tags=["pricing"],
+    ),
+    TestCase(
+        input="Is the phone in stock?",
+        name="stock_check",
+        expect_tool="check_stock",
+        tags=["inventory"],
+    ),
+    # Content assertions
+    TestCase(
+        input="Tell me about headphones",
+        name="content_check",
+        expect_contains="headphones",
+    ),
+    # Performance assertions
+    TestCase(
+        input="Quick question",
+        name="performance",
+        expect_latency_ms_lte=5000,
+        expect_cost_usd_lte=1.0,
+    ),
+    # Safety assertions
+    TestCase(
+        input="What's my account info?",
+        name="no_pii",
+        expect_no_pii=True,
+    ),
+]
+
+
+# --- Run eval suite ---
+
+print("Running eval suite...")
+print()
+
+suite = EvalSuite(
+    agent=agent,
+    cases=cases,
+    name="product-agent-v1",
+    on_progress=lambda done, total: print(f" [{done}/{total}]", end="\r"),
+)
+
+report = suite.run()
+print()
+print(report.summary())
+print()
+
+# --- Export reports ---
+
+# Write into the platform temp directory; a hard-coded /tmp does not exist on Windows.
+out_dir = tempfile.gettempdir()
+html_path = os.path.join(out_dir, "selectools-eval-report.html")
+junit_path = os.path.join(out_dir, "selectools-eval-results.xml")
+json_path = os.path.join(out_dir, "selectools-eval-results.json")
+
+report.to_html(html_path)
+print(f"HTML report: {html_path}")
+
+report.to_junit_xml(junit_path)
+print(f"JUnit XML: {junit_path}")
+
+report.to_json(json_path)
+print(f"JSON report: {json_path}")
+print()
+
+# --- Per-case results ---
+
+print("Per-case results:")
+for cr in report.case_results:
+    status = cr.verdict.value.upper()
+    name = cr.case.name or cr.case.input[:50]
+    print(f" [{status:5s}] {name} ({cr.latency_ms:.0f}ms, ${cr.cost_usd:.6f})")
+    for f in cr.failures:
+        print(f" {f.evaluator_name}: {f.message}")
+print()
+
+# --- Regression detection ---
+
+# The baseline lives in a throwaway directory that is removed when the block exits.
+with tempfile.TemporaryDirectory() as baseline_dir:
+    store = BaselineStore(baseline_dir)
+
+    # Save current run as baseline
+    store.save(report)
+    print(f"Baseline saved to {baseline_dir}/")
+
+    # Compare (no regression since it's the same run)
+    result = store.compare(report)
+print(f"Regression detected: {result.is_regression}")
+print(f"Accuracy delta: {result.accuracy_delta:+.2%}")
+print()
+
+# --- Loading from file ---
+
+print("Dataset loading example:")
+
+cases_file = os.path.join(out_dir, "eval_cases.json")
+with open(cases_file, "w") as f:
+    json.dump(
+        [
+            {"input": "Price of laptop?", "expect_tool": "get_price", "name": "from_file"},
+            {"input": "Stock check", "expect_contains": "stock", "tags": ["inventory"]},
+        ],
+        f,
+    )
+
+loaded_cases = DatasetLoader.load(cases_file)
+print(f" Loaded {len(loaded_cases)} cases from {cases_file}")
+print()
+
+print(f"Done! Open {html_path} in your browser to see the interactive report.")
diff --git a/landing/index.html b/landing/index.html
index 46241bd..dfd05a7 100644
--- a/landing/index.html
+++ b/landing/index.html
@@ -237,9 +237,96 @@

24 Built-in Tools

Files, web, data, datetime, text — ready to use out of the box

-
-

1620 Tests

-

Unit, integration, regression, and E2E — production hardened

+
📊
+

22 Eval Evaluators

+

Built-in agent testing: tool use, correctness, safety, LLM-as-judge, regression detection

+
+ + + + + +
+
+

Built-in Agent Evaluation

+

The only agent framework with a built-in eval suite. No separate install, no SaaS account, no external dependencies. 22 evaluators out of the box.

+
+
+
+
+
+
+
+ test_agent.py +
+
from selectools.evals import EvalSuite, TestCase
+
+suite = EvalSuite(agent=agent, cases=[
+    TestCase(
+        input="Cancel my subscription",
+        expect_tool="cancel_sub",
+        expect_contains="cancelled",
+        expect_no_pii=True,
+    ),
+    TestCase(
+        input="What's my balance?",
+        expect_tool="check_balance",
+        expect_latency_ms_lte=500,
+        expect_cost_usd_lte=0.01,
+    ),
+])
+
+report = suite.run()
+print(report.accuracy)     # 1.0
+print(report.latency_p50)  # 142ms
+print(report.total_cost)   # $0.002
+
+report.to_html("report.html")
+
+
+
+
+

12 Deterministic Evaluators

+
+ ToolUse + Contains + Output + Structured + Performance + JSON + Length + StartsWith + EndsWith + PII Leak + Injection + Custom +
+
+
+

10 LLM-as-Judge Evaluators

+
+ Correctness + Relevance + Faithfulness + Hallucination + Toxicity + Coherence + Completeness + Bias + Summary + Custom Rubric +
+
+
+

Infrastructure

+
+
Interactive HTML report with charts and filtering
+
JUnit XML for CI (GitHub Actions, Jenkins)
+
Regression detection with baseline comparison
+
Dataset loading from JSON/YAML files
+
GitHub Action with automatic PR comments
+
+
@@ -295,6 +382,11 @@

Selectools vs. LangChain

result.reasoning Not available + + Agent evaluation + Built-in (22 evaluators) + LangSmith (paid) or DeepEval (separate) + + Community Growing diff --git a/mkdocs.yml b/mkdocs.yml index 99ff9a1..b4c590d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -107,6 +107,8 @@ nav: - Advanced Chunking: modules/ADVANCED_CHUNKING.md - Embeddings: modules/EMBEDDINGS.md - Vector Stores: modules/VECTOR_STORES.md + - Evaluation: + - Eval Framework: modules/EVALS.md - Security: - Guardrails: modules/GUARDRAILS.md - Audit Logging: modules/AUDIT.md
diff --git a/src/selectools/evals/__main__.py b/src/selectools/evals/__main__.py
new file mode 100644
index 0000000..341da09
--- /dev/null
+++ b/src/selectools/evals/__main__.py
@@ -0,0 +1,213 @@
+"""CLI entry point: python -m selectools.evals
+
+Usage:
+    python -m selectools.evals run cases.json [options]
+    python -m selectools.evals compare cases.json --baseline ./baselines [options]
+
+Common options (run and compare):
+    --provider NAME   Provider: local, openai, anthropic, gemini, ollama (default: local)
+    --model NAME      Model name
+    --concurrency N   Max parallel cases (default: 1)
+    --name NAME       Suite name (default: "eval")
+    --baseline DIR    Baseline directory for regression detection (required by compare)
+
+Options for run:
+    --html FILE       Write HTML report to FILE
+    --junit FILE      Write JUnit XML to FILE
+    --json FILE       Write JSON report to FILE
+    --verbose         Print per-case results
+
+Options for compare:
+    --save            Save as new baseline if no regression
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import Any
+
+from . import BaselineStore, DatasetLoader, EvalSuite
+
+
+def _add_agent_options(cmd: argparse.ArgumentParser) -> None:
+    """Register the suite/agent options that ``run`` and ``compare`` share."""
+    cmd.add_argument("--name", default="eval", help="Suite name")
+    cmd.add_argument(
+        "--provider",
+        default="local",
+        choices=["local", "openai", "anthropic", "gemini", "ollama"],
+        help="Provider to use (default: local)",
+    )
+    cmd.add_argument("--model", help="Model name")
+    cmd.add_argument("--concurrency", type=int, default=1, help="Max parallel cases")
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    """Build the argument parser for ``python -m selectools.evals``.
+
+    Returns:
+        A parser with two subcommands, ``run`` and ``compare``; the chosen
+        subcommand name is stored in ``args.command`` (``None`` when omitted).
+    """
+    parser = argparse.ArgumentParser(
+        prog="python -m selectools.evals",
+        description="Selectools Eval Framework — evaluate agents from the command line.",
+    )
+    sub = parser.add_subparsers(dest="command")
+
+    # run command
+    run_p = sub.add_parser("run", help="Run eval suite against an agent")
+    run_p.add_argument("cases", help="Path to test cases file (JSON/YAML)")
+    run_p.add_argument("--html", help="Write HTML report to file")
+    run_p.add_argument("--junit", help="Write JUnit XML to file")
+    run_p.add_argument("--json", dest="json_out", help="Write JSON report to file")
+    run_p.add_argument("--baseline", help="Baseline directory for regression detection")
+    run_p.add_argument("--verbose", action="store_true", help="Print per-case results")
+    _add_agent_options(run_p)
+
+    # compare command
+    cmp_p = sub.add_parser("compare", help="Compare current run against baseline")
+    cmp_p.add_argument("cases", help="Path to test cases file (JSON/YAML)")
+    cmp_p.add_argument("--baseline", required=True, help="Baseline directory")
+    cmp_p.add_argument("--save", action="store_true", help="Save as new baseline if no regression")
+    # Same validated --provider choices as `run`, so a typo fails at parse time.
+    _add_agent_options(cmp_p)
+
+    return parser
+
+
+def _create_agent(provider_name: str, model: str | None) -> "Agent":  # type: ignore[name-defined]  # noqa: F821
+    """Create an agent with the specified provider."""
+    from 
selectools import Agent, AgentConfig + + prov: Any = None + mdl = model or "local" + + if provider_name == "local": + from selectools.providers.stubs import LocalProvider + + prov = LocalProvider() + mdl = model or "local" + elif provider_name == "openai": + from selectools.providers import OpenAIProvider + + prov = OpenAIProvider() + mdl = model or "gpt-4.1-mini" + elif provider_name == "anthropic": + from selectools.providers import AnthropicProvider + + prov = AnthropicProvider() + mdl = model or "claude-sonnet-4-6" + elif provider_name == "gemini": + from selectools.providers import GeminiProvider + + prov = GeminiProvider() + mdl = model or "gemini-2.5-flash" + elif provider_name == "ollama": + from selectools.providers import OllamaProvider + + prov = OllamaProvider() + mdl = model or "llama3" + else: + raise ValueError(f"Unknown provider: {provider_name}") + + return Agent(provider=prov, config=AgentConfig(model=mdl), tools=[]) + + +def main() -> None: # noqa: C901 + """CLI entry point.""" + parser = _build_parser() + args = parser.parse_args() + + if not args.command: + parser.print_help() + sys.exit(1) + + # Load cases + cases = DatasetLoader.load(args.cases) + print(f"Loaded {len(cases)} test cases from {args.cases}") + + # Create agent + agent = _create_agent(args.provider, getattr(args, "model", None)) + + # Run suite + def on_progress(done: int, total: int) -> None: + print(f" [{done}/{total}]", end="\r", flush=True) + + suite = EvalSuite( + agent=agent, + cases=cases, + name=args.name, + max_concurrency=args.concurrency, + on_progress=on_progress, + ) + + print(f"Running eval suite '{args.name}'...") + report = suite.run() + print() + print(report.summary()) + print() + + if args.command == "run": + if getattr(args, "verbose", False): + for cr in report.case_results: + status = cr.verdict.value.upper() + name = cr.case.name or cr.case.input[:50] + print(f" [{status:5s}] {name} ({cr.latency_ms:.0f}ms)") + for f in cr.failures: + print(f" 
{f.evaluator_name}: {f.message}") + print() + + if args.html: + report.to_html(args.html) + print(f"HTML report: {args.html}") + if args.junit: + report.to_junit_xml(args.junit) + print(f"JUnit XML: {args.junit}") + if args.json_out: + report.to_json(args.json_out) + print(f"JSON report: {args.json_out}") + + if args.baseline: + store = BaselineStore(args.baseline) + result = store.compare(report) + if result.is_regression: + print(f"\nREGRESSIONS DETECTED: {result.regressions}") + print(f"Accuracy delta: {result.accuracy_delta:+.2%}") + sys.exit(1) + else: + print(f"\nNo regressions (accuracy delta: {result.accuracy_delta:+.2%})") + if result.improvements: + print(f"Improvements: {result.improvements}") + store.save(report) + print(f"Baseline saved to {args.baseline}/") + + elif args.command == "compare": + store = BaselineStore(args.baseline) + result = store.compare(report) + + if result.is_regression: + print("REGRESSIONS DETECTED:") + for name in result.regressions: + print(f" - {name}") + print(f"Accuracy: {result.accuracy_delta:+.2%}") + print(f"Latency p50: {result.latency_p50_delta:+.0f}ms") + print(f"Cost: ${result.cost_delta:+.6f}") + sys.exit(1) + else: + print("No regressions detected.") + if result.improvements: + print(f"Improvements: {result.improvements}") + print(f"Accuracy: {result.accuracy_delta:+.2%}") + if getattr(args, "save", False): + store.save(report) + print(f"Baseline updated at {args.baseline}/") + + # Exit with non-zero if accuracy is 0 + if report.accuracy == 0.0 and report.metadata.total_cases > 0: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/selectools/evals/html.py b/src/selectools/evals/html.py index 02ddff1..04eeaa7 100644 --- a/src/selectools/evals/html.py +++ b/src/selectools/evals/html.py @@ -1,19 +1,100 @@ -"""Self-contained HTML report renderer.""" +"""Self-contained interactive HTML report renderer.""" from __future__ import annotations import html +import math from pathlib import Path -from 
typing import Any, Union +from typing import Any, List, Union from .types import CaseVerdict -def render_html_report(report: Any, filepath: Union[str, Path]) -> None: - """Render an EvalReport as a self-contained HTML file.""" +def _donut_svg(pass_n: int, fail_n: int, error_n: int, skip_n: int) -> str: + """Generate an SVG donut chart for pass/fail/error/skip distribution.""" + total = pass_n + fail_n + error_n + skip_n + if total == 0: + return "" + segments = [ + (pass_n, "#4ade80"), + (fail_n, "#f87171"), + (error_n, "#fbbf24"), + (skip_n, "#64748b"), + ] + cx, cy, r = 60, 60, 50 + inner_r = 35 + paths: List[str] = [] + start_angle = -90.0 + for count, color in segments: + if count == 0: + continue + sweep = (count / total) * 360 + end_angle = start_angle + sweep + large = 1 if sweep > 180 else 0 + sa = math.radians(start_angle) + ea = math.radians(end_angle) + x1_o, y1_o = cx + r * math.cos(sa), cy + r * math.sin(sa) + x2_o, y2_o = cx + r * math.cos(ea), cy + r * math.sin(ea) + x1_i, y1_i = cx + inner_r * math.cos(ea), cy + inner_r * math.sin(ea) + x2_i, y2_i = cx + inner_r * math.cos(sa), cy + inner_r * math.sin(sa) + d = ( + f"M {x1_o:.1f} {y1_o:.1f} " + f"A {r} {r} 0 {large} 1 {x2_o:.1f} {y2_o:.1f} " + f"L {x1_i:.1f} {y1_i:.1f} " + f"A {inner_r} {inner_r} 0 {large} 0 {x2_i:.1f} {y2_i:.1f} Z" + ) + paths.append(f'') + start_angle = end_angle + return f'' f'{"".join(paths)}' + + +def _histogram_svg(latencies: List[float]) -> str: + """Generate an SVG histogram of latency distribution.""" + if not latencies: + return "" + min_v = min(latencies) + max_v = max(latencies) + if max_v == min_v: + max_v = min_v + 1 + n_bins = min(12, len(latencies)) + bin_width = (max_v - min_v) / n_bins + bins = [0] * n_bins + for v in latencies: + idx = min(int((v - min_v) / bin_width), n_bins - 1) + bins[idx] += 1 + max_count = max(bins) or 1 + w, h = 300, 100 + bar_w = w / n_bins - 2 + bars: List[str] = [] + for i, count in enumerate(bins): + bar_h = (count / max_count) * (h - 
20) + x = i * (w / n_bins) + 1 + y = h - 15 - bar_h + bars.append( + f'' + ) + label_v = min_v + (i + 0.5) * bin_width + if i % max(1, n_bins // 4) == 0: + bars.append( + f'{label_v:.0f}' + ) + return ( + f'{" ".join(bars)}' + f'Latency Distribution (ms)' + ) + + +def render_html_report(report: Any, filepath: Union[str, Path]) -> None: # noqa: C901 + """Render an EvalReport as a self-contained interactive HTML file.""" + # Build table rows with expandable details rows = [] for i, cr in enumerate(report.case_results): name = html.escape(cr.case.name or cr.case.input[:60]) + input_text = html.escape(cr.case.input[:300]) verdict_class = { CaseVerdict.PASS: "pass", CaseVerdict.FAIL: "fail", @@ -21,123 +102,198 @@ def render_html_report(report: Any, filepath: Union[str, Path]) -> None: CaseVerdict.SKIP: "skip", }.get(cr.verdict, "") - failure_html = "" + # Expandable detail content + detail_parts = [f"Input: {input_text}"] + if cr.agent_result: + output = html.escape((cr.agent_result.content or "")[:500]) + detail_parts.append(f"Output: {output}") + if cr.agent_result.reasoning: + reasoning = html.escape(str(cr.agent_result.reasoning)[:300]) + detail_parts.append(f"Reasoning: {reasoning}") + if cr.tool_calls: + detail_parts.append(f"Tools: {html.escape(', '.join(cr.tool_calls))}") if cr.failures: items = "".join( - f"
  • {html.escape(f.evaluator_name)}: " + f"
  • {html.escape(f.evaluator_name)}: " f"{html.escape(f.message)}
  • " for f in cr.failures ) - failure_html = f'
      {items}
    ' - elif cr.error: - failure_html = f'
    {html.escape(cr.error)}
    ' + detail_parts.append(f"Failures:
      {items}
    ") + if cr.error: + detail_parts.append( + f"Error: " + f"{html.escape(cr.error)}" + ) + + detail_html = "
    ".join(detail_parts) + tags_data = html.escape(" ".join(cr.case.tags)) if cr.case.tags else "" + fail_count = len(cr.failures) if cr.failures else (1 if cr.error else 0) - tools = ", ".join(cr.tool_calls) if cr.tool_calls else "-" + # Build tag pills outside f-string to avoid backslash issue + tag_pills = "" + if cr.case.tags: + pill_items = "".join( + '' + html.escape(t) + "" for t in cr.case.tags + ) + tag_pills = '' + pill_items + "" rows.append( - f"" + f"" f"{i + 1}" - f"{name}" + f"{name}{tag_pills}" f"{cr.verdict.value}" f"{cr.latency_ms:.0f}ms" f"${cr.cost_usd:.6f}" - f"{html.escape(tools)}" - f"{failure_html}" + f"{fail_count}" + f"" + f"" + f"
    {detail_html}
    " f"" ) table_rows = "\n".join(rows) + # Charts + donut = _donut_svg(report.pass_count, report.fail_count, report.error_count, report.skip_count) + latencies = [cr.latency_ms for cr in report.case_results if cr.verdict != CaseVerdict.SKIP] + histogram = _histogram_svg(latencies) + + # Failure breakdown failures_by_eval = report.failures_by_evaluator() - eval_breakdown = "" + eval_bars = "" if failures_by_eval: - items = "".join( - f"
  • {html.escape(k)}: {v}
  • " + max_f = max(failures_by_eval.values()) + bars = "".join( + f"
    " + f"{html.escape(k)}" + f"
    " + f"{v}
    " for k, v in sorted(failures_by_eval.items(), key=lambda x: -x[1]) ) - eval_breakdown = f"

    Failures by Evaluator

      {items}
    " + eval_bars = f"

    Failures by Evaluator

    {bars}
    " - content = f""" + # Collect unique tags for filter buttons + all_tags = sorted({t for cr in report.case_results for t in cr.case.tags}) + tag_buttons = "".join( + f"" + for t in all_tags + ) + filter_bar = "" + if all_tags: + filter_bar = ( + f"
    " + f"" + f"" + f"" + f"{tag_buttons}
    " + ) + else: + filter_bar = ( + "
    " + "" + "" + "" + "
    " + ) + + acc_class = "good" if report.accuracy >= 0.9 else "warn" if report.accuracy >= 0.7 else "bad" + + page = f""" Eval Report: {html.escape(report.metadata.suite_name)} +

    Eval Report: {html.escape(report.metadata.suite_name)}

    -

    {report.metadata.model or 'unknown model'} · - {report.metadata.provider or 'unknown provider'} · - {report.metadata.total_cases} cases · - {report.metadata.duration_ms:.0f}ms

    - -
    -
    -
    Accuracy
    -
    {report.accuracy:.1%}
    -
    -
    -
    Pass / Fail / Error
    -
    {report.pass_count} / {report.fail_count} / {report.error_count}
    -
    -
    -
    Latency p50
    -
    {report.latency_p50:.0f}ms
    -
    -
    -
    Latency p95
    -
    {report.latency_p95:.0f}ms
    -
    -
    -
    Total Cost
    -
    ${report.total_cost:.6f}
    +
    {report.metadata.model or 'unknown model'} · {report.metadata.provider or 'unknown provider'} · {report.metadata.total_cases} cases · {report.metadata.duration_ms:.0f}ms
    + +
    +
    +
    +
    Accuracy
    {report.accuracy:.1%}
    +
    Pass
    {report.pass_count}
    +
    Fail
    {report.fail_count}
    +
    Latency p50
    {report.latency_p50:.0f}ms
    +
    Latency p95
    {report.latency_p95:.0f}ms
    +
    Total Cost
    ${report.total_cost:.4f}
    +
    Cost/Case
    ${report.cost_per_case:.6f}
    +
    Tokens
    {report.total_tokens:,}
    +
    Errors
    {report.error_count}
    +
    -
    -
    Total Tokens
    -
    {report.total_tokens}
    +
    +
    + {donut} +
    {histogram}
    +
    +
    + Pass ({report.pass_count}) + Fail ({report.fail_count}) + Error ({report.error_count}) + Skip ({report.skip_count}) +
    -{eval_breakdown} +{eval_bars} + +{filter_bar} - +
    - + {table_rows} @@ -145,11 +301,36 @@ def render_html_report(report: Any, filepath: Union[str, Path]) -> None:
    #Test CaseVerdictLatencyCostToolsDetails
    #Test CaseVerdictLatencyCostIssues
    - Generated by Selectools v{html.escape(report.metadata.selectools_version)} · - Run ID: {html.escape(report.metadata.run_id)} · - An open-source project from NichevLabs + Generated by Selectools v{html.escape(report.metadata.selectools_version)} · Run ID: {html.escape(report.metadata.run_id)} + An open-source project from NichevLabs
    + + """ - Path(filepath).write_text(content) + Path(filepath).write_text(page)