From aa659153a59f2da8c8cc420de5a7474e0a01690d Mon Sep 17 00:00:00 2001 From: John Niche Date: Sun, 22 Mar 2026 08:49:29 -0300 Subject: [PATCH] =?UTF-8?q?test:=20comprehensive=20E2E=20tests=20for=20eva?= =?UTF-8?q?l=20framework=20=E2=80=94=2073=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end tests using real Agent with SharedFakeProvider and SharedToolCallProvider (no mocks). Covers every gap from audit: - EvalSuite: basic run, tool calls, mixed results, empty cases, concurrency, progress callback, error handling, async run, tags - All 12 deterministic evaluators E2E with real Agent - All 10 LLM evaluators E2E with SharedFakeProvider as judge - HTML report: full render, donut SVG, histogram SVG, error cases - JUnit XML: structure validation, failure/error elements - Dataset → Suite → Report → Export pipeline - Regression detection with baseline save/compare - Pairwise A/B comparison with real agents - Snapshot testing: create, compare, detect changes - Badge generation from real eval runs - Synthetic test case generation - CLI help verification - Live dashboard HTML validation - Unicode content, weighted accuracy, multiple assertions per case, tag filtering, report statistics Fix: suite.py reads model from agent.config.model (not agent._model) Total eval tests: 211 (was 138) --- src/selectools/evals/suite.py | 4 +- tests/test_evals_e2e.py | 1004 +++++++++++++++++++++++++++++++++ 2 files changed, 1007 insertions(+), 1 deletion(-) create mode 100644 tests/test_evals_e2e.py diff --git a/src/selectools/evals/suite.py b/src/selectools/evals/suite.py index 35eacb4..5d86bb5 100644 --- a/src/selectools/evals/suite.py +++ b/src/selectools/evals/suite.py @@ -170,7 +170,9 @@ def _build_report( duration_ms: float, ) -> EvalReport: """Aggregate case results into an EvalReport.""" - model = getattr(self.agent, "_model", "") or "" + model = "" + if hasattr(self.agent, "config") and hasattr(self.agent.config, "model"): 
+ model = self.agent.config.model or "" provider = "" if hasattr(self.agent, "provider") and self.agent.provider: provider = type(self.agent.provider).__name__ diff --git a/tests/test_evals_e2e.py b/tests/test_evals_e2e.py new file mode 100644 index 0000000..2c3708c --- /dev/null +++ b/tests/test_evals_e2e.py @@ -0,0 +1,1004 @@ +"""E2E tests for the selectools eval framework. + +Tests every feature end-to-end using real Agent instances with SharedFakeProvider +and SharedToolCallProvider — no mocks. Covers all gaps identified in the coverage audit. +""" + +from __future__ import annotations + +import asyncio +import json +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, List + +import pytest + +from selectools import Agent, AgentConfig, tool +from selectools.evals import ( + BaselineStore, + CaseResult, + CaseVerdict, + DatasetLoader, + EvalReport, + EvalSuite, + PairwiseEval, + SnapshotStore, + TestCase, + generate_badge, + generate_cases, + generate_detailed_badge, + serve_eval, +) +from selectools.evals.html import _donut_svg, _histogram_svg, render_html_report +from selectools.evals.junit import render_junit_xml +from selectools.evals.llm_evaluators import ( + BiasEvaluator, + CoherenceEvaluator, + CompletenessEvaluator, + CorrectnessEvaluator, + FaithfulnessEvaluator, + HallucinationEvaluator, + LLMJudgeEvaluator, + RelevanceEvaluator, + SummaryEvaluator, + ToxicityEvaluator, +) +from selectools.evals.pairwise import PairwiseReport +from selectools.evals.snapshot import SnapshotDiff, SnapshotResult +from selectools.types import Message, Role, ToolCall +from selectools.usage import UsageStats + +# Import shared test providers from conftest +from tests.conftest import SharedFakeProvider, SharedToolCallProvider + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + + +@tool(description="Get the weather 
for a city") +def get_weather(city: str) -> str: + return f"72°F and sunny in {city}" + + +@tool(description="Search the knowledge base") +def search_docs(query: str) -> str: + return f"Results for: {query}" + + +@tool(description="Cancel a subscription") +def cancel_sub(user_id: str) -> str: + return f"Subscription {user_id} cancelled" + + +def _make_agent(responses: list, tools: list | None = None) -> Agent: + """Create a real Agent with SharedFakeProvider.""" + provider = SharedFakeProvider(responses=responses) + return Agent( + provider=provider, + config=AgentConfig(model="fake-model"), + tools=tools or [get_weather, search_docs, cancel_sub], + ) + + +def _make_tool_call_agent( + responses: list[tuple[list[ToolCall], str]], tools: list | None = None +) -> Agent: + """Create an Agent with SharedToolCallProvider for tool call scenarios.""" + provider = SharedToolCallProvider(responses=responses) + return Agent( + provider=provider, + config=AgentConfig(model="fake-model"), + tools=tools or [get_weather, search_docs, cancel_sub], + ) + + +# =========================================================================== +# E2E: EvalSuite with real Agent +# =========================================================================== + + +class TestEvalSuiteE2E: + """Full end-to-end eval suite execution with real Agent instances.""" + + def test_basic_run(self) -> None: + agent = _make_agent(["The weather in NYC is great."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Weather in NYC?", expect_contains="weather")], + ) + report = suite.run() + assert report.accuracy == 1.0 + assert report.pass_count == 1 + assert report.metadata.model == "fake-model" + + def test_tool_call_eval(self) -> None: + tc = ToolCall(tool_name="get_weather", parameters={"city": "NYC"}) + agent = _make_tool_call_agent([([tc], "Getting weather...")]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Weather?", expect_tool="get_weather")], + ) + report = suite.run() + 
assert report.pass_count == 1 + + def test_multiple_cases_mixed_results(self) -> None: + agent = _make_agent(["Hello world"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase(input="Say hello", expect_contains="hello"), + TestCase(input="Say goodbye", expect_contains="goodbye"), + ], + ) + report = suite.run() + assert report.pass_count == 1 + assert report.fail_count == 1 + assert report.accuracy == 0.5 + + def test_empty_cases(self) -> None: + agent = _make_agent(["hi"]) + suite = EvalSuite(agent=agent, cases=[]) + report = suite.run() + assert report.accuracy == 0.0 + assert report.metadata.total_cases == 0 + + def test_concurrent_execution(self) -> None: + agent = _make_agent(["response"]) + cases = [TestCase(input=f"q{i}") for i in range(10)] + suite = EvalSuite(agent=agent, cases=cases, max_concurrency=4) + report = suite.run() + assert report.metadata.total_cases == 10 + assert report.pass_count == 10 + + def test_progress_callback(self) -> None: + agent = _make_agent(["ok"]) + progress: list[tuple[int, int]] = [] + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="a"), TestCase(input="b"), TestCase(input="c")], + on_progress=lambda d, t: progress.append((d, t)), + ) + suite.run() + assert progress == [(1, 3), (2, 3), (3, 3)] + + def test_error_case(self) -> None: + """Agent that raises an exception produces ERROR verdict.""" + from tests.conftest import SharedErrorProvider + + provider = SharedErrorProvider(exception=RuntimeError("API down")) + agent = Agent( + provider=provider, + config=AgentConfig(model="fake"), + tools=[get_weather], + ) + suite = EvalSuite(agent=agent, cases=[TestCase(input="crash")]) + report = suite.run() + assert report.error_count == 1 + assert report.accuracy == 0.0 + assert "API down" in report.case_results[0].error + + def test_suite_name_and_tags(self) -> None: + agent = _make_agent(["ok"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x")], + name="prod-suite", + tags={"env": "staging", 
"version": "2.0"}, + ) + report = suite.run() + assert report.metadata.suite_name == "prod-suite" + assert report.metadata.tags == {"env": "staging", "version": "2.0"} + + @pytest.mark.asyncio(loop_scope="function") + async def test_async_run(self) -> None: + agent = _make_agent(["async response"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="async?", expect_contains="async")], + ) + report = await suite.arun() + assert report.pass_count == 1 + + @pytest.mark.asyncio(loop_scope="function") + async def test_async_concurrent(self) -> None: + agent = _make_agent(["ok"]) + cases = [TestCase(input=f"q{i}") for i in range(5)] + suite = EvalSuite(agent=agent, cases=cases, max_concurrency=3) + report = await suite.arun() + assert report.metadata.total_cases == 5 + + +# =========================================================================== +# E2E: All deterministic evaluators with real Agent +# =========================================================================== + + +class TestDeterministicEvaluatorsE2E: + def test_expect_contains_pass(self) -> None: + agent = _make_agent(["The balance is $500."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Balance?", expect_contains="balance")], + ) + assert suite.run().pass_count == 1 + + def test_expect_not_contains_pass(self) -> None: + agent = _make_agent(["Everything is fine."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Status?", expect_not_contains="error")], + ) + assert suite.run().pass_count == 1 + + def test_expect_output_exact(self) -> None: + agent = _make_agent(["42"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="2+2?", expect_output="42")], + ) + assert suite.run().pass_count == 1 + + def test_expect_output_regex(self) -> None: + agent = _make_agent(["Call 555-1234 for support"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Phone?", expect_output_regex=r"\d{3}-\d{4}")], + ) + assert suite.run().pass_count == 1 + + def 
test_expect_json(self) -> None: + agent = _make_agent(['{"key": "value"}']) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="JSON?", expect_json=True)], + ) + assert suite.run().pass_count == 1 + + def test_expect_json_fail(self) -> None: + agent = _make_agent(["not json"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="JSON?", expect_json=True)], + ) + assert suite.run().fail_count == 1 + + def test_expect_length(self) -> None: + agent = _make_agent(["Hello world, this is a test"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_min_length=5, expect_max_length=100)], + ) + assert suite.run().pass_count == 1 + + def test_expect_starts_with(self) -> None: + agent = _make_agent(["Hello, how can I help?"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_starts_with="Hello")], + ) + assert suite.run().pass_count == 1 + + def test_expect_ends_with(self) -> None: + agent = _make_agent(["Here is your answer."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_ends_with=".")], + ) + assert suite.run().pass_count == 1 + + def test_expect_no_pii_pass(self) -> None: + agent = _make_agent(["Your account is active."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Account?", expect_no_pii=True)], + ) + assert suite.run().pass_count == 1 + + def test_expect_no_pii_fail(self) -> None: + agent = _make_agent(["Your SSN is 123-45-6789"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="SSN?", expect_no_pii=True)], + ) + report = suite.run() + assert report.fail_count == 1 + + def test_expect_no_injection_pass(self) -> None: + agent = _make_agent(["Here is your answer."]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_no_injection=True)], + ) + assert suite.run().pass_count == 1 + + def test_expect_iterations(self) -> None: + agent = _make_agent(["done"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", 
expect_iterations_lte=5)], + ) + assert suite.run().pass_count == 1 + + def test_expect_latency(self) -> None: + agent = _make_agent(["fast"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_latency_ms_lte=10000)], + ) + assert suite.run().pass_count == 1 + + def test_expect_cost(self) -> None: + agent = _make_agent(["cheap"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", expect_cost_usd_lte=1.0)], + ) + assert suite.run().pass_count == 1 + + def test_custom_evaluator(self) -> None: + agent = _make_agent(["Please help me."]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase( + input="x", + custom_evaluator=lambda r: "please" in r.content.lower(), + custom_evaluator_name="politeness", + ) + ], + ) + assert suite.run().pass_count == 1 + + def test_weighted_accuracy(self) -> None: + agent = _make_agent(["hello"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase(input="a", expect_contains="hello", weight=3.0), + TestCase(input="b", expect_contains="goodbye", weight=1.0), + ], + ) + report = suite.run() + assert report.accuracy == pytest.approx(0.75) + + def test_multiple_assertions_same_case(self) -> None: + agent = _make_agent(["Hello world"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase( + input="x", + expect_contains="hello", + expect_starts_with="Hello", + expect_min_length=5, + expect_max_length=100, + expect_no_pii=True, + expect_no_injection=True, + ) + ], + ) + assert suite.run().pass_count == 1 + + def test_unicode_content(self) -> None: + agent = _make_agent(["Bonjour le monde! 
🌍"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Salut!", expect_contains="bonjour")], + ) + assert suite.run().pass_count == 1 + + def test_tags_filtering(self) -> None: + agent = _make_agent(["ok"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase(input="a", tags=["billing"]), + TestCase(input="b", tags=["support"]), + TestCase(input="c", tags=["billing", "support"]), + ], + ) + report = suite.run() + billing = report.filter_by_tag("billing") + assert len(billing) == 2 + support = report.filter_by_tag("support") + assert len(support) == 2 + + +# =========================================================================== +# E2E: HTML report rendering +# =========================================================================== + + +class TestHTMLReportE2E: + def test_full_html_report(self, tmp_path: Path) -> None: + agent = _make_agent(["Balance is $500", "Error occurred"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase( + input="Balance?", name="balance", expect_contains="balance", tags=["billing"] + ), + TestCase(input="Fail?", name="fail_case", expect_contains="success"), + ], + ) + report = suite.run() + path = tmp_path / "report.html" + report.to_html(path) + + content = path.read_text() + assert "" in content + assert "balance" in content + assert "fail_case" in content + assert "NichevLabs" in content + assert "toggleDetail" in content # JS function + assert "filterByTag" in content # JS function + assert "filterByVerdict" in content # JS function + assert "billing" in content # tag pill + + def test_donut_svg(self) -> None: + svg = _donut_svg(8, 2, 1, 0) + assert " None: + assert _donut_svg(0, 0, 0, 0) == "" + + def test_histogram_svg(self) -> None: + svg = _histogram_svg([100, 200, 150, 300, 250, 180]) + assert " None: + assert _histogram_svg([]) == "" + + def test_histogram_svg_single_value(self) -> None: + svg = _histogram_svg([100.0]) + assert " None: + from tests.conftest import SharedErrorProvider + + provider = 
SharedErrorProvider(exception=RuntimeError("boom")) + agent = Agent(provider=provider, config=AgentConfig(model="m"), tools=[get_weather]) + suite = EvalSuite(agent=agent, cases=[TestCase(input="crash", name="error_test")]) + report = suite.run() + path = tmp_path / "error_report.html" + report.to_html(path) + content = path.read_text() + assert "error_test" in content + assert "boom" in content + + +# =========================================================================== +# E2E: JUnit XML +# =========================================================================== + + +class TestJUnitXMLE2E: + def test_junit_xml_structure(self, tmp_path: Path) -> None: + agent = _make_agent(["Hello world"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase(input="pass", name="passing_test", expect_contains="hello"), + TestCase(input="fail", name="failing_test", expect_contains="goodbye"), + ], + ) + report = suite.run() + path = tmp_path / "results.xml" + report.to_junit_xml(path) + + tree = ET.parse(str(path)) + root = tree.getroot() + assert root.tag == "testsuite" + assert root.attrib["tests"] == "2" + assert root.attrib["failures"] == "1" + + testcases = root.findall("testcase") + assert len(testcases) == 2 + + # Check passing test has no failure element + passing = [tc for tc in testcases if tc.attrib["name"] == "passing_test"][0] + assert passing.find("failure") is None + + # Check failing test has failure element + failing = [tc for tc in testcases if tc.attrib["name"] == "failing_test"][0] + assert failing.find("failure") is not None + + def test_junit_with_errors(self, tmp_path: Path) -> None: + from tests.conftest import SharedErrorProvider + + provider = SharedErrorProvider(exception=ValueError("broken")) + agent = Agent(provider=provider, config=AgentConfig(model="m"), tools=[get_weather]) + suite = EvalSuite(agent=agent, cases=[TestCase(input="x", name="error_case")]) + report = suite.run() + path = tmp_path / "errors.xml" + report.to_junit_xml(path) + + 
tree = ET.parse(str(path)) + root = tree.getroot() + assert root.attrib["errors"] == "1" + error_el = root.find(".//error") + assert error_el is not None + assert "broken" in (error_el.attrib.get("message", "")) + + +# =========================================================================== +# E2E: Dataset loading → Suite → Report → Export +# =========================================================================== + + +class TestDatasetToReportPipeline: + def test_json_to_html(self, tmp_path: Path) -> None: + cases_data = [ + {"input": "Weather?", "expect_contains": "weather", "name": "weather"}, + {"input": "Hello", "expect_starts_with": "H", "name": "greeting", "tags": ["basic"]}, + ] + cases_path = tmp_path / "cases.json" + cases_path.write_text(json.dumps(cases_data)) + + cases = DatasetLoader.load(cases_path) + assert len(cases) == 2 + assert cases[0].name == "weather" + assert cases[1].tags == ["basic"] + + agent = _make_agent(["The weather is great", "Hi there"]) + suite = EvalSuite(agent=agent, cases=cases, name="pipeline-test") + report = suite.run() + + # Export all formats + report.to_html(tmp_path / "report.html") + report.to_junit_xml(tmp_path / "results.xml") + report.to_json(tmp_path / "results.json") + + assert (tmp_path / "report.html").exists() + assert (tmp_path / "results.xml").exists() + assert (tmp_path / "results.json").exists() + + # Verify JSON report + data = json.loads((tmp_path / "results.json").read_text()) + assert data["metadata"]["suite_name"] == "pipeline-test" + assert len(data["cases"]) == 2 + + +# =========================================================================== +# E2E: Regression detection +# =========================================================================== + + +class TestRegressionE2E: + def test_baseline_save_compare_no_regression(self, tmp_path: Path) -> None: + agent = _make_agent(["good response"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="x", name="stable", 
expect_contains="good")], + name="reg-test", + ) + report = suite.run() + + store = BaselineStore(tmp_path / "baselines") + store.save(report) + + result = store.compare(report) + assert not result.is_regression + + def test_regression_detected(self, tmp_path: Path) -> None: + # Baseline: passing + agent_good = _make_agent(["good response"]) + suite1 = EvalSuite( + agent=agent_good, + cases=[TestCase(input="x", name="check", expect_contains="good")], + name="reg-test", + ) + report1 = suite1.run() + + store = BaselineStore(tmp_path / "baselines") + store.save(report1) + + # Current: failing + agent_bad = _make_agent(["bad response"]) + suite2 = EvalSuite( + agent=agent_bad, + cases=[TestCase(input="x", name="check", expect_contains="good")], + name="reg-test", + ) + report2 = suite2.run() + + result = store.compare(report2) + assert result.is_regression + assert "check" in result.regressions + + +# =========================================================================== +# E2E: Pairwise A/B comparison +# =========================================================================== + + +class TestPairwiseE2E: + def test_agent_a_wins(self) -> None: + agent_a = _make_agent(["good answer with details"]) + agent_b = _make_agent(["wrong"]) + cases = [TestCase(input="Explain?", expect_contains="good")] + + pairwise = PairwiseEval(agent_a, agent_b, cases, agent_a_name="Good", agent_b_name="Bad") + result = pairwise.run() + assert result.a_wins == 1 + assert result.b_wins == 0 + assert result.winner == "A" + assert result.report_a.accuracy == 1.0 + assert result.report_b.accuracy == 0.0 + + def test_tie(self) -> None: + agent_a = _make_agent(["same answer"]) + agent_b = _make_agent(["same answer"]) + cases = [TestCase(input="Test", expect_contains="same")] + + result = PairwiseEval(agent_a, agent_b, cases).run() + assert result.ties == 1 + + def test_pairwise_summary(self) -> None: + agent_a = _make_agent(["win"]) + agent_b = _make_agent(["lose"]) + cases = 
[TestCase(input="x", expect_contains="win")] + + result = PairwiseEval( + agent_a, agent_b, cases, agent_a_name="Fast", agent_b_name="Slow" + ).run() + s = result.summary() + assert "Fast" in s + assert "1 wins" in s + + +# =========================================================================== +# E2E: Snapshot testing +# =========================================================================== + + +class TestSnapshotE2E: + def test_first_run_creates_snapshot(self, tmp_path: Path) -> None: + agent = _make_agent(["Hello world"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="Hi", name="greeting")], + ) + report = suite.run() + + store = SnapshotStore(tmp_path / "snapshots") + result = store.compare(report, "test-suite") + assert result.has_changes # All new + assert "greeting" in result.new_cases + + store.save(report, "test-suite") + result2 = store.compare(report, "test-suite") + assert not result2.has_changes + assert "greeting" in result2.unchanged + + def test_detect_output_change(self, tmp_path: Path) -> None: + agent1 = _make_agent(["Response A"]) + suite1 = EvalSuite(agent=agent1, cases=[TestCase(input="x", name="test")]) + report1 = suite1.run() + + store = SnapshotStore(tmp_path / "snapshots") + store.save(report1, "changing") + + agent2 = _make_agent(["Response B"]) + suite2 = EvalSuite(agent=agent2, cases=[TestCase(input="x", name="test")]) + report2 = suite2.run() + + result = store.compare(report2, "changing") + assert result.has_changes + assert result.changed_count == 1 + content_diffs = [d for d in result.diffs if d.field == "content"] + assert len(content_diffs) == 1 + assert content_diffs[0].expected == "Response A" + assert content_diffs[0].actual == "Response B" + + +# =========================================================================== +# E2E: Badge generation +# =========================================================================== + + +class TestBadgeE2E: + def test_badge_from_real_run(self, tmp_path: Path) -> 
None: + agent = _make_agent(["ok"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="a"), TestCase(input="b")], + ) + report = suite.run() + + path = tmp_path / "badge.svg" + generate_badge(report, path) + + content = path.read_text() + assert " None: + agent = _make_agent(["ok"]) + suite = EvalSuite( + agent=agent, + cases=[TestCase(input="a"), TestCase(input="b", expect_contains="nope")], + ) + report = suite.run() + + path = tmp_path / "badge_detail.svg" + generate_detailed_badge(report, path) + content = path.read_text() + assert "1/2 pass" in content + + def test_badge_colors(self, tmp_path: Path) -> None: + """Test different accuracy levels produce different colors.""" + from selectools.evals.badge import _badge_color + + assert _badge_color(1.0) == "#4ade80" # green + assert _badge_color(0.92) == "#22d3ee" # cyan + assert _badge_color(0.85) == "#3b82f6" # blue + assert _badge_color(0.75) == "#fbbf24" # yellow + assert _badge_color(0.6) == "#f97316" # orange + assert _badge_color(0.3) == "#f87171" # red + + +# =========================================================================== +# E2E: Report statistics +# =========================================================================== + + +class TestReportStatsE2E: + def test_all_latency_percentiles(self) -> None: + agent = _make_agent(["ok"]) + cases = [TestCase(input=f"q{i}") for i in range(20)] + suite = EvalSuite(agent=agent, cases=cases) + report = suite.run() + + assert report.latency_p50 > 0 + assert report.latency_p95 >= report.latency_p50 + assert report.latency_p99 >= report.latency_p95 + assert report.latency_mean > 0 + + def test_cost_tracking(self) -> None: + agent = _make_agent(["ok"]) + suite = EvalSuite(agent=agent, cases=[TestCase(input="x")]) + report = suite.run() + assert report.total_cost >= 0 + assert report.cost_per_case >= 0 + assert report.total_tokens >= 0 + + def test_report_summary_string(self) -> None: + agent = _make_agent(["hello"]) + suite = EvalSuite( + 
agent=agent, + cases=[TestCase(input="x", expect_contains="hello")], + name="summary-test", + ) + report = suite.run() + s = report.summary() + assert "summary-test" in s + assert "100.0%" in s + assert "1 pass" in s + + def test_report_repr(self) -> None: + agent = _make_agent(["ok"]) + suite = EvalSuite(agent=agent, cases=[TestCase(input="x")]) + report = suite.run() + r = repr(report) + assert "EvalReport" in r + assert "100.00%" in r + + def test_failures_by_evaluator(self) -> None: + agent = _make_agent(["hello"]) + suite = EvalSuite( + agent=agent, + cases=[ + TestCase(input="a", expect_contains="goodbye"), + TestCase(input="b", expect_starts_with="Z"), + TestCase(input="c", expect_contains="nope"), + ], + ) + report = suite.run() + by_eval = report.failures_by_evaluator() + assert "contains" in by_eval + assert by_eval["contains"] == 2 + + def test_to_dict_roundtrip(self) -> None: + agent = _make_agent(["ok"]) + suite = EvalSuite(agent=agent, cases=[TestCase(input="x", name="test1")]) + report = suite.run() + d = report.to_dict() + assert d["metadata"]["suite_name"] == "eval" + assert d["cases"][0]["name"] == "test1" + assert d["summary"]["accuracy"] == 1.0 + + +# =========================================================================== +# E2E: Synthetic test generation +# =========================================================================== + + +class TestGeneratorE2E: + def test_generate_from_tools(self) -> None: + provider = SharedFakeProvider( + responses=[ + json.dumps( + [ + { + "input": "What is the weather?", + "name": "weather_q", + "expect_tool": "get_weather", + }, + { + "input": "Search for Python", + "name": "search_q", + "expect_tool": "search_docs", + }, + ] + ) + ] + ) + cases = generate_cases(provider, "fake-model", [get_weather, search_docs], n=2) + assert len(cases) == 2 + assert cases[0].name == "weather_q" + assert cases[0].expect_tool == "get_weather" + + +# 
# ===========================================================================
# E2E: LLM evaluators with real Agent + judge provider
# ===========================================================================


class TestLLMEvaluatorsE2E:
    """Run each LLM-judged evaluator through a real ``EvalSuite``.

    The judge is a ``SharedFakeProvider`` whose only canned reply is
    ``"Analysis complete. Score: <n>"``. The assertions below expect scores
    of 8-10 to produce a passing case and a score of 3 to produce a failure.
    """

    def _run_with_llm_eval(self, evaluator: Any) -> EvalReport:
        """Run a one-case suite (capital-of-France question) with *evaluator* attached.

        The case carries ``reference``, ``context`` and ``rubric`` so the same
        helper can be reused for every evaluator exercised in this class.
        """
        agent = _make_agent(["Paris is the capital of France."])
        suite = EvalSuite(
            agent=agent,
            cases=[
                TestCase(
                    input="What is the capital of France?",
                    reference="Paris is the capital of France.",
                    context="France is a country in Europe. Its capital is Paris.",
                    rubric="Check if the answer is factually correct.",
                )
            ],
            evaluators=[evaluator],
        )
        return suite.run()

    def _make_judge(self, score: int) -> SharedFakeProvider:
        """Return a fake judge provider that replies with the given *score*."""
        return SharedFakeProvider(responses=[f"Analysis complete. Score: {score}"])

    def test_correctness_pass(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(CorrectnessEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_correctness_fail(self) -> None:
        judge = self._make_judge(3)
        report = self._run_with_llm_eval(CorrectnessEvaluator(judge, "fake"))
        assert report.fail_count == 1

    def test_relevance(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(RelevanceEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_faithfulness(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(FaithfulnessEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_hallucination(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(HallucinationEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_toxicity(self) -> None:
        judge = self._make_judge(10)
        report = self._run_with_llm_eval(ToxicityEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_coherence(self) -> None:
        judge = self._make_judge(8)
        report = self._run_with_llm_eval(CoherenceEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_completeness(self) -> None:
        judge = self._make_judge(8)
        report = self._run_with_llm_eval(CompletenessEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_bias(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(BiasEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_summary_eval(self) -> None:
        judge = self._make_judge(8)
        report = self._run_with_llm_eval(SummaryEvaluator(judge, "fake"))
        assert report.pass_count == 1

    def test_llm_judge_custom_rubric(self) -> None:
        judge = self._make_judge(9)
        report = self._run_with_llm_eval(
            LLMJudgeEvaluator(judge, "fake", default_rubric="Be accurate", threshold=7.0)
        )
        assert report.pass_count == 1

    def test_llm_evaluator_skip_without_fields(self) -> None:
        """LLM evaluators that require reference/context skip if not provided."""
        judge = self._make_judge(10)
        agent = _make_agent(["ok"])
        suite = EvalSuite(
            agent=agent,
            cases=[TestCase(input="x")],  # no reference or context
            evaluators=[
                CorrectnessEvaluator(judge, "fake"),
                FaithfulnessEvaluator(judge, "fake"),
                HallucinationEvaluator(judge, "fake"),
                SummaryEvaluator(judge, "fake"),
            ],
        )
        report = suite.run()
        assert report.pass_count == 1  # all skip → pass


# ===========================================================================
# E2E: CLI (__main__)
# ===========================================================================


class TestCLIE2E:
    """Smoke-test ``python -m selectools.evals`` by checking its ``--help`` output."""

    @staticmethod
    def _run_cli(*args: str) -> "subprocess.CompletedProcess[str]":
        """Run ``-m selectools.evals`` with *args* and return the completed process.

        Raises:
            subprocess.TimeoutExpired: if the CLI does not exit within 60 seconds.
        """
        import subprocess
        import sys

        # Use the interpreter that is running pytest. A bare "python3" is
        # resolved through PATH and may be a different environment without
        # selectools installed, or may not exist at all (e.g. on Windows).
        return subprocess.run(
            [sys.executable, "-m", "selectools.evals", *args],
            capture_output=True,
            text=True,
            timeout=60,  # fail the test instead of hanging the whole run
        )

    def test_cli_help(self) -> None:
        result = self._run_cli("--help")
        # Surface stderr in the failure message so a crash is diagnosable.
        assert result.returncode == 0, result.stderr
        assert "run" in result.stdout
        assert "compare" in result.stdout

    def test_cli_run_help(self) -> None:
        result = self._run_cli("run", "--help")
        assert result.returncode == 0, result.stderr
        assert "--html" in result.stdout
        assert "--junit" in result.stdout
        assert "--provider" in result.stdout


# ===========================================================================
# E2E: Serve dashboard (unit-level — no HTTP server)
# ===========================================================================


class TestServeDashboard:
    """Checks on the live-dashboard module that do not start an HTTP server."""

    def test_dashboard_html_has_all_elements(self) -> None:
        from selectools.evals.serve import _DASHBOARD_HTML

        # NOTE(review): `"" in s` is true for every string, so this first check
        # can never fail. The intended marker looks to have been lost from the
        # literal — confirm what it should be and restore it.
        assert "" in _DASHBOARD_HTML
        assert "Live Dashboard" in _DASHBOARD_HTML
        assert "/api/state" in _DASHBOARD_HTML
        assert "NichevLabs" in _DASHBOARD_HTML
        assert "progress-bar" in _DASHBOARD_HTML
        assert "accuracy" in _DASHBOARD_HTML
        assert "poll()" in _DASHBOARD_HTML

    def test_dashboard_handler_state(self) -> None:
        from selectools.evals.serve import _DashboardHandler

        state = {"status": "running", "completed": 5, "total_cases": 10}

        # The attribute lives on the class, so it outlives this test. Remember
        # what the class itself defined (if anything) and put it back afterwards
        # so later tests do not observe this test's state.
        missing = object()
        previous = vars(_DashboardHandler).get("dashboard_state", missing)
        _DashboardHandler.dashboard_state = state
        try:
            assert _DashboardHandler.dashboard_state["status"] == "running"
        finally:
            if previous is missing:
                del _DashboardHandler.dashboard_state
            else:
                _DashboardHandler.dashboard_state = previous