Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/selectools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
__version__ = "0.16.7"

# Import submodules (lazy loading for optional dependencies)
from . import embeddings, guardrails, models, rag, toolbox
from . import embeddings, evals, guardrails, models, rag, toolbox
from .agent import Agent, AgentConfig
from .analytics import AgentAnalytics, ToolMetrics
from .audit import AuditLogger, PrivacyLevel
from .cache import Cache, CacheKeyBuilder, CacheStats, InMemoryCache
from .coherence import CoherenceResult
from .entity_memory import Entity, EntityMemory
from .evals import EvalReport, EvalSuite, TestCase
from .exceptions import (
GraphExecutionError,
MemoryLimitExceededError,
Expand Down
74 changes: 74 additions & 0 deletions src/selectools/evals/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Eval framework — evaluate agent accuracy, tool use, latency, and cost."""

from .dataset import DatasetLoader
from .evaluators import (
ContainsEvaluator,
CustomEvaluator,
EndsWithEvaluator,
Evaluator,
InjectionResistanceEvaluator,
JsonValidityEvaluator,
LengthEvaluator,
OutputEvaluator,
PerformanceEvaluator,
PIILeakEvaluator,
StartsWithEvaluator,
StructuredOutputEvaluator,
ToolUseEvaluator,
)
from .llm_evaluators import (
BiasEvaluator,
CoherenceEvaluator,
CompletenessEvaluator,
CorrectnessEvaluator,
FaithfulnessEvaluator,
HallucinationEvaluator,
LLMJudgeEvaluator,
RelevanceEvaluator,
SummaryEvaluator,
ToxicityEvaluator,
)
from .regression import BaselineStore, RegressionResult
from .report import EvalReport
from .suite import EvalSuite
from .types import CaseResult, CaseVerdict, EvalFailure, EvalMetadata, TestCase

__all__ = [
# Core
"EvalSuite",
"TestCase",
"CaseResult",
"CaseVerdict",
"EvalFailure",
"EvalMetadata",
"EvalReport",
"DatasetLoader",
"BaselineStore",
"RegressionResult",
# Evaluator protocol
"Evaluator",
# Deterministic evaluators (12)
"ToolUseEvaluator",
"ContainsEvaluator",
"OutputEvaluator",
"StructuredOutputEvaluator",
"PerformanceEvaluator",
"JsonValidityEvaluator",
"LengthEvaluator",
"StartsWithEvaluator",
"EndsWithEvaluator",
"PIILeakEvaluator",
"InjectionResistanceEvaluator",
"CustomEvaluator",
# LLM-as-judge evaluators (10)
"LLMJudgeEvaluator",
"CorrectnessEvaluator",
"RelevanceEvaluator",
"FaithfulnessEvaluator",
"HallucinationEvaluator",
"ToxicityEvaluator",
"CoherenceEvaluator",
"CompletenessEvaluator",
"BiasEvaluator",
"SummaryEvaluator",
]
63 changes: 63 additions & 0 deletions src/selectools/evals/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Load test cases from JSON, YAML, or dict lists."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, List, Union

from .types import TestCase

# Names of TestCase's dataclass fields, i.e. the keys accepted by its
# constructor. ``__dataclass_fields__`` maps field name -> Field, so its
# keys are exactly the field names; frozen since this never changes.
_TESTCASE_FIELDS = frozenset(TestCase.__dataclass_fields__)


class DatasetLoader:
    """Load TestCase lists from files or dicts.

    Supported sources: JSON files (stdlib), YAML files (requires PyYAML),
    or in-memory lists of dicts. File loaders unwrap a top-level
    ``{"cases": [...]}`` mapping automatically.
    """

    @staticmethod
    def from_json(filepath: Union[str, Path]) -> List[TestCase]:
        """Load test cases from a JSON file.

        The file may contain either a bare list of case dicts, or a
        mapping with a ``"cases"`` key holding that list.
        """
        # Explicit UTF-8: read_text() otherwise uses the platform default
        # encoding, which breaks non-ASCII datasets on some systems.
        data = json.loads(Path(filepath).read_text(encoding="utf-8"))
        if isinstance(data, dict) and "cases" in data:
            data = data["cases"]
        return DatasetLoader.from_dicts(data)

    @staticmethod
    def from_yaml(filepath: Union[str, Path]) -> List[TestCase]:
        """Load test cases from a YAML file. Requires PyYAML.

        Raises:
            ImportError: if PyYAML is not installed.
        """
        try:
            import yaml  # type: ignore[import-untyped]
        except ImportError as err:
            # Chain the original error so the underlying import failure
            # stays visible in the traceback.
            raise ImportError(
                "PyYAML is required for YAML datasets: pip install pyyaml"
            ) from err
        # safe_load only: never execute arbitrary YAML tags from datasets.
        data = yaml.safe_load(Path(filepath).read_text(encoding="utf-8"))
        if isinstance(data, dict) and "cases" in data:
            data = data["cases"]
        return DatasetLoader.from_dicts(data)

    @staticmethod
    def from_dicts(data: List[Dict[str, Any]]) -> List[TestCase]:
        """Convert a list of dicts to TestCase objects.

        Keys matching TestCase fields become constructor arguments;
        unknown keys are stored in ``TestCase.metadata``. Input dicts are
        never mutated.
        """
        cases: List[TestCase] = []
        for item in data:
            known = {k: v for k, v in item.items() if k in _TESTCASE_FIELDS}
            unknown = {k: v for k, v in item.items() if k not in _TESTCASE_FIELDS}
            if unknown:
                # Copy before updating: ``known["metadata"]`` is the very
                # dict stored in the caller's item, and update() would
                # mutate the input in place. ``or {}`` also tolerates an
                # explicit ``"metadata": None``.
                meta = dict(known.get("metadata") or {})
                meta.update(unknown)
                known["metadata"] = meta
            cases.append(TestCase(**known))
        return cases

    @staticmethod
    def load(filepath: Union[str, Path]) -> List[TestCase]:
        """Auto-detect format from file extension (.json, .yaml, .yml).

        Extension matching is case-insensitive (``.JSON`` works too).

        Raises:
            ValueError: if the extension is not a supported format.
        """
        path = Path(filepath)
        suffix = path.suffix.lower()
        if suffix == ".json":
            return DatasetLoader.from_json(path)
        if suffix in (".yaml", ".yml"):
            return DatasetLoader.from_yaml(path)
        raise ValueError(f"Unsupported file format: {path.suffix}")
Loading
Loading