Merged
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/config.yml
@@ -1,7 +1,7 @@
blank_issues_enabled: true
contact_links:
- name: Documentation
url: https://github.com/intuit/fasteval/tree/main/docs
url: https://fasteval.io
about: Browse the documentation for guides and API reference.
- name: Discussions
url: https://github.com/intuit/fasteval/discussions
25 changes: 21 additions & 4 deletions .github/workflows/ci.yml
@@ -54,13 +54,14 @@ jobs:
- name: Install dependencies
run: uv sync --all-extras --group test

- name: Run tests
- name: Run tests with coverage
run: |
uv run pytest tests/ \
--color=yes \
--cov=fasteval \
--cov-report=term \
--cov-report=xml:coverage.xml \
--cov-fail-under=85 \
-v

- name: Upload coverage
@@ -76,7 +77,13 @@ jobs:
strategy:
fail-fast: false
matrix:
plugin: [fasteval-langfuse, fasteval-langgraph, fasteval-observe]
include:
- plugin: fasteval-langfuse
module: fasteval_langfuse
- plugin: fasteval-langgraph
module: fasteval_langgraph
- plugin: fasteval-observe
module: fasteval_observe
steps:
- uses: actions/checkout@v4

@@ -93,6 +100,16 @@ jobs:
working-directory: plugins/${{ matrix.plugin }}
run: uv sync --all-extras --group dev

- name: Run plugin tests
- name: Install coverage tools
working-directory: plugins/${{ matrix.plugin }}
run: uv run pytest tests/ -v --color=yes
run: uv pip install pytest-cov

- name: Run plugin tests with coverage
working-directory: plugins/${{ matrix.plugin }}
run: |
uv run pytest tests/ \
--color=yes \
--cov=${{ matrix.module }} \
--cov-report=term \
--cov-fail-under=85 \
-v
37 changes: 27 additions & 10 deletions CONTRIBUTING.md
@@ -30,10 +30,10 @@ Thank you for your interest in contributing to fasteval! Whether it's fixing a b
cd fasteval
```

2. Install dependencies:
2. Install all dependencies (including dev and test groups):

```bash
uv sync --all-extras
uv sync --all-extras --group dev --group test
```

3. Verify everything works:
@@ -97,26 +97,33 @@ uv run mypy .
## Testing

- All new functionality must have corresponding tests
- Maintain code coverage at or above 80%
- Maintain code coverage at or above **85%**
- Tests live in `tests/` for the core package and `plugins/*/tests/` for plugins
- Coverage configuration is in `pyproject.toml` under `[tool.coverage.run]` and `[tool.coverage.report]` -- models, vision/audio/multimodal metrics, and other non-logic files are excluded from measurement

Run tests:

```bash
# Full test suite across Python versions
uv run tox

# Quick single-version test
uv run pytest tests/ -v --cov=fasteval
# Quick single-version test with coverage
uv run --group test pytest tests/ --cov=fasteval --cov-report=term -v

# Run a specific test
uv run pytest tests/test_example.py::test_name -v
uv run --group test pytest tests/test_example.py::test_name -v

# Run plugin tests (from plugin directory)
cd plugins/fasteval-langgraph
uv run pytest tests/ -v
```

> **Note**: The project includes a custom pytest plugin (`fasteval.testing.plugin`). When running tests with coverage, the plugin is automatically disabled via `addopts` in `pyproject.toml` (`-p no:fasteval`) to ensure accurate coverage tracking.

## Pull Request Process

1. Ensure all tests pass and linting is clean.
2. Update documentation if your change affects user-facing behavior (see `docs/`).
2. Update documentation if your change affects user-facing behavior. Docs are published at [fasteval.io](https://fasteval.io) and source lives in `docs/`.
3. Open a pull request against `main` with a clear description of your changes.
4. A maintainer will review your PR, typically within a few business days.
5. Once approved, a maintainer will merge your contribution.
@@ -129,6 +136,14 @@ uv run pytest tests/test_example.py::test_name -v
- Adherence to the existing code style
- Clear, focused commits (one logical change per commit)

### Writing Custom Metrics

If you're contributing a new metric, see the [Custom Metrics guide](https://fasteval.io/docs/advanced/custom-metrics) for the expected patterns. All metrics should:
- Extend `Metric` (deterministic) or `BaseLLMMetric` (LLM-based)
- Include a corresponding decorator in `fasteval/core/decorators.py`
- Be registered in `METRIC_REGISTRY` in `fasteval/core/evaluator.py`
- Have tests with >85% coverage

## Project Structure

```
@@ -137,16 +152,18 @@ fasteval/
├── metrics/ # Metric implementations (LLM, deterministic, conversation)
├── models/ # Pydantic models (EvalInput, EvalResult, MetricResult)
├── providers/ # LLM provider clients (OpenAI, Anthropic)
├── cache/ # Caching utilities
├── cache/ # In-memory LRU caching
├── collectors/ # Result collection and reporting
│ └── reporters/ # Output reporters (JSON, HTML)
├── utils/ # Helpers (formatting, JSON parsing, async)
└── testing/ # pytest plugin
└── testing/ # pytest plugin (--fe-output, --fe-summary, --no-interactive)

plugins/
├── fasteval-langfuse/ # Langfuse production trace evaluation
├── fasteval-langgraph/ # LangGraph agent testing
└── fasteval-observe/ # Runtime monitoring

docs/ # MDX documentation
docs/ # MDX documentation (published at fasteval.io)
tests/ # Core package tests
```

114 changes: 93 additions & 21 deletions README.md
@@ -7,19 +7,34 @@
![Python versions](https://img.shields.io/badge/python-3.10_|_3.11_|_3.12_|_3.13_|_3.14-blue?logo=python)
[![CI](https://github.com/intuit/fasteval/actions/workflows/ci.yml/badge.svg)](https://github.com/intuit/fasteval/actions/workflows/ci.yml)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Docs](https://img.shields.io/badge/docs-fasteval.io-blue)](https://fasteval.io)

A **decorator-first LLM evaluation library** for testing AI agents and LLMs. Stack decorators to define evaluation criteria, run with pytest.
A **decorator-first LLM evaluation library** for testing AI agents and LLMs. Stack decorators to define evaluation criteria, then run with pytest. [Read the docs](https://fasteval.io/docs).

<p align="center">
<img src="hero-evaluation-journey.png" alt="The Evaluation Journey -- from non-deterministic LLM outputs to reliable engineering metrics" width="800">
</p>

## Features

- **Decorator-based metrics** -- stack `@fe.correctness`, `@fe.relevance`, `@fe.hallucination`, and 30+ more
- **50+ built-in metrics** -- stack `@fe.correctness`, `@fe.relevance`, `@fe.hallucination`, and more
- **pytest native** -- run evaluations with `pytest`, get familiar pass/fail output
- **LLM-as-judge + deterministic** -- semantic LLM metrics alongside ROUGE, exact match, JSON schema, regex
- **Custom criteria** -- `@fe.criteria("Is the response empathetic?")` for any evaluation you can describe in plain English
- **Multi-modal** -- evaluate vision, audio, and image generation models
- **Conversation metrics** -- context retention, topic drift, consistency for multi-turn agents
- **RAG metrics** -- faithfulness, contextual precision, contextual recall, answer correctness
- **Tool trajectory** -- verify agent tool calls, argument matching, call sequences
- **Pluggable providers** -- OpenAI (default), Anthropic, Azure OpenAI, Ollama
- **Reusable metric stacks** -- `@fe.stack()` to compose and reuse metric sets across tests
- **Human-in-the-loop** -- `@fe.human_review()` for manual review alongside automated metrics
- **Data-driven testing** -- `@fe.csv("test_data.csv")` to load test cases from CSV files
- **Pluggable providers** -- OpenAI (default), Anthropic, or bring your own `LLMClient`

## How It Works

<p align="center">
<img src="fasteval-overview.png" alt="How fasteval works -- Decorate, Test, Score, Evaluate, Result" width="800">
</p>

## Quick Start

@@ -96,6 +111,23 @@ def test_summary_quality():
fe.score(actual_output=summary, expected_output=reference)
```

### Custom Criteria

```python
@fe.criteria("Is the response empathetic and professional?")
def test_tone():
response = agent("I'm frustrated with this product!")
fe.score(response)

@fe.criteria(
"Does the response include a legal disclaimer?",
threshold=0.9,
)
def test_compliance():
response = agent("Can I break my lease?")
fe.score(response)
```

### RAG Evaluation

```python
@@ -117,45 +149,79 @@ def test_rag_pipeline():
def test_agent_tools():
result = agent.run("Book a flight to Paris")
fe.score(
actual_tools=result.tool_calls,
result.response,
tool_calls=result.tool_calls,
expected_tools=[
{"name": "search_flights", "args": {"destination": "Paris"}},
{"name": "book_flight"},
],
)
```

### Multi-Turn Conversations

```python
@fe.context_retention(threshold=0.8)
@fe.conversation([
{"query": "My name is Alice and I'm a vegetarian"},
{"query": "Suggest a restaurant for me"},
{"query": "What dietary restriction should they accommodate?"},
])
async def test_memory(query, expected, history):
response = await agent(query, history=history)
fe.score(response, input=query, history=history)
```

### Metric Stacks

```python
# Define a reusable metric stack
@fe.stack()
@fe.correctness(threshold=0.8, weight=2.0)
@fe.relevance(threshold=0.7, weight=1.0)
@fe.coherence(threshold=0.6, weight=1.0)
def test_comprehensive():
def quality_metrics():
pass

# Apply to multiple tests
@quality_metrics
def test_chatbot():
response = agent("Explain quantum computing")
fe.score(response, expected_output=reference_answer, input="Explain quantum computing")

@quality_metrics
def test_summarizer():
summary = summarize(long_article)
fe.score(summary, expected_output=reference_summary)
```

## Plugins

| Plugin | Description | Install |
|--------|-------------|---------|
| [fasteval-langfuse](./plugins/fasteval-langfuse/) | Evaluate Langfuse production traces with fasteval metrics | `pip install fasteval-langfuse` |
| [fasteval-langgraph](./plugins/fasteval-langgraph/) | Test harness for LangGraph agents | `pip install fasteval-langgraph` |
| [fasteval-observe](./plugins/fasteval-observe/) | Runtime monitoring with async sampling | `pip install fasteval-observe` |
| [fasteval-langfuse](https://fasteval.io/docs/plugins/langfuse/overview) | Evaluate Langfuse production traces with fasteval metrics | `pip install fasteval-langfuse` |
| [fasteval-langgraph](https://fasteval.io/docs/plugins/langgraph/overview) | Test harness for LangGraph agents | `pip install fasteval-langgraph` |
| [fasteval-observe](https://fasteval.io/docs/plugins/observe/overview) | Runtime monitoring with async sampling | `pip install fasteval-observe` |

<p align="center">
<img src="testing-pyramid-agents.png" alt="Testing Pyramid for Agents -- layered testing strategy with fasteval-langgraph" width="700">
</p>

## Local Development

```bash
# Install uv
brew install uv

# Create virtual environment and install dependencies
uv sync --all-extras
# Create virtual environment and install all dependencies
uv sync --all-extras --group dev --group test

# Run the test suite
uv run tox

# Run tests with coverage
uv run pytest tests/ --cov=fasteval --cov-report=term -v

# Format code
uv run black .
uv run isort .
@@ -166,17 +232,23 @@ uv run mypy .

## Documentation

Full documentation is available in the [docs/](./docs/) directory, covering:

- [Getting Started](./docs/getting-started/) -- installation, quickstart
- [Core Concepts](./docs/core-concepts/) -- decorators, metrics, scoring, data sources
- [LLM Metrics](./docs/llm-metrics/) -- correctness, relevance, hallucination, and more
- [Deterministic Metrics](./docs/deterministic-metrics/) -- ROUGE, exact match, regex, JSON schema
- [RAG Metrics](./docs/rag-metrics/) -- faithfulness, contextual precision/recall
- [Conversation Metrics](./docs/conversation-metrics/) -- context retention, consistency
- [Multi-Modal](./docs/multimodal/) -- vision, audio, image generation evaluation
- [Plugins](./docs/plugins/) -- Langfuse, LangGraph, Observe
- [API Reference](./docs/api-reference/) -- decorators, evaluator, models, score
Full documentation is available at **[fasteval.io](https://fasteval.io)**.

- [Getting Started](https://fasteval.io/docs/getting-started/quickstart) -- installation and quickstart guide
- [Why FastEval](https://fasteval.io/docs/getting-started/introduction/why-fasteval) -- motivation and design philosophy
- [Core Concepts](https://fasteval.io/docs/core-concepts/decorators) -- decorators, metrics, scoring, data sources
- [Concepts](https://fasteval.io/docs/concepts/llm-as-judge) -- LLM-as-judge, scoring thresholds, evaluation strategies
- [LLM Metrics](https://fasteval.io/docs/llm-metrics/correctness) -- correctness, relevance, hallucination, and more
- [Deterministic Metrics](https://fasteval.io/docs/deterministic-metrics/exact-match) -- ROUGE, exact match, regex, JSON schema
- [RAG Metrics](https://fasteval.io/docs/rag-metrics/faithfulness) -- faithfulness, contextual precision/recall
- [Tool Trajectory](https://fasteval.io/docs/tool-tranjectory-metrics/tool-call-accuracy) -- tool call accuracy, sequence, argument matching
- [Conversation Metrics](https://fasteval.io/docs/conversation-metrics/context-retention) -- context retention, consistency, topic drift
- [Multi-Modal](https://fasteval.io/docs/multimodal/overview) -- vision, audio, image generation evaluation
- [Human Review](https://fasteval.io/docs/human-review/overview) -- human-in-the-loop evaluation
- [Cookbooks](https://fasteval.io/docs/cookbooks/rag-pipeline) -- RAG pipelines, CI/CD setup, prompt regression, production monitoring
- [Plugins](https://fasteval.io/docs/plugins/langfuse/overview) -- Langfuse, LangGraph, Observe
- [Advanced](https://fasteval.io/docs/advanced/custom-metrics) -- custom metrics, providers, output collectors, traces
- [API Reference](https://fasteval.io/docs/api-reference/decorators) -- decorators, evaluator, models, score

## Contributing

4 changes: 2 additions & 2 deletions docs/assets/fasteval-overview.svg
Binary file added fasteval-overview.png
Binary file added hero-evaluation-journey.png