diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 459499d..f5383b9 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,19 +1,104 @@
 name: Performance Benchmarks
-on: [push, pull_request]
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
 
 jobs:
   benchmark:
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
     steps:
-      - uses: actions/checkout@v4
-      - uses: dtolnay/rust-toolchain@stable
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Cache Rust dependencies
+        uses: Swatinem/rust-cache@v2
+
       - name: Build benchmark tool
-        run: cargo build --release --bin string-pipeline-bench
+        run: cargo build --release --bin bench_throughput
+
       - name: Run benchmarks
         run: |
-          ./target/release/string-pipeline-bench --iterations 5000 > benchmark_results.txt
-      - name: Upload results
+          # Run benchmarks with multiple sizes and save to JSON
+          ./target/release/bench_throughput \
+            --sizes 100,1000,10000 \
+            --iterations 50 \
+            --format json \
+            --output benchmark_results.json
+
+      - name: Download baseline benchmark
+        id: download-baseline
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          workflow: benchmark.yml
+          branch: main
+          name: benchmark-baseline
+          path: baseline
+          if_no_artifact_found: warn
+
+      - name: Compare with baseline
+        id: compare
+        run: |
+          if [ -f baseline/benchmark_results.json ]; then
+            echo "Baseline found, comparing results..."
+            python3 scripts/compare_benchmarks.py \
+              baseline/benchmark_results.json \
+              benchmark_results.json > comparison.md
+            echo "comparison_available=true" >> $GITHUB_OUTPUT
+          else
+            echo "No baseline found, this will become the new baseline"
+            echo "comparison_available=false" >> $GITHUB_OUTPUT
+            printf '## Benchmark Results\n\nNo baseline available for comparison. These results will be used as the baseline for future comparisons.\n' > comparison.md
+          fi
+
+      - name: Comment PR with results
+        if: github.event_name == 'pull_request' && steps.compare.outputs.comparison_available == 'true'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const comparison = fs.readFileSync('comparison.md', 'utf8');
+
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: comparison
+            });
+
+      - name: Upload current results as artifact
         uses: actions/upload-artifact@v4
         with:
-          name: benchmark-results
+          name: benchmark-current
+          path: |
+            benchmark_results.json
+            comparison.md
+
+      - name: Upload as baseline (main branch only)
+        if: github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-baseline
           path: benchmark_results.json
+          retention-days: 90
+
+      - name: Fail if significant performance regression
+        if: steps.compare.outputs.comparison_available == 'true'
+        run: |
+          if grep -q "⚠️ PERFORMANCE REGRESSION" comparison.md; then
+            echo "::warning::Performance regression detected. Review comparison.md for details."
+ # Uncomment the next line to fail the build on regression + # exit 1 + fi diff --git a/.gitignore b/.gitignore index ea8c4bf..7a9e023 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,7 @@ /target + +# Benchmark results +bench_results.json +benchmark_results.json +benchmark_results.txt +comparison.md diff --git a/Cargo.lock b/Cargo.lock index e3af432..211cbad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -53,7 +53,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -64,7 +64,7 @@ checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -189,6 +189,17 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "comfy-table" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b03b7db8e0b4b2fdad6c551e634134e99ec000e5c8c3b6856c65e8bbaded7a3b" +dependencies = [ + "crossterm 0.29.0", + "unicode-segmentation", + "unicode-width", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -256,6 +267,45 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags", + "crossterm_winapi", + "mio", + "parking_lot", + "rustix 0.38.44", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags", + "crossterm_winapi", + "document-features", + "parking_lot", + "rustix 1.0.7", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.3" @@ -296,6 +346,15 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "either" version = "1.15.0" @@ -309,7 +368,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -346,7 +405,7 @@ dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasi 0.14.2+wasi-0.2.4", ] [[package]] @@ -423,12 +482,24 @@ version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.13" @@ -451,6 +522,18 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "mio" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" +dependencies = [ + "libc", + "log", + "wasi 0.11.1+wasi-snapshot-preview1", + "windows-sys 0.61.2", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -662,6 +745,19 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88f8660c1ff60292143c98d08fc6e2f654d722db50410e3f3797d40baaf9d8f3" +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -671,8 +767,8 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", - "windows-sys", + "linux-raw-sys 0.9.4", + "windows-sys 0.59.0", ] [[package]] @@ -745,6 +841,36 @@ dependencies = [ "digest", ] +[[package]] +name = "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + [[package]] name = "smallvec" version = "1.15.1" @@ -757,7 +883,9 @@ version = "0.13.0" dependencies = [ "clap", "clap_mangen", + "comfy-table", "criterion", + "crossterm 0.28.1", "dashmap", "fast-strip-ansi", "memchr", @@ -766,8 +894,11 @@ dependencies = [ "pest", "pest_derive", "regex", + "serde", + "serde_json", "smallvec", "tempfile", + "unicode-width", ] [[package]] @@ -796,8 +927,8 @@ dependencies = [ "fastrand", "getrandom", "once_cell", - "rustix", - "windows-sys", + "rustix 1.0.7", + "windows-sys 0.59.0", ] [[package]] @@ -848,6 +979,18 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "utf8parse" version = "0.2.2" @@ -880,6 +1023,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -957,15 +1106,43 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-sys" version = "0.59.0" @@ -975,6 +1152,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" diff --git a/Cargo.toml b/Cargo.toml index 772ee29..a9489f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,11 @@ parking_lot = "0.12.3" dashmap = "6.1.0" smallvec = "1.15.0" memchr = "2.7.4" +crossterm = "0.28" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +comfy-table = "7.1" +unicode-width = "0.2" [build-dependencies] clap = { version = "4.5.39", features = ["derive", "cargo"] } @@ -38,6 +43,11 @@ path = "src/main.rs" name = "string-pipeline-bench" path = "src/bin/bench.rs" +[[bin]] +bench = false +name = "bench_throughput" +path = "src/bin/bench_throughput.rs" + [profile.staging] inherits = "dev" opt-level = 3 diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..4061422 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,203 @@ +# Benchmark CI/CD Scripts + +This directory contains scripts used by the GitHub Actions CI/CD pipeline to track and compare performance benchmarks. + +## Overview + +The benchmark CI/CD system automatically: +1. Runs performance benchmarks on every push to `main` and on pull requests +2. Compares results against the baseline (last `main` branch results) +3. Generates a detailed comparison report +4. Comments on PRs with performance changes +5. 
Warns about significant performance regressions
+
+## Files
+
+### `compare_benchmarks.py`
+
+Python script that compares two benchmark JSON files and generates a markdown report.
+
+**Usage:**
+```bash
+python3 scripts/compare_benchmarks.py baseline.json current.json > report.md
+```
+
+**Features:**
+- Detects performance regressions (>10% slower, with warnings at 5-10%)
+- Highlights improvements (>5% faster)
+- Compares avg/path latency, p95, and throughput
+- Color-coded indicators:
+  - 🟢 Significant improvement (>5% faster)
+  - ✅ Improvement (2-5% faster)
+  - ➖ Neutral (<2% change)
+  - 🟡 Caution (2-5% slower)
+  - ⚠️ Warning (5-10% slower)
+  - 🔴 Regression (>10% slower)
+
+## GitHub Actions Workflow
+
+The benchmark workflow (`.github/workflows/benchmark.yml`) runs automatically on:
+- Pushes to `main` branch
+- Pull requests
+
+### Workflow Steps
+
+1. **Build** - Compiles the `bench_throughput` tool in release mode
+2. **Run Benchmarks** - Executes benchmarks with multiple input sizes (100, 1K, 10K paths)
+3. **Download Baseline** - Fetches the last benchmark from `main` branch
+4. **Compare** - Runs the comparison script
+5. **Comment on PR** - Posts results as a comment on pull requests
+6. **Upload Artifacts** - Stores results for historical tracking
+7. **Update Baseline** - Saves results as new baseline (main branch only)
+8. **Check Regressions** - Warns if significant regressions detected
+
+### Artifacts
+
+The workflow stores two artifacts:
+
+1. **benchmark-current** - Current run results (JSON results and markdown comparison)
+   - Retained for the repository's default artifact retention period
+   - Available for download from workflow runs
+
+2. **benchmark-baseline** - Baseline for comparison
+   - Updated only on `main` branch pushes
+   - Retained for 90 days
+   - Used for comparing future PRs
+
+## Running Benchmarks Locally
+
+### Run benchmarks and save to JSON:
+```bash
+cargo build --release --bin bench_throughput
+
+./target/release/bench_throughput \
+  --sizes 100,1000,10000 \
+  --iterations 50 \
+  --format json \
+  --output my_benchmark.json
+```
+
+### Compare two benchmark runs:
+```bash
+python3 scripts/compare_benchmarks.py \
+  baseline_benchmark.json \
+  my_benchmark.json > comparison.md
+
+# View the report
+cat comparison.md
+```
+
+## Configuration
+
+### Benchmark Parameters
+
+Default parameters in the CI workflow:
+- **Input sizes:** 100, 1,000, 10,000 paths
+- **Iterations:** 50 (per size)
+- **Output format:** JSON + human-readable text
+
+To change these, edit `.github/workflows/benchmark.yml`:
+```bash
+# Add more sizes, or raise the iteration count for more stable results
+./target/release/bench_throughput \
+  --sizes 100,1000,10000,100000 \
+  --iterations 100 \
+  --format json \
+  --output benchmark_results.json
+```
+
+### Regression Thresholds
+
+The comparison script uses these thresholds:
+
+| Change | Classification | Emoji |
+|--------|---------------|-------|
+| >5% faster | Significant improvement | 🟢 |
+| 2-5% faster | Improvement | ✅ |
+| <2% change | Neutral (noise) | ➖ |
+| 2-5% slower | Caution | 🟡 |
+| 5-10% slower | Warning | ⚠️ |
+| >10% slower | Regression | 🔴 |
+
+To adjust thresholds, edit `scripts/compare_benchmarks.py`:
+```python
+def calculate_change(baseline: float, current: float):
+    # Modify these values:
+    if abs(change_pct) < 2:  # Noise threshold
+        ...
+    elif change_pct < -5:  # Improvement threshold
+        ...
+    elif change_pct > 10:  # Regression threshold
+        ...
+```
+
+### Failing on Regressions
+
+By default, the workflow **warns** about regressions but doesn't fail the build.
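+
+You can reproduce this check locally before pushing (a minimal sketch; it assumes
+you have already generated `comparison.md` as shown in "Running Benchmarks Locally"):
+
+```bash
+# Mirrors the CI step: the header below only appears when a >10% regression is found
+if grep -q "PERFORMANCE REGRESSION" comparison.md; then
+    echo "Regression detected - review comparison.md before opening a PR"
+fi
+```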
+ +To fail on regressions, uncomment this line in `.github/workflows/benchmark.yml`: +```yaml +- name: Fail if significant performance regression + run: | + if grep -q "⚠ïļ PERFORMANCE REGRESSION" comparison.md; then + echo "::warning::Performance regression detected." + exit 1 # Uncomment this line + fi +``` + +## Troubleshooting + +### No baseline found +On the first run, there's no baseline for comparison. The first successful run on `main` will establish the baseline. + +### Benchmark variance +Benchmarks can vary due to: +- CI runner load +- Background processes +- Network conditions + +The 2% noise threshold accounts for normal variance. For more stable results: +1. Increase iteration count +2. Run benchmarks multiple times +3. Use larger input sizes (less affected by noise) + +### Permission errors +The workflow needs these permissions (already configured): +```yaml +permissions: + contents: write + pull-requests: write +``` + +## Example Report + +```markdown +# 📊 Benchmark Comparison Report + +**Input Size:** 10,000 paths +**Baseline Timestamp:** 1699123456 +**Current Timestamp:** 1699123789 + +## Performance Comparison + +| Template | Avg/Path | Change | p99 | Change | Throughput | Change | +|----------|----------|--------|-----|--------|------------|--------| +| Strip ANSI | 304ns | ✅ -3.2% | 327ns | ➖ -1.1% | 3.29M/s | ✅ +3.3% | +| Split all | 519ns | ðŸ”ī +12.5% | 838ns | ⚠ïļ +8.2% | 1.93M/s | ðŸ”ī -11.1% | + +## Summary + +- **Total templates compared:** 28 +- **Improvements:** 5 ðŸŸĒ +- **Regressions:** 2 ðŸ”ī +- **Neutral:** 21 ➖ + +### ⚠ïļ PERFORMANCE REGRESSIONS + +- **Split all**: +12.5% slower +``` + +## Further Reading + +- [Benchmark Tool Documentation](../src/bin/bench_throughput.rs) +- [GitHub Actions Documentation](https://docs.github.com/en/actions) +- [Rust Benchmarking Best Practices](https://nnethercote.github.io/perf-book/benchmarking.html) diff --git a/scripts/compare_benchmarks.py b/scripts/compare_benchmarks.py new file mode 100755 index 0000000..c2c2ec2 --- /dev/null +++ b/scripts/compare_benchmarks.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Compare benchmark results and generate a markdown report. +Detects performance regressions and improvements. 
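+
+Expects two JSON files produced by `bench_throughput --format json` (baseline
+first, current second) and writes the markdown report to stdout, e.g.:
+
+    python3 scripts/compare_benchmarks.py baseline.json current.json > comparison.md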
+""" + +import json +import sys +from typing import Dict, List, Tuple +from pathlib import Path + + +def format_duration_ns(ns: int) -> str: + """Format nanoseconds to human-readable duration.""" + if ns < 1_000: + return f"{ns}ns" + elif ns < 1_000_000: + return f"{ns / 1_000:.2f}Ξs" + elif ns < 1_000_000_000: + return f"{ns / 1_000_000:.2f}ms" + else: + return f"{ns / 1_000_000_000:.2f}s" + + +def format_throughput(paths_per_sec: float) -> str: + """Format throughput to human-readable format.""" + if paths_per_sec >= 1_000_000: + return f"{paths_per_sec / 1_000_000:.2f}M/s" + elif paths_per_sec >= 1_000: + return f"{paths_per_sec / 1_000:.2f}K/s" + else: + return f"{paths_per_sec:.2f}/s" + + +def calculate_change(baseline: float, current: float) -> Tuple[float, str]: + """Calculate percentage change and return emoji indicator.""" + if baseline == 0: + return 0.0, "➖" + + change_pct = ((current - baseline) / baseline) * 100 + + # For latency metrics (lower is better) + if abs(change_pct) < 2: # Less than 2% change is noise + emoji = "➖" + elif change_pct < -5: # >5% faster is significant improvement + emoji = "ðŸŸĒ" + elif change_pct < -2: # 2-5% faster is improvement + emoji = "✅" + elif change_pct > 10: # >10% slower is regression + emoji = "ðŸ”ī" + elif change_pct > 5: # 5-10% slower is warning + emoji = "⚠ïļ" + else: # 2-5% slower is caution + emoji = "ðŸŸĄ" + + return change_pct, emoji + + +def load_benchmark_results(filepath: str) -> Dict: + """Load benchmark results from JSON file.""" + with open(filepath, 'r') as f: + return json.load(f) + + +def compare_benchmarks(baseline_path: str, current_path: str) -> str: + """Compare two benchmark results and generate markdown report.""" + baseline = load_benchmark_results(baseline_path) + current = load_benchmark_results(current_path) + + # Build lookup dictionaries for easier comparison + baseline_results = {} + for bench in baseline['benchmarks']: + template_name = bench['template_name'] + # Get the largest input size result + if bench['results']: + baseline_results[template_name] = bench['results'][-1] + + current_results = {} + for bench in current['benchmarks']: + template_name = bench['template_name'] + if bench['results']: + current_results[template_name] = bench['results'][-1] + + # Generate report + report = [] + report.append("# 📊 Benchmark Comparison Report\n") + + # Get input size from first template + input_size = 0 + if current['benchmarks'] and current['benchmarks'][0]['results']: + input_size = current['benchmarks'][0]['results'][-1]['input_size'] + + report.append(f"**Input Size:** {input_size:,} paths\n") + report.append(f"**Baseline Timestamp:** {baseline.get('timestamp', 'unknown')}") + report.append(f"**Current Timestamp:** {current.get('timestamp', 'unknown')}\n") + + # Summary statistics + regressions = [] + improvements = [] + neutral = [] + + # Build comparison table + report.append("## Performance Comparison\n") + report.append("| Template | Avg/Path | Change | p95 | Change | Throughput | Change |") + report.append("|----------|----------|--------|-----|--------|------------|--------|") + + # Sort by template name for consistent ordering + all_templates = sorted(set(baseline_results.keys()) | set(current_results.keys())) + + for template_name in all_templates: + if template_name not in baseline_results or template_name not in current_results: + continue # Skip if not in both sets + + base = baseline_results[template_name] + curr = current_results[template_name] + + # Compare avg time per path + base_avg_ns = 
base['avg_time_per_path'] + curr_avg_ns = curr['avg_time_per_path'] + avg_change, avg_emoji = calculate_change(base_avg_ns, curr_avg_ns) + + # Compare p95 + base_p95 = base['latency_stats']['p95'] + curr_p95 = curr['latency_stats']['p95'] + p95_change, p95_emoji = calculate_change(base_p95, curr_p95) + + # Compare throughput (higher is better, so invert the change) + base_throughput = base['throughput_paths_per_sec'] + curr_throughput = curr['throughput_paths_per_sec'] + throughput_change = ((curr_throughput - base_throughput) / base_throughput) * 100 + # Invert emoji logic for throughput + if abs(throughput_change) < 2: + throughput_emoji = "➖" + elif throughput_change > 5: + throughput_emoji = "ðŸŸĒ" + elif throughput_change > 2: + throughput_emoji = "✅" + elif throughput_change < -10: + throughput_emoji = "ðŸ”ī" + elif throughput_change < -5: + throughput_emoji = "⚠ïļ" + elif throughput_change < -2: + throughput_emoji = "ðŸŸĄ" + else: + throughput_emoji = "➖" + + # Track regressions/improvements based on avg latency + if avg_change > 10: + regressions.append((template_name, avg_change)) + elif avg_change < -5: + improvements.append((template_name, avg_change)) + else: + neutral.append(template_name) + + # Format table row + report.append( + f"| {template_name} " + f"| {format_duration_ns(curr_avg_ns)} " + f"| {avg_emoji} {avg_change:+.1f}% " + f"| {format_duration_ns(curr_p95)} " + f"| {p95_emoji} {p95_change:+.1f}% " + f"| {format_throughput(curr_throughput)} " + f"| {throughput_emoji} {throughput_change:+.1f}% |" + ) + + report.append("") + + # Summary section + report.append("## Summary\n") + report.append(f"- **Total templates compared:** {len(all_templates)}") + report.append(f"- **Improvements:** {len(improvements)} ðŸŸĒ") + report.append(f"- **Regressions:** {len(regressions)} ðŸ”ī") + report.append(f"- **Neutral:** {len(neutral)} ➖\n") + + # Highlight significant changes + if regressions: + report.append("### ⚠ïļ PERFORMANCE REGRESSIONS\n") + for template, change in sorted(regressions, key=lambda x: x[1], reverse=True): + report.append(f"- **{template}**: {change:+.1f}% slower") + report.append("") + + if improvements: + report.append("### âœĻ Performance Improvements\n") + for template, change in sorted(improvements, key=lambda x: x[1]): + report.append(f"- **{template}**: {abs(change):.1f}% faster") + report.append("") + + # Legend + report.append("---\n") + report.append("### Legend") + report.append("- ðŸŸĒ Significant improvement (>5% faster)") + report.append("- ✅ Improvement (2-5% faster)") + report.append("- ➖ Neutral (<2% change)") + report.append("- ðŸŸĄ Caution (2-5% slower)") + report.append("- ⚠ïļ Warning (5-10% slower)") + report.append("- ðŸ”ī Regression (>10% slower)") + + return "\n".join(report) + + +def main(): + if len(sys.argv) != 3: + print("Usage: compare_benchmarks.py ") + sys.exit(1) + + baseline_path = sys.argv[1] + current_path = sys.argv[2] + + if not Path(baseline_path).exists(): + print(f"Error: Baseline file not found: {baseline_path}") + sys.exit(1) + + if not Path(current_path).exists(): + print(f"Error: Current file not found: {current_path}") + sys.exit(1) + + try: + report = compare_benchmarks(baseline_path, current_path) + print(report) + except Exception as e: + print(f"Error comparing benchmarks: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/bin/bench_throughput.rs b/src/bin/bench_throughput.rs new file mode 100644 index 0000000..8a8746d --- /dev/null +++ 
b/src/bin/bench_throughput.rs @@ -0,0 +1,952 @@ +use clap::{Arg, Command}; +use comfy_table::{ + Attribute as TableAttribute, Cell, Color as TableColor, ContentArrangement, Table, + presets::UTF8_FULL, +}; +use crossterm::{ + cursor, execute, queue, + style::{Attribute, Color, Print, ResetColor, SetAttribute, SetForegroundColor}, + terminal::{Clear, ClearType}, +}; +use serde::{Serialize, Serializer}; +use std::io::{self, Write}; +use std::time::{Duration, Instant}; +use string_pipeline::Template; +use unicode_width::UnicodeWidthStr; + +// Helper to serialize Duration as nanoseconds +fn serialize_duration(duration: &Duration, serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_u128(duration.as_nanos()) +} + +/// Represents the results of a throughput benchmark for a specific input size +#[derive(Debug, Clone, Serialize)] +struct BenchmarkResult { + input_size: usize, + #[serde(serialize_with = "serialize_duration")] + parse_time: Duration, + #[serde(serialize_with = "serialize_duration")] + total_format_time: Duration, + #[serde(serialize_with = "serialize_duration")] + avg_time_per_path: Duration, + throughput_paths_per_sec: f64, + parse_percentage: f64, + latency_stats: LatencyStatistics, +} + +/// Statistical analysis of latency distribution +#[derive(Debug, Clone, Serialize)] +struct LatencyStatistics { + #[serde(serialize_with = "serialize_duration")] + min: Duration, + #[serde(serialize_with = "serialize_duration")] + p50: Duration, + #[serde(serialize_with = "serialize_duration")] + p95: Duration, + #[serde(serialize_with = "serialize_duration")] + p99: Duration, + #[serde(serialize_with = "serialize_duration")] + max: Duration, + stddev: f64, + sample_count: usize, +} + +impl BenchmarkResult { + fn new( + input_size: usize, + parse_time: Duration, + total_format_time: Duration, + individual_times: Vec, + ) -> Self { + let avg_time_per_path = total_format_time / input_size as u32; + let throughput_paths_per_sec = input_size as f64 / total_format_time.as_secs_f64(); + let total_time = parse_time + total_format_time; + let parse_percentage = (parse_time.as_secs_f64() / total_time.as_secs_f64()) * 100.0; + + let latency_stats = Self::calculate_statistics(&individual_times); + + BenchmarkResult { + input_size, + parse_time, + total_format_time, + avg_time_per_path, + throughput_paths_per_sec, + parse_percentage, + latency_stats, + } + } + + fn calculate_statistics(times: &[Duration]) -> LatencyStatistics { + let sample_count = times.len(); + + if times.is_empty() { + return LatencyStatistics { + min: Duration::ZERO, + p50: Duration::ZERO, + p95: Duration::ZERO, + p99: Duration::ZERO, + max: Duration::ZERO, + stddev: 0.0, + sample_count: 0, + }; + } + + let mut sorted_times: Vec = times.to_vec(); + sorted_times.sort(); + + let min = sorted_times[0]; + let max = sorted_times[sorted_times.len() - 1]; + + // Nearest-rank percentile calculation: ceil(p * n) - 1 + let n = sorted_times.len() as f64; + let p50_idx = ((n * 0.50).ceil() as usize).saturating_sub(1); + let p95_idx = ((n * 0.95).ceil() as usize).saturating_sub(1); + let p99_idx = ((n * 0.99).ceil() as usize).saturating_sub(1); + + let p50 = sorted_times[p50_idx]; + let p95 = sorted_times[p95_idx]; + let p99 = sorted_times[p99_idx]; + + // Calculate standard deviation + let mean = times.iter().map(|d| d.as_nanos() as f64).sum::() / times.len() as f64; + let variance = times + .iter() + .map(|d| { + let diff = d.as_nanos() as f64 - mean; + diff * diff + }) + .sum::() + / times.len() as f64; + let stddev = 
variance.sqrt(); + + LatencyStatistics { + min, + p50, + p95, + p99, + max, + stddev, + sample_count, + } + } + + fn scaling_factor(&self, baseline: &BenchmarkResult) -> f64 { + let expected = self.input_size as f64 / baseline.input_size as f64; + let actual = + self.total_format_time.as_secs_f64() / baseline.total_format_time.as_secs_f64(); + actual / expected + } +} + +/// Generates realistic absolute path strings for benchmarking +struct PathGenerator { + directories: Vec<&'static str>, + filenames: Vec<&'static str>, + extensions: Vec<&'static str>, +} + +impl PathGenerator { + fn new() -> Self { + PathGenerator { + directories: vec![ + "home", + "usr", + "var", + "opt", + "etc", + "lib", + "bin", + "sbin", + "tmp", + "dev", + "projects", + "workspace", + "repos", + "src", + "tests", + "docs", + "config", + "data", + "cache", + "logs", + "build", + "dist", + "target", + "node_modules", + "vendor", + "components", + "services", + "models", + "controllers", + "views", + "utils", + ], + filenames: vec![ + "main", + "lib", + "index", + "app", + "server", + "client", + "config", + "utils", + "helper", + "handler", + "service", + "model", + "controller", + "router", + "middleware", + "test", + "spec", + "readme", + "license", + "changelog", + "makefile", + "dockerfile", + "package", + "cargo", + "mod", + "types", + "constants", + "errors", + "validation", + ], + extensions: vec![ + "rs", "txt", "md", "json", "toml", "yaml", "yml", "js", "ts", "py", "go", "c", + "cpp", "h", "sh", + ], + } + } + + /// Generate a single path with specified seed and depth + fn generate_path(&self, seed: usize, depth: usize) -> String { + let mut parts = vec![]; + + // Generate directory components + for i in 0..depth { + let idx = (seed + i * 7) % self.directories.len(); + parts.push(self.directories[idx]); + } + + // Add filename with extension + let filename_idx = (seed * 13) % self.filenames.len(); + let ext_idx = (seed * 17) % self.extensions.len(); + let filename = format!( + "{}.{}", + self.filenames[filename_idx], self.extensions[ext_idx] + ); + parts.push(&filename); + + format!("/{}", parts.join("/")) + } + + /// Generate N unique paths with varying depths + fn generate_paths(&self, count: usize) -> Vec { + (0..count) + .map(|i| { + let depth = 2 + (i % 9); // Depths from 2 to 10 + self.generate_path(i, depth) + }) + .collect() + } +} + +/// Comprehensive template set covering all operations and real-world use cases +struct TemplateSet; + +impl TemplateSet { + fn get_templates() -> Vec<(&'static str, &'static str)> { + vec![ + // Core individual operations + ("Split all", "{split:/:..}"), + ("Split last index", "{split:/:-1}"), + ("Join", "{split:/:..|join:/}"), + ("Upper", "{split:/:-1|upper}"), + ("Lower", "{split:/:-1|lower}"), + ("Trim", "{split:/:-1|trim}"), + ("Replace simple", "{replace:s/\\.txt$/.md/}"), + ("Replace complex", "{replace:s/\\/\\/+/\\//g}"), + ("Substring", "{split:/:-1|substring:0..10}"), + ("Reverse", "{split:/:-1|reverse}"), + ("Strip ANSI", "{strip_ansi}"), + ("Filter", "{split:/:..|filter:^[a-z]|join:/}"), + ("Sort", "{split:/:..|sort|join:/}"), + ("Unique", "{split:/:..|unique|join:/}"), + ("Pad", "{split:/:-1|pad:50: :right}"), + // Real-world path templates (television use cases) + ("Extract filename", "{split:/:-1}"), + ("Extract directory", "{split:/:0..-1|join:/}"), + ("Basename no ext", "{split:/:-1|split:.:0}"), + ("File extension", "{split:/:-1|split:.:-1}"), + ("Regex extract filename", "{regex_extract:[^/]+$}"), + ( + "Uppercase all components", + 
"{split:/:..|map:{upper}|join:/}", + ), + ("Remove hidden dirs", "{split:/:..|filter_not:^\\.|join:/}"), + ("Normalize filename", "{split:/:-1|trim|lower}"), + ("Slug generation", "{replace:s/ /_/g|lower}"), + ("Breadcrumb last 3", "{split:/:..|slice:-3..|join: > }"), + // Complex chains + ("Chain: trim+upper+pad", "{split:/:-1|trim|upper|pad:20}"), + ( + "Chain: split+filter+sort+join", + "{split:/:..|filter:^[a-z]|sort|join:-}", + ), + ( + "Chain: map complex", + "{split:/:..|map:{trim|lower|replace:s/_/-/g}|join:/}", + ), + ] + } +} + +/// Runs a benchmark for a single template with varying input sizes +fn benchmark_template( + _template_name: &str, + template_str: &str, + sizes: &[usize], + iterations: usize, +) -> Result, Box> { + let generator = PathGenerator::new(); + let mut results = Vec::new(); + + // Parse template N times and average + let mut total_parse_time = Duration::ZERO; + for _ in 0..iterations { + let parse_start = Instant::now(); + let _ = Template::parse(template_str)?; + total_parse_time += parse_start.elapsed(); + } + let avg_parse_time = total_parse_time / iterations as u32; + + // Parse once for actual use + let template = Template::parse(template_str)?; + + for &size in sizes { + // Generate N paths for this size + let paths = generator.generate_paths(size); + + // Warmup: format all paths once + for path in &paths { + let _ = template.format(path)?; + } + + // Measure: time complete iterations, calculate avg per-path for each iteration + let mut iteration_total_times = Vec::new(); + let mut iteration_avg_times = Vec::new(); + + for _ in 0..iterations { + let iteration_start = Instant::now(); + for path in &paths { + let _ = template.format(path)?; + } + let iteration_time = iteration_start.elapsed(); + iteration_total_times.push(iteration_time); + + // Calculate average time per path for this iteration (for statistics) + let avg_per_path = iteration_time / size as u32; + iteration_avg_times.push(avg_per_path); + } + + // Calculate average total time across all iterations + let total_duration: Duration = iteration_total_times.iter().sum(); + let avg_format_time = total_duration / iterations as u32; + + let result = + BenchmarkResult::new(size, avg_parse_time, avg_format_time, iteration_avg_times); + + results.push(result); + } + + Ok(results) +} + +fn format_duration(duration: Duration) -> String { + let nanos = duration.as_nanos(); + if nanos < 1_000 { + format!("{nanos}ns") + } else if nanos < 1_000_000 { + format!("{:.2}Ξs", nanos as f64 / 1_000.0) + } else if nanos < 1_000_000_000 { + format!("{:.2}ms", nanos as f64 / 1_000_000.0) + } else { + format!("{:.2}s", duration.as_secs_f64()) + } +} + +fn format_throughput(paths_per_sec: f64) -> String { + if paths_per_sec >= 1_000_000.0 { + format!("{:.2}M/s", paths_per_sec / 1_000_000.0) + } else if paths_per_sec >= 1_000.0 { + format!("{:.2}K/s", paths_per_sec / 1_000.0) + } else { + format!("{:.2}/s", paths_per_sec) + } +} + +fn format_size(size: usize) -> String { + if size >= 1_000_000 { + format!("{}M", size / 1_000_000) + } else if size >= 1_000 { + format!("{}K", size / 1_000) + } else { + size.to_string() + } +} + +// Styled output helpers +fn print_header(text: &str) { + let mut stdout = io::stdout(); + let text_width = text.width(); + let _ = execute!( + stdout, + SetForegroundColor(Color::Cyan), + SetAttribute(Attribute::Bold), + Print("╔"), + Print("═".repeat(78)), + Print("╗\n║ "), + Print(text), + Print(" ".repeat(77 - text_width)), + Print("║\n╚"), + Print("═".repeat(78)), + Print("╝\n"), + 
ResetColor + ); +} + +fn print_section_header(text: &str) { + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + Print("\n"), + SetForegroundColor(Color::Cyan), + SetAttribute(Attribute::Bold), + Print(text), + ResetColor, + Print("\n"), + SetForegroundColor(Color::DarkGrey), + Print("─".repeat(80)), + ResetColor + ); +} + +fn print_error(msg: &str) { + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + SetForegroundColor(Color::Red), + Print("✗ "), + ResetColor, + Print(msg), + Print("\n") + ); +} + +fn print_progress_bar(current: usize, total: usize, template_name: &str) { + let mut stdout = io::stdout(); + let progress = (current as f64 / total as f64) * 100.0; + let filled = ((progress / 100.0) * 40.0) as usize; + let _ = queue!( + stdout, + cursor::MoveToColumn(0), + Clear(ClearType::CurrentLine), + SetForegroundColor(Color::Cyan), + Print("["), + SetForegroundColor(Color::Green), + Print("█".repeat(filled)), + SetForegroundColor(Color::DarkGrey), + Print("░".repeat(40 - filled)), + SetForegroundColor(Color::Cyan), + Print("]"), + ResetColor, + Print(format!(" {:.0}% ({}/{}) - ", progress, current, total)), + SetAttribute(Attribute::Dim), + Print(template_name), + ResetColor + ); + stdout.flush().ok(); +} + +fn print_template_results(template_name: &str, results: &[BenchmarkResult]) { + print_section_header(&format!("Template: {}", template_name)); + + // Create results table with comfy-table + let mut table = Table::new(); + table + .load_preset(UTF8_FULL) + .set_content_arrangement(ContentArrangement::Dynamic) + .set_header(vec![ + Cell::new("Input Size") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Parse Time") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Total Time") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Avg/Path") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Throughput") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Parse %") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Scaling") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + ]); + + for (idx, result) in results.iter().enumerate() { + let scaling = if idx == 0 { + "baseline".to_string() + } else { + format!("{:.2}x", result.scaling_factor(&results[0])) + }; + + table.add_row(vec![ + Cell::new(format_size(result.input_size)), + Cell::new(format_duration(result.parse_time)), + Cell::new(format_duration(result.total_format_time)), + Cell::new(format_duration(result.avg_time_per_path)), + Cell::new(format_throughput(result.throughput_paths_per_sec)), + Cell::new(format!("{:.2}%", result.parse_percentage)), + Cell::new(scaling), + ]); + } + + println!("\n{}", table); + + // Scaling analysis + if results.len() >= 2 { + let first = &results[0]; + let last = &results[results.len() - 1]; + + let size_ratio = last.input_size as f64 / first.input_size as f64; + let time_ratio = + last.total_format_time.as_secs_f64() / first.total_format_time.as_secs_f64(); + let scaling_quality = time_ratio / size_ratio; + + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + Print("\n"), + SetForegroundColor(Color::Magenta), + Print("📊 Scaling Analysis:\n"), + ResetColor + ); + println!( + " Size increase: {:.0}x ({} → {})", + size_ratio, + format_size(first.input_size), + format_size(last.input_size) + ); + println!( + " Time increase: {:.2}x ({} → {})", + time_ratio, + 
format_duration(first.total_format_time), + format_duration(last.total_format_time) + ); + + let scaling_desc = if scaling_quality < 0.95 { + "Sub-linear (improving with scale!) 🚀" + } else if scaling_quality <= 1.05 { + "Linear (perfect scaling) ✓" + } else if scaling_quality <= 1.5 { + "Slightly super-linear" + } else { + "Super-linear (degrading with scale)" + }; + + println!( + " Scaling behavior: {:.2}x - {}", + scaling_quality, scaling_desc + ); + println!( + " Parse cost reduction: {:.2}% → {:.2}%", + first.parse_percentage, last.parse_percentage + ); + } + + // Latency statistics for largest size + if !results.is_empty() { + let largest_result = results.last().unwrap(); + + // Latency statistics + let stats = &largest_result.latency_stats; + println!( + "\n📈 Latency Statistics (at {} inputs):", + format_size(largest_result.input_size) + ); + println!( + " Min: {} p50: {} p95: {} p99: {} Stddev: {}", + format_duration(stats.min), + format_duration(stats.p50), + format_duration(stats.p95), + format_duration(stats.p99), + format_duration(Duration::from_nanos(stats.stddev as u64)) + ); + + // Performance consistency analysis + let p50_ns = stats.p50.as_nanos() as f64; + let p99_ns = stats.p99.as_nanos() as f64; + + if p50_ns > 0.0 { + let p99_p50_ratio = p99_ns / p50_ns; + let stddev_percent = (stats.stddev / p50_ns) * 100.0; + + println!(" Analysis:"); + + // Consistency (p99/p50 ratio) + print!(" - Consistency: {:.2}x", p99_p50_ratio); + if p99_p50_ratio < 2.0 { + println!(" (excellent - very predictable)"); + } else if p99_p50_ratio < 3.0 { + println!(" (good - mostly consistent)"); + } else if p99_p50_ratio < 5.0 { + println!(" (fair - some variance)"); + } else { + println!(" (poor - high variance)"); + } + + // Variance (stddev %) + print!(" - Variance: {:.1}%", stddev_percent); + if stddev_percent < 20.0 { + println!(" (low - stable)"); + } else if stddev_percent < 40.0 { + println!(" (moderate)"); + } else { + println!(" (high - jittery)"); + } + } + + println!(); + } +} + +fn print_statistics_explanation(sample_count: usize) { + print_header("📖 LATENCY STATISTICS METHODOLOGY"); + + println!( + " Latency statistics calculated from {} iteration samples", + sample_count + ); + println!(" Each sample = average time per path for one complete iteration"); + println!(); + println!(" Statistical Methods:"); + println!(" - Percentiles: Nearest-rank method on sorted iteration averages"); + println!(" â€Ē p50 = value at index ceil(n × 0.50) - 1"); + println!(" â€Ē p95 = value at index ceil(n × 0.95) - 1"); + println!(" â€Ē p99 = value at index ceil(n × 0.99) - 1"); + println!(); + println!(" - Consistency: p99/p50 ratio (lower = more predictable)"); + println!(" - Variance: (stddev/p50) × 100% (lower = more stable)"); + println!(" - Stddev: √(ÎĢ(x - mean)Âē / n) over iteration samples"); + println!(); +} + +fn print_summary(all_results: &[(&str, Vec)]) { + // Get the largest input size for the header + let largest_size = all_results + .iter() + .filter_map(|(_, results)| results.last().map(|r| r.input_size)) + .max() + .unwrap_or(0); + + let header_text = format!( + "📊 SUMMARY - Performance at Largest Input Size ({})", + format_size(largest_size) + ); + print_header(&header_text); + + // Collect results with latency stats for sorting + let mut summary_data: Vec<(&str, Duration, Duration, Duration, f64, f64)> = all_results + .iter() + .filter_map(|(name, results)| { + results.last().map(|last| { + ( + *name, + last.avg_time_per_path, + last.latency_stats.p95, + last.latency_stats.p99, + 
last.latency_stats.stddev, + last.throughput_paths_per_sec, + ) + }) + }) + .collect(); + + // Sort by throughput (highest first) + summary_data.sort_by(|a, b| b.5.partial_cmp(&a.5).unwrap()); + + // Create summary table with comfy-table + let mut table = Table::new(); + table + .load_preset(UTF8_FULL) + .set_content_arrangement(ContentArrangement::Dynamic) + .set_header(vec![ + Cell::new("Template") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Avg/Path") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("p95") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("p99") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Stddev") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + Cell::new("Throughput") + .add_attribute(TableAttribute::Bold) + .fg(TableColor::Yellow), + ]); + + for (idx, (template_name, avg_time, p95, p99, stddev, throughput)) in + summary_data.iter().enumerate() + { + // Highlight fastest (green) and slowest (yellow) + let color = if idx == 0 { + TableColor::Green + } else if idx == summary_data.len() - 1 { + TableColor::Yellow + } else { + TableColor::Reset + }; + + table.add_row(vec![ + Cell::new(template_name).fg(color), + Cell::new(format_duration(*avg_time)).fg(color), + Cell::new(format_duration(*p95)).fg(color), + Cell::new(format_duration(*p99)).fg(color), + Cell::new(format_duration(Duration::from_nanos(*stddev as u64))).fg(color), + Cell::new(format_throughput(*throughput)).fg(color), + ]); + } + + println!("{}", table); +} + +/// Output results in JSON format for tracking over time +#[derive(Serialize)] +struct BenchmarkOutput<'a> { + timestamp: u64, + benchmarks: Vec>, +} + +#[derive(Serialize)] +struct TemplateBenchmark<'a> { + template_name: &'a str, + results: &'a [BenchmarkResult], +} + +fn output_json( + all_results: &[(&str, Vec)], + output_path: Option<&str>, +) -> Result<(), Box> { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH)? 
+ .as_secs(); + + let benchmarks: Vec = all_results + .iter() + .map(|(name, results)| TemplateBenchmark { + template_name: name, + results, + }) + .collect(); + + let output = BenchmarkOutput { + timestamp, + benchmarks, + }; + + let json_string = serde_json::to_string_pretty(&output)?; + + if let Some(path) = output_path { + std::fs::write(path, json_string)?; + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + Print("\n"), + SetForegroundColor(Color::Green), + Print("✓ JSON output written to: "), + ResetColor, + Print(format!("{}\n", path)) + ); + } else { + println!("\n{}", json_string); + } + + Ok(()) +} + +fn main() { + let matches = Command::new("String Pipeline Throughput Benchmark") + .version(env!("CARGO_PKG_VERSION")) + .about("Benchmarks batch processing throughput with varying input sizes") + .arg( + Arg::new("sizes") + .short('s') + .long("sizes") + .value_name("COUNTS") + .help("Comma-separated input sizes (number of paths to process)") + .default_value("100,500,1000,5000,10000,50000,100000"), + ) + .arg( + Arg::new("iterations") + .short('i') + .long("iterations") + .value_name("COUNT") + .help("Number of measurement iterations per size for stability") + .default_value("50"), + ) + .arg( + Arg::new("format") + .short('f') + .long("format") + .value_name("FORMAT") + .help("Output format: console or json") + .default_value("console"), + ) + .arg( + Arg::new("output") + .short('o') + .long("output") + .value_name("FILE") + .help("Output file path (for JSON format)"), + ) + .arg( + Arg::new("verbose") + .short('v') + .long("verbose") + .action(clap::ArgAction::SetTrue) + .help("Show detailed output for each template (default shows only summary)"), + ) + .get_matches(); + + // Parse arguments + let sizes_str = matches.get_one::("sizes").unwrap(); + let sizes: Vec = sizes_str + .split(',') + .map(|s| { + s.trim() + .parse() + .unwrap_or_else(|_| panic!("Invalid size value: {}", s)) + }) + .collect(); + + let iterations: usize = matches + .get_one::("iterations") + .unwrap() + .parse() + .expect("Invalid iteration count"); + + let format = matches.get_one::("format").unwrap(); + let output_path = matches.get_one::("output"); + let verbose = matches.get_flag("verbose"); + + if sizes.is_empty() { + eprintln!("Error: At least one input size is required"); + std::process::exit(1); + } + + // Always show header + print_header("String Pipeline Throughput Benchmark v0.13.0"); + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + Print("Measuring batch processing performance with varying input sizes\n"), + Print("Pattern: Parse and format N paths with M iterations for stability\n\n"), + SetForegroundColor(Color::Cyan), + Print("Input sizes: "), + ResetColor, + Print(format!( + "{:?}\n", + sizes.iter().map(|s| format_size(*s)).collect::>() + )), + SetForegroundColor(Color::Cyan), + Print("Measurement iterations: "), + ResetColor, + Print(format!("{}\n", iterations)), + SetForegroundColor(Color::Cyan), + Print("Output format: "), + ResetColor, + Print(format!("{}\n", format)) + ); + + let templates = TemplateSet::get_templates(); + let mut all_results = Vec::new(); + let total_templates = templates.len(); + + for (idx, (template_name, template_str)) in templates.iter().enumerate() { + // Always show progress bar + print_progress_bar(idx + 1, total_templates, template_name); + + match benchmark_template(template_name, template_str, &sizes, iterations) { + Ok(results) => { + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + cursor::MoveToColumn(0), + 
Clear(ClearType::CurrentLine) + ); + if verbose { + print_template_results(template_name, &results); + } + all_results.push((*template_name, results)); + } + Err(e) => { + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + cursor::MoveToColumn(0), + Clear(ClearType::CurrentLine) + ); + print_error(&format!("Failed to benchmark '{}': {}", template_name, e)); + } + } + } + + // Get iteration count from first template for statistics explanation + let sample_count = if !all_results.is_empty() && !all_results[0].1.is_empty() { + all_results[0].1[0].latency_stats.sample_count + } else { + iterations + }; + + // In verbose mode, show statistics explanation before summary + if verbose { + print_statistics_explanation(sample_count); + } + + // Always show summary + print_summary(&all_results); + + if format == "json" + && let Err(e) = output_json(&all_results, output_path.map(|s| s.as_str())) + { + eprintln!("Error writing JSON output: {}", e); + std::process::exit(1); + } + + // Always show completion message + let mut stdout = io::stdout(); + let _ = execute!( + stdout, + SetForegroundColor(Color::Green), + SetAttribute(Attribute::Bold), + Print("✓ Benchmark complete!\n"), + ResetColor + ); +}
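+
+// A minimal sketch of the JSON document emitted by `output_json` above (and read by
+// scripts/compare_benchmarks.py). Field names and nesting follow `BenchmarkOutput`,
+// `TemplateBenchmark`, `BenchmarkResult`, and `LatencyStatistics`; durations are
+// serialized as integer nanoseconds via `serialize_duration`. The values shown are
+// illustrative only:
+//
+// {
+//   "timestamp": 1699123456,
+//   "benchmarks": [
+//     {
+//       "template_name": "Strip ANSI",
+//       "results": [
+//         {
+//           "input_size": 10000,
+//           "parse_time": 1850,
+//           "total_format_time": 3040000,
+//           "avg_time_per_path": 304,
+//           "throughput_paths_per_sec": 3290000.0,
+//           "parse_percentage": 0.06,
+//           "latency_stats": {
+//             "min": 290, "p50": 300, "p95": 320, "p99": 327,
+//             "max": 410, "stddev": 9.4, "sample_count": 50
+//           }
+//         }
+//       ]
+//     }
+//   ]
+// }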