# DSPy Optimizer Benchmark

This notebook benchmarks all available DSPy optimizers against the AI text detection task from `dspy-in-8-steps.ipynb`.

**Metrics tracked for each optimizer:**
- API calls (number of LM calls)
- Total tokens used (prompt + completion)
- Cost estimate (based on OpenAI pricing)
- Accuracy uplift vs baseline
- Time taken to optimize

**Models used:**
- Task model: `openai/gpt-5-mini`
- Teacher/reflection model: `openai/gpt-5`
- Fine-tuning model: `Qwen3-1.7B` (for GRPO, BootstrapFinetune, BetterTogether)

In [1]:
# Install dependencies
!uv pip install dspy pandas python-dotenv gepa -q

In [2]:
import dspy
import pandas as pd
import time
import os
import random
import copy
import threading
import contextlib
import signal
import sys
import time as time_module
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
import tqdm
import logging
from dotenv import load_dotenv
from dataclasses import dataclass, field
from typing import Any

# Load environment variables
load_dotenv()

# Pricing per 1M tokens (as of Dec 2025)
# Source: https://openai.com/api/pricing/
PRICING = {
    "openai/gpt-5-mini": {"input": 0.25, "output": 2.00},
    "openai/gpt-5": {"input": 1.25, "output": 10.00},
}

# Worker-thread count passed to DSPy evaluators/optimizers in this notebook.
NUM_THREADS = 16


def calculate_cost(usage: dict) -> float:
    """Estimate the USD cost of a per-model token usage mapping.

    Args:
        usage: Mapping of model name to a dict that may contain
            ``"prompt_tokens"`` and ``"completion_tokens"`` counts
            (missing counts are treated as 0).

    Returns:
        The summed cost over every model that has an entry in ``PRICING``.
        Models without a pricing entry contribute nothing to the total.
    """
    running_total = 0.0
    for model_name, counts in usage.items():
        rates = PRICING.get(model_name)
        if rates is None:
            # No price known for this model: it is left out of the estimate.
            continue
        # PRICING is quoted per one million tokens.
        running_total += (counts.get("prompt_tokens", 0) / 1_000_000) * rates["input"]
        running_total += (counts.get("completion_tokens", 0) / 1_000_000) * rates["output"]
    return running_total



In [3]:

# =============================================================================
# Monkey-patch for DSPy parallelizer bug (affects dspy.track_usage())
# 
# Bug: Line 95 in parallelizer.py tries to access thread_local_overrides.overrides
# which doesn't exist (ContextVar has no .overrides attribute). This silently
# breaks context propagation to worker threads.
#
# Fix: Build the complete overrides dict BEFORE calling set(), and don't deep
# copy the usage_tracker so it's shared across threads.
# =============================================================================
from dspy.utils.parallelizer import ParallelExecutor
from dspy.dsp.utils.settings import thread_local_overrides

_logger = logging.getLogger(__name__)

def _fixed_execute_parallel(self, function, data):
    """Fixed version of ParallelExecutor._execute_parallel.

    Runs ``function`` over every item of ``data`` on a thread pool while
    propagating the caller's ``thread_local_overrides`` into each worker, so
    that context set in the submitting thread is visible inside the workers.

    Args:
        function: Callable applied to each item. An ``Exception`` *instance*
            returned by it is recorded as a failure for that index; an
            exception that propagates out of the worker is swallowed by the
            result loop and the item's slot simply stays ``None``.
        data: Sized, iterable collection of work items.

    Returns:
        A list the same length as ``data`` holding each item's result in
        input order; slots for failed or unfinished items remain ``None``.

    Raises:
        Exception: If ``self.cancel_jobs`` is set when the run finishes.
    """
    # One slot per input item; None doubles as the "no result yet" marker.
    results = [None] * len(data)
    # Sentinel returned by a worker that started after cancellation was requested.
    job_cancelled = "cancelled"
    # submission id -> wall-clock time at which the worker actually began running.
    start_time_map = {}
    start_time_lock = threading.Lock()
    # Futures that already triggered a resubmission (each is resubmitted at most once).
    resubmitted = set()

    # Shallow copy of the overrides active in the submitting thread, taken once
    # up front and handed to every worker.
    parent_overrides = thread_local_overrides.get().copy()

    def worker(parent_ovr, submission_id, index, item):
        # Skip the work entirely once cancellation has been requested.
        if self.cancel_jobs.is_set():
            return index, job_cancelled
        with start_time_lock:
            start_time_map[submission_id] = time_module.time()

        original = thread_local_overrides.get()
        # FIX: Build complete overrides dict BEFORE calling set()
        # Don't deep copy usage_tracker - share it across threads for aggregation
        # Parent entries take precedence over whatever this worker thread already had.
        new_overrides = {**original, **parent_ovr}
        token = thread_local_overrides.set(new_overrides)

        try:
            return index, function(item)
        finally:
            # Always restore the worker thread's previous overrides.
            thread_local_overrides.reset(token)

    @contextlib.contextmanager
    def interrupt_manager():
        # A SIGINT handler is only installed when running on the main thread;
        # on any other thread this context manager is a no-op.
        if threading.current_thread() is threading.main_thread():
            orig_handler = signal.getsignal(signal.SIGINT)
            def handler(sig, frame):
                # Ask workers to stop, then chain to the previously installed handler.
                self.cancel_jobs.set()
                _logger.warning("SIGINT received. Cancelling.")
                orig_handler(sig, frame)
            signal.signal(signal.SIGINT, handler)
            try:
                yield
            finally:
                # Put the previous handler back regardless of how the block exits.
                signal.signal(signal.SIGINT, orig_handler)
        else:
            yield

    executor = ThreadPoolExecutor(max_workers=self.num_threads)
    try:
        with interrupt_manager():
            # future -> (submission id, item index, item), kept so stragglers can be resubmitted.
            futures_map = {}
            # Futures whose results have not been consumed yet.
            futures_set = set()
            submission_counter = 0

            for idx, item in enumerate(data):
                f = executor.submit(worker, parent_overrides, submission_counter, idx, item)
                futures_map[f] = (submission_counter, idx, item)
                futures_set.add(f)
                submission_counter += 1

            pbar = tqdm.tqdm(
                total=len(data),
                dynamic_ncols=True,
                disable=self.disable_progress_bar,
                file=sys.stdout,
            )

            def all_done():
                # NOTE: an item that failed, or whose function returned None, never
                # counts as done here; in that case the loop ends once futures_set is empty.
                return all(r is not None for r in results)

            while futures_set and not self.cancel_jobs.is_set():
                if all_done():
                    break
                # Wake at least once per second so cancellation and straggler
                # checks run even when no future completes.
                done, not_done = wait(futures_set, timeout=1, return_when=FIRST_COMPLETED)
                for f in done:
                    futures_set.remove(f)
                    try:
                        index, outcome = f.result()
                    except Exception:
                        # Exceptions raised inside the worker are dropped silently;
                        # the corresponding results slot stays None.
                        pass
                    else:
                        # First result for an index wins; a later duplicate from a
                        # resubmitted job is ignored.
                        if outcome != job_cancelled and results[index] is None:
                            if isinstance(outcome, Exception):
                                with self.error_lock:
                                    self.failed_indices.append(index)
                                    self.exceptions_map[index] = outcome
                                results[index] = None
                            else:
                                results[index] = outcome

                        # Progress is refreshed only for futures whose result was retrieved.
                        if self.compare_results:
                            # Sums the last element of each finished result
                            # (presumably a per-item score - confirm against caller).
                            vals = [r[-1] for r in results if r is not None]
                            self._update_progress(pbar, sum(vals), len(vals))
                        else:
                            self._update_progress(pbar, len([r for r in results if r is not None]), len(data))

                if all_done():
                    break

                # Straggler handling: only when a positive timeout is configured and
                # few enough jobs remain, resubmit jobs that have run for at least
                # self.timeout seconds.
                if 0 < self.timeout and len(not_done) <= self.straggler_limit:
                    now = time_module.time()
                    for f in list(not_done):
                        if f not in resubmitted:
                            sid, idx, item = futures_map[f]
                            with start_time_lock:
                                st = start_time_map.get(sid, None)
                            # st is missing while the job is still queued and has not started.
                            if st and (now - st) >= self.timeout:
                                resubmitted.add(f)
                                nf = executor.submit(worker, parent_overrides, submission_counter, idx, item)
                                futures_map[nf] = (submission_counter, idx, item)
                                futures_set.add(nf)
                                submission_counter += 1

            # Reached only when the loop exits without an exception.
            pbar.close()

    finally:
        # Do not block on jobs that are still running (e.g. abandoned stragglers).
        executor.shutdown(wait=False)

    if self.cancel_jobs.is_set():
        _logger.warning("Execution cancelled due to errors or interruption.")
        raise Exception("Execution cancelled due to errors or interruption.")

    return results

# Apply the fix: rebinding the method on the class means every ParallelExecutor
# instance uses the patched implementation from this point on. This cell must
# therefore run before any evaluation or optimizer cell.
ParallelExecutor._execute_parallel = _fixed_execute_parallel
print("Applied DSPy parallelizer fix for usage tracking")

Applied DSPy parallelizer fix for usage tracking


In [4]:
# Initialize models
# The task and teacher models differ only in their model id, so the shared
# settings are declared once and reused for both.
_shared_lm_kwargs = dict(
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=1.0,
    max_tokens=16000,
    cache=False,  # In order to track latency and cost
)

task_lm = dspy.LM("openai/gpt-5-mini", **_shared_lm_kwargs)
teacher_lm = dspy.LM("openai/gpt-5", **_shared_lm_kwargs)  # Teacher/reflection model

# Configure default LM
dspy.configure(lm=task_lm)

In [5]:
# Load dataset (same as dspy-in-8-steps.ipynb)
csv_path = 'ai_vs_human200.csv'
df = pd.read_csv(csv_path)
examples = df.to_dict(orient='records')

# Convert to DSPy Examples: every CSV column becomes a field on the Example,
# and only "text" is marked as a model input.
dataset = [dspy.Example(**row).with_inputs("text") for row in examples]

# Reproducible shuffle and split (seeds the global `random` generator)
random.seed(42)
random.shuffle(dataset)

# First half -> training, remainder -> validation.
half = len(dataset) // 2
trainset, valset = dataset[:half], dataset[half:]

print(f"Loaded: {csv_path}")
print(f"Training set: {len(trainset)} examples")
print(f"Validation set: {len(valset)} examples")

Loaded: ai_vs_human200.csv
Training set: 101 examples
Validation set: 101 examples


In [6]:
# Define Signature and Module (same as dspy-in-8-steps.ipynb)
# Task signature: a single text input and a single boolean verdict output.
# NOTE(review): DSPy is documented to use a Signature's docstring as the task
# instruction and each field's description in the prompt, so the strings
# below are prompt text rather than plain documentation - edit with care.
class DetectAIText(dspy.Signature):
    """Detects whether text is AI-generated."""
    text: str = dspy.InputField(description="The text to analyze")
    is_ai: bool = dspy.OutputField(description="Whether the text is AI-generated")


class AIDetector(dspy.Module):
    """DSPy module that runs the ``DetectAIText`` signature through chain-of-thought."""

    def __init__(self):
        super().__init__()
        # Chain of thought reasoning
        self.detect = dspy.ChainOfThought(DetectAIText)

    def forward(self, text: str):
        """Run the ``detect`` predictor on ``text`` and return its result unchanged."""
        return self.detect(text=text)

In [7]:
# Define metrics (same as dspy-in-8-steps.ipynb)

# Standard exact match metric
def exact_match(example, response, trace=None):
    """Score 1 when the predicted ``is_ai`` equals the labelled one, else 0.

    ``trace`` is accepted for compatibility with the metric call signature
    and is not used.
    """
    if example.is_ai == response.is_ai:
        return 1
    return 0


# Exact match metric with feedback for GEPA
def exact_match_with_feedback(example, response, trace=None, pred_name=None, pred_trace=None):
    """Exact-match metric that can also hand back textual feedback.

    Returns the plain 0/1 score when ``pred_name`` is empty or ``None``.
    When ``pred_name`` is given, returns a ``dspy.Prediction`` carrying the
    score plus ``example.notes`` as feedback. ``trace`` and ``pred_trace``
    are accepted but unused.
    """
    is_match = example.is_ai == response.is_ai
    score = 1 if is_match else 0
    if not pred_name:
        return score
    return dspy.Prediction(score=score, feedback=example.notes)

In [8]:
@dataclass
class BenchmarkResult:
    """Results from benchmarking an optimizer.

    One instance is created per optimizer cell and appended to
    ``all_results``. Accuracies are stored as fractions;
    ``print_benchmark_result`` multiplies them by 100 for display.
    """
    # Display name of the optimizer that produced this result.
    optimizer_name: str
    # Accuracy of the unoptimized program, as a fraction.
    baseline_accuracy: float
    # Accuracy of the optimized program, as a fraction.
    optimized_accuracy: float
    # optimized_accuracy - baseline_accuracy, as computed by the benchmark cells.
    accuracy_uplift: float
    # Token counts summed over every model in the tracked usage.
    total_tokens: int
    prompt_tokens: int
    completion_tokens: int
    # Estimated cost in USD, filled from calculate_cost() by the benchmark cells.
    cost_usd: float
    # Elapsed wall-clock seconds measured around the benchmark cell's tracked section.
    time_seconds: float
    # Optional free-text remark printed with the result.
    notes: str = ""
    # Raw per-model usage mapping that the totals above were derived from.
    usage_by_model: dict = field(default_factory=dict)


def run_baseline_evaluation(module_class, devset, metric):
    """Run baseline evaluation and return accuracy.

    Args:
        module_class: Zero-argument callable (typically the module class)
            that builds a fresh, unoptimized program.
        devset: Examples to score the program on.
        metric: Metric callable passed through to the evaluator.

    Returns:
        Accuracy as a fraction (the evaluator's percentage score / 100).
    """
    # Delegate to evaluate_optimized so the baseline and the optimized
    # programs are always scored with one shared evaluator configuration
    # instead of two copies that could drift apart.
    return evaluate_optimized(module_class(), devset, metric)


def evaluate_optimized(optimized_module, devset, metric):
    """Score ``optimized_module`` on ``devset`` and return accuracy as a fraction.

    Args:
        optimized_module: The program to evaluate.
        devset: Examples to score the program on.
        metric: Metric callable passed through to ``dspy.Evaluate``.

    Returns:
        The evaluator's score divided by 100.
    """
    evaluator_settings = {
        "devset": devset,
        "metric": metric,
        "num_threads": NUM_THREADS,
        "display_progress": True,
        "display_table": False,
    }
    outcome = dspy.Evaluate(**evaluator_settings)(optimized_module)
    # The evaluator reports a percentage; convert it to a decimal fraction.
    return outcome.score / 100


def print_benchmark_result(result: BenchmarkResult):
    """Print one optimizer's benchmark summary as a fixed-width report.

    The ``Notes`` line is emitted only when ``result.notes`` is non-empty.
    """
    heavy_rule = "="*60
    light_rule = "-"*60

    def report_lines():
        # Lines are produced lazily, one per print call, in display order.
        yield "\n" + heavy_rule
        yield f"OPTIMIZER: {result.optimizer_name}"
        yield heavy_rule
        yield f"Baseline Accuracy:   {result.baseline_accuracy*100:.1f}%"
        yield f"Optimized Accuracy:  {result.optimized_accuracy*100:.1f}%"
        yield f"Accuracy Uplift:     {result.accuracy_uplift*100:+.1f}%"
        yield light_rule
        yield f"Total Tokens:        {result.total_tokens:,}"
        yield f"  - Prompt:          {result.prompt_tokens:,}"
        yield f"  - Completion:      {result.completion_tokens:,}"
        yield f"Estimated Cost:      ${result.cost_usd:.4f}"
        yield f"Time Taken:          {result.time_seconds:.1f}s"
        if result.notes:
            yield f"Notes:               {result.notes}"
        yield heavy_rule + "\n"

    for line in report_lines():
        print(line)


# Store all results for final comparison.
# Each optimizer cell appends its BenchmarkResult here, so re-running an
# optimizer cell adds a second entry; re-run this cell to reset the list.
all_results: list[BenchmarkResult] = []

## Baseline Evaluation

First, let's establish our baseline accuracy without any optimization.

In [9]:
# Run baseline evaluation: score an unoptimized AIDetector on the validation
# split with the exact-match metric. `baseline_accuracy` is reused by every
# optimizer cell below to compute its accuracy uplift.
print("Running baseline evaluation...")
baseline_accuracy = run_baseline_evaluation(AIDetector, valset, exact_match)

print(f"\nBaseline accuracy: {baseline_accuracy*100:.1f}%")

Running baseline evaluation...
Average Metric: 66.00 / 101 (65.3%): 100%|██████████| 101/101 [00:55<00:00,  1.82it/s]

2025/12/04 12:15:52 INFO dspy.evaluate.evaluate: Average Metric: 66 / 101 (65.3%)




Baseline accuracy: 65.3%


---
## 1. LabeledFewShot

The simplest optimizer - just adds labeled examples as few-shot demonstrations. No metric needed.

In [30]:
from dspy.teleprompt import LabeledFewShot

print("Running LabeledFewShot optimizer...")

# NOTE: the usage tracker and the timer both wrap the validation pass as well
# as compile(), so the reported tokens, cost and time include one evaluation.
start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = LabeledFewShot(k=4)
    optimized_detector = optimizer.compile(AIDetector(), trainset=trainset)

    # Evaluate
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)
elapsed = time.time() - start_time

# Collapse the per-model usage mapping into totals across all models.
total_usage = usage.get_total_tokens()
total_tokens, prompt_tokens, completion_tokens = (
    sum(model_usage.get(counter, 0) for model_usage in total_usage.values())
    for counter in ("total_tokens", "prompt_tokens", "completion_tokens")
)

result = BenchmarkResult(
    optimizer_name="LabeledFewShot",
    notes="No metric required. Simply adds k random examples as demos.",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    usage_by_model=total_usage,
)

all_results.append(result)
print_benchmark_result(result)

Running LabeledFewShot optimizer...
Average Metric: 87.00 / 101 (86.1%): 100%|██████████| 101/101 [00:49<00:00,  2.05it/s]

2025/12/03 15:41:48 INFO dspy.evaluate.evaluate: Average Metric: 87 / 101 (86.1%)




OPTIMIZER: LabeledFewShot
Baseline Accuracy:   72.3%
Optimized Accuracy:  86.1%
Accuracy Uplift:     +13.9%
------------------------------------------------------------
Total Tokens:        94,497
  - Prompt:          54,814
  - Completion:      39,683
Estimated Cost:      $0.0931
Time Taken:          49.3s
Notes:               No metric required. Simply adds k random examples as demos.



---
## 2. BootstrapFewShot

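Bootstraps demonstrations by running a teacher program on training examples and keeping the traces that pass the metric. No separate teacher is passed in the cell below, so the program being optimized generates its own traces.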

In [31]:
from dspy.teleprompt import BootstrapFewShot

print("Running BootstrapFewShot optimizer...")

# NOTE: the usage tracker and the timer both wrap the validation pass as well
# as compile(), so the reported tokens, cost and time include one evaluation.
start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = BootstrapFewShot(
        metric=exact_match,
        max_rounds=1,
        max_labeled_demos=4,
        max_bootstrapped_demos=4,
    )
    optimized_detector = optimizer.compile(AIDetector(), trainset=trainset)

    # Evaluate
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)
elapsed = time.time() - start_time

# Collapse the per-model usage mapping into totals across all models.
total_usage = usage.get_total_tokens()
total_tokens, prompt_tokens, completion_tokens = (
    sum(model_usage.get(counter, 0) for model_usage in total_usage.values())
    for counter in ("total_tokens", "prompt_tokens", "completion_tokens")
)

result = BenchmarkResult(
    optimizer_name="BootstrapFewShot",
    notes="Bootstraps successful traces as demonstrations.",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    usage_by_model=total_usage,
)

all_results.append(result)
print_benchmark_result(result)

Running BootstrapFewShot optimizer...


  4%|▍         | 4/101 [00:27<11:01,  6.82s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 97.00 / 101 (96.0%): 100%|██████████| 101/101 [00:42<00:00,  2.39it/s]

2025/12/03 15:44:26 INFO dspy.evaluate.evaluate: Average Metric: 97 / 101 (96.0%)




OPTIMIZER: BootstrapFewShot
Baseline Accuracy:   72.3%
Optimized Accuracy:  96.0%
Accuracy Uplift:     +23.8%
------------------------------------------------------------
Total Tokens:        117,456
  - Prompt:          79,121
  - Completion:      38,335
Estimated Cost:      $0.0965
Time Taken:          69.7s
Notes:               Bootstraps successful traces as demonstrations.



---
## 3. BootstrapFewShotWithRandomSearch

Tries multiple random configurations of bootstrapped demos and selects the best.

In [32]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

print("Running BootstrapFewShotWithRandomSearch optimizer...")

# NOTE: the usage tracker and the timer both wrap the validation pass as well
# as compile(), so the reported tokens, cost and time include one evaluation.
# NOTE(review): `valset` is handed to compile() and is also the set used for
# the final score below, so the reported accuracy is measured on data the
# optimizer was given.
start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = BootstrapFewShotWithRandomSearch(
        metric=exact_match,
        num_threads=NUM_THREADS,
        num_candidate_programs=8,  # Reduced for faster benchmarking
        max_rounds=1,
        max_labeled_demos=4,
        max_bootstrapped_demos=4,
    )
    optimized_detector = optimizer.compile(AIDetector(), trainset=trainset, valset=valset)

    # Evaluate
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)
elapsed = time.time() - start_time

# Collapse the per-model usage mapping into totals across all models.
total_usage = usage.get_total_tokens()
total_tokens, prompt_tokens, completion_tokens = (
    sum(model_usage.get(counter, 0) for model_usage in total_usage.values())
    for counter in ("total_tokens", "prompt_tokens", "completion_tokens")
)

result = BenchmarkResult(
    optimizer_name="BootstrapFewShotWithRandomSearch",
    notes="Random search over demo configurations.",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    usage_by_model=total_usage,
)

all_results.append(result)
print_benchmark_result(result)

Running BootstrapFewShotWithRandomSearch optimizer...
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 8 candidate sets.
Average Metric: 68.00 / 101 (67.3%): 100%|██████████| 101/101 [00:54<00:00,  1.85it/s]

2025/12/03 15:48:38 INFO dspy.evaluate.evaluate: Average Metric: 68 / 101 (67.3%)



New best score: 67.33 for seed -3
Scores so far: [67.33]
Best score so far: 67.33
Average Metric: 83.00 / 101 (82.2%): 100%|██████████| 101/101 [00:51<00:00,  1.94it/s]

2025/12/03 15:49:31 INFO dspy.evaluate.evaluate: Average Metric: 83 / 101 (82.2%)



New best score: 82.18 for seed -2
Scores so far: [67.33, 82.18]
Best score so far: 82.18


  5%|▍         | 5/101 [00:30<09:41,  6.06s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Average Metric: 98.00 / 101 (97.0%): 100%|██████████| 101/101 [00:41<00:00,  2.46it/s]

2025/12/03 15:50:42 INFO dspy.evaluate.evaluate: Average Metric: 98 / 101 (97.0%)



New best score: 97.03 for seed -1
Scores so far: [67.33, 82.18, 97.03]
Best score so far: 97.03


  4%|▍         | 4/101 [00:24<09:42,  6.01s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:35<00:00,  2.86it/s]

2025/12/03 15:51:41 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)



New best score: 99.01 for seed 0
Scores so far: [67.33, 82.18, 97.03, 99.01]
Best score so far: 99.01


  2%|▏         | 2/101 [00:18<15:04,  9.13s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:34<00:00,  2.89it/s]

2025/12/03 15:52:35 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01]
Best score so far: 99.01


  1%|          | 1/101 [00:04<07:52,  4.72s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 96.00 / 101 (95.0%): 100%|██████████| 101/101 [00:37<00:00,  2.69it/s]

2025/12/03 15:53:17 INFO dspy.evaluate.evaluate: Average Metric: 96 / 101 (95.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05]
Best score so far: 99.01


  2%|▏         | 2/101 [00:11<09:13,  5.59s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:38<00:00,  2.59it/s]

2025/12/03 15:54:07 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05, 99.01]
Best score so far: 99.01


  2%|▏         | 2/101 [00:09<08:05,  4.91s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:34<00:00,  2.90it/s]

2025/12/03 15:54:52 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05, 99.01, 99.01]
Best score so far: 99.01


  3%|▎         | 3/101 [00:17<09:47,  6.00s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Average Metric: 97.00 / 101 (96.0%): 100%|██████████| 101/101 [00:34<00:00,  2.92it/s]

2025/12/03 15:55:45 INFO dspy.evaluate.evaluate: Average Metric: 97 / 101 (96.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05, 99.01, 99.01, 96.04]
Best score so far: 99.01


  1%|          | 1/101 [00:04<07:28,  4.48s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 99.00 / 101 (98.0%): 100%|██████████| 101/101 [00:35<00:00,  2.86it/s]

2025/12/03 15:56:25 INFO dspy.evaluate.evaluate: Average Metric: 99 / 101 (98.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05, 99.01, 99.01, 96.04, 98.02]
Best score so far: 99.01


  3%|▎         | 3/101 [00:18<09:56,  6.09s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Average Metric: 99.00 / 101 (98.0%): 100%|██████████| 101/101 [00:39<00:00,  2.54it/s]

2025/12/03 15:57:23 INFO dspy.evaluate.evaluate: Average Metric: 99 / 101 (98.0%)



Scores so far: [67.33, 82.18, 97.03, 99.01, 99.01, 95.05, 99.01, 99.01, 96.04, 98.02, 98.02]
Best score so far: 99.01
11 candidate programs found.
Average Metric: 98.00 / 101 (97.0%): 100%|██████████| 101/101 [00:38<00:00,  2.65it/s]

2025/12/03 15:58:01 INFO dspy.evaluate.evaluate: Average Metric: 98 / 101 (97.0%)




OPTIMIZER: BootstrapFewShotWithRandomSearch
Baseline Accuracy:   72.3%
Optimized Accuracy:  97.0%
Accuracy Uplift:     +24.8%
------------------------------------------------------------
Total Tokens:        1,189,910
  - Prompt:          752,282
  - Completion:      437,628
Estimated Cost:      $1.0633
Time Taken:          617.3s
Notes:               Random search over demo configurations.



---
## 4. COPRO

Optimizes instructions using an iterative refinement approach.

In [33]:
from dspy.teleprompt import COPRO

print("Running COPRO optimizer...")

# NOTE: the usage tracker and the timer both wrap the validation pass as well
# as compile(), so the reported tokens, cost and time include one evaluation.
start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = COPRO(
        metric=exact_match,
        prompt_model=teacher_lm,
        depth=2,    # Number of refinement iterations
        breadth=5,  # Number of candidates per iteration
        init_temperature=1.0,
        track_stats=True,
    )
    optimized_detector = optimizer.compile(
        AIDetector(),
        trainset=trainset,
        eval_kwargs=dict(num_threads=NUM_THREADS, display_progress=True, display_table=False),
    )

    # Evaluate
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)
elapsed = time.time() - start_time

# Collapse the per-model usage mapping into totals across all models.
total_usage = usage.get_total_tokens()
total_tokens, prompt_tokens, completion_tokens = (
    sum(model_usage.get(counter, 0) for model_usage in total_usage.values())
    for counter in ("total_tokens", "prompt_tokens", "completion_tokens")
)

result = BenchmarkResult(
    optimizer_name="COPRO",
    notes=f"Uses gpt-5 for instruction generation. depth={optimizer.depth}, breadth={optimizer.breadth}",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    usage_by_model=total_usage,
)

all_results.append(result)
print_benchmark_result(result)

Running COPRO optimizer...


2025/12/03 16:10:45 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/2.
2025/12/03 16:10:45 INFO dspy.teleprompt.copro_optimizer: At Depth 1/2, Evaluating Prompt Candidate #1/5 for Predictor 1 of 1.






[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the ta

2025/12/03 16:14:34 INFO dspy.evaluate.evaluate: Average Metric: 56 / 101 (55.4%)
2025/12/03 16:14:34 INFO dspy.teleprompt.copro_optimizer: At Depth 1/2, Evaluating Prompt Candidate #2/5 for Predictor 1 of 1.







[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the t

2025/12/03 16:18:26 INFO dspy.evaluate.evaluate: Average Metric: 49 / 101 (48.5%)
2025/12/03 16:18:26 INFO dspy.teleprompt.copro_optimizer: At Depth 1/2, Evaluating Prompt Candidate #3/5 for Predictor 1 of 1.







[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the t

2025/12/03 16:23:44 INFO dspy.evaluate.evaluate: Average Metric: 49 / 101 (48.5%)
2025/12/03 16:23:44 INFO dspy.teleprompt.copro_optimizer: At Depth 1/2, Evaluating Prompt Candidate #4/5 for Predictor 1 of 1.







[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the t

2025/12/03 16:28:51 INFO dspy.evaluate.evaluate: Average Metric: 64 / 101 (63.4%)
2025/12/03 16:28:51 INFO dspy.teleprompt.copro_optimizer: At Depth 1/2, Evaluating Prompt Candidate #5/5 for Predictor 1 of 1.







[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the t

2025/12/03 16:29:36 INFO dspy.evaluate.evaluate: Average Metric: 72 / 101 (71.3%)







[34m[2025-12-03T16:10:45.526022][0m

[31mSystem message:[0m

Your input fields are:
1. `basic_instruction` (str): The initial instructions before optimization
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Your task is to propose an instruction that will lead a good language model to perform the t

2025/12/03 16:30:26 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/2.
2025/12/03 16:30:26 INFO dspy.teleprompt.copro_optimizer: At Depth 2/2, Evaluating Prompt Candidate #1/5 for Predictor 1 of 1.


Average Metric: 67.00 / 101 (66.3%): 100%|██████████| 101/101 [02:06<00:00,  1.25s/it]

2025/12/03 16:32:33 INFO dspy.evaluate.evaluate: Average Metric: 67 / 101 (66.3%)
2025/12/03 16:32:33 INFO dspy.teleprompt.copro_optimizer: At Depth 2/2, Evaluating Prompt Candidate #2/5 for Predictor 1 of 1.







[34m[2025-12-03T16:30:26.440183][0m

[31mSystem message:[0m

Your input fields are:
1. `attempted_instructions` (str):
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## attempted_instructions ## ]]
{attempted_instructions}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give some task instructions I've tried, along with their corresponding validation scores. The instructions are arranged in increasing order based on their scores, where higher scores indicat

2025/12/03 16:34:34 INFO dspy.evaluate.evaluate: Average Metric: 72 / 101 (71.3%)
2025/12/03 16:34:34 INFO dspy.teleprompt.copro_optimizer: At Depth 2/2, Evaluating Prompt Candidate #3/5 for Predictor 1 of 1.







[34m[2025-12-03T16:30:26.440183][0m

[31mSystem message:[0m

Your input fields are:
1. `attempted_instructions` (str):
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## attempted_instructions ## ]]
{attempted_instructions}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give some task instructions I've tried, along with their corresponding validation scores. The instructions are arranged in increasing order based on their scores, where higher scores indicat

2025/12/03 16:36:07 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)
2025/12/03 16:36:07 INFO dspy.teleprompt.copro_optimizer: At Depth 2/2, Evaluating Prompt Candidate #4/5 for Predictor 1 of 1.







[34m[2025-12-03T16:30:26.440183][0m

[31mSystem message:[0m

Your input fields are:
1. `attempted_instructions` (str):
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## attempted_instructions ## ]]
{attempted_instructions}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give some task instructions I've tried, along with their corresponding validation scores. The instructions are arranged in increasing order based on their scores, where higher scores indicat

2025/12/03 16:38:26 INFO dspy.evaluate.evaluate: Average Metric: 78 / 101 (77.2%)
2025/12/03 16:38:26 INFO dspy.teleprompt.copro_optimizer: At Depth 2/2, Evaluating Prompt Candidate #5/5 for Predictor 1 of 1.







[34m[2025-12-03T16:30:26.440183][0m

[31mSystem message:[0m

Your input fields are:
1. `attempted_instructions` (str):
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## attempted_instructions ## ]]
{attempted_instructions}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give some task instructions I've tried, along with their corresponding validation scores. The instructions are arranged in increasing order based on their scores, where higher scores indicat

2025/12/03 16:43:05 INFO dspy.evaluate.evaluate: Average Metric: 49 / 101 (48.5%)







[34m[2025-12-03T16:30:26.440183][0m

[31mSystem message:[0m

Your input fields are:
1. `attempted_instructions` (str):
Your output fields are:
1. `proposed_instruction` (str): The improved instructions for the language model
2. `proposed_prefix_for_output_field` (str): The string at the end of the prompt, which will help the model start solving the task
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## attempted_instructions ## ]]
{attempted_instructions}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## proposed_prefix_for_output_field ## ]]
{proposed_prefix_for_output_field}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an instruction optimizer for large language models. I will give some task instructions I've tried, along with their corresponding validation scores. The instructions are arranged in increasing order based on their scores, where higher scores indicat

2025/12/03 16:44:45 INFO dspy.evaluate.evaluate: Average Metric: 101 / 101 (100.0%)




OPTIMIZER: COPRO
Baseline Accuracy:   72.3%
Optimized Accuracy:  100.0%
Accuracy Uplift:     +27.7%
------------------------------------------------------------
Total Tokens:        3,136,535
  - Prompt:          1,157,672
  - Completion:      1,978,863
Estimated Cost:      $4.4435
Time Taken:          2119.4s
Notes:               Uses gpt-4o for instruction generation. depth=2, breadth=5



---
## 5. MIPROv2

Combines instruction optimization with few-shot demo selection using Bayesian optimization (Optuna).

In [34]:
# Benchmark cell: optimize AIDetector with MIPROv2 and record accuracy, token
# usage, estimated cost, and wall-clock time as a BenchmarkResult.
from dspy.teleprompt import MIPROv2

print("Running MIPROv2 optimizer...")

# Resolve the prompt-model name before the (long, paid) run so the notes field
# reports the model that is actually configured instead of a hard-coded name.
# The fallback keeps the cell from failing if teacher_lm has no `model` attribute.
prompt_model_name = getattr(teacher_lm, "model", "the teacher model")

start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = MIPROv2(
        metric=exact_match,
        prompt_model=teacher_lm,
        task_model=task_lm,
        auto="light",  # Use light preset for faster benchmarking
        num_threads=NUM_THREADS,
        max_bootstrapped_demos=4,
        max_labeled_demos=4,
        track_stats=True,
    )
    optimized_detector = optimizer.compile(
        AIDetector(),
        trainset=trainset,
        valset=valset,
    )

    # Evaluate inside the tracked region: the reported tokens, cost, and time
    # therefore cover both optimization and this final evaluation pass.
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

elapsed = time.time() - start_time
# Per-model usage mapping; each value is read below as a dict of token counters.
total_usage = usage.get_total_tokens()

# Aggregate the counters across every model that was called during the run.
total_tokens = sum(d.get("total_tokens", 0) for d in total_usage.values())
prompt_tokens = sum(d.get("prompt_tokens", 0) for d in total_usage.values())
completion_tokens = sum(d.get("completion_tokens", 0) for d in total_usage.values())

result = BenchmarkResult(
    optimizer_name="MIPROv2",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    notes=f"Uses {prompt_model_name} for instruction generation. auto='light' preset.",
    usage_by_model=total_usage,
)

print_benchmark_result(result)
all_results.append(result)

2025/12/03 16:46:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/12/03 16:46:24 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/03 16:46:24 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/03 16:46:24 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Running MIPROv2 optimizer...
Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  5%|▍         | 5/101 [00:34<11:01,  6.89s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/6


  1%|          | 1/101 [00:07<11:48,  7.09s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/6


  4%|▍         | 4/101 [00:29<12:00,  7.43s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/6


  1%|          | 1/101 [00:04<07:45,  4.65s/it]
2025/12/03 16:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/03 16:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/12/03 16:58:03 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/03 17:01:08 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/03 17:01:08 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Detects whether text is AI-generated.

2025/12/03 17:01:08 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are the final reviewer before an automated integrity system flags or clears a writer. A false positive could wrongly suspend a student; a false negative could let spam flood a newsroom. Your job: decide if the given text is AI-generated, and show your evidence clearly and briefly.

Do:
- Analyze only the provided text. Ignore topic quality, morality, or external knowledge.
- Base your decision on stylistic cues across diction, syntax, modality, openings, rhythm, and narrative grounding.
- Weigh both sides, then decide. In close calls, prefer False (human) to minimize harm.
- Keep reasoning concise (3–6 sentences), citing the mo

Average Metric: 67.00 / 100 (67.0%): 100%|██████████| 100/100 [00:44<00:00,  2.25it/s]

2025/12/03 17:01:53 INFO dspy.evaluate.evaluate: Average Metric: 67 / 100 (67.0%)
2025/12/03 17:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 67.0

2025/12/03 17:01:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 32.00 / 35 (91.4%): 100%|██████████| 35/35 [00:16<00:00,  2.07it/s]

2025/12/03 17:02:10 INFO dspy.evaluate.evaluate: Average Metric: 32 / 35 (91.4%)
2025/12/03 17:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/03 17:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43]
2025/12/03 17:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0]
2025/12/03 17:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 67.0


2025/12/03 17:02:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==



Average Metric: 34.00 / 35 (97.1%): 100%|██████████| 35/35 [00:45<00:00,  1.29s/it] 

2025/12/03 17:02:55 INFO dspy.evaluate.evaluate: Average Metric: 34 / 35 (97.1%)
2025/12/03 17:02:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/03 17:02:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14]
2025/12/03 17:02:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0]
2025/12/03 17:02:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 67.0


2025/12/03 17:02:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==



Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:19<00:00,  1.79it/s]

2025/12/03 17:03:15 INFO dspy.evaluate.evaluate: Average Metric: 33 / 35 (94.3%)
2025/12/03 17:03:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/03 17:03:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29]
2025/12/03 17:03:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0]
2025/12/03 17:03:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 67.0


2025/12/03 17:03:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==



Average Metric: 34.00 / 35 (97.1%): 100%|██████████| 35/35 [00:36<00:00,  1.05s/it]

2025/12/03 17:03:52 INFO dspy.evaluate.evaluate: Average Metric: 34 / 35 (97.1%)
2025/12/03 17:03:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/03 17:03:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14]
2025/12/03 17:03:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0]
2025/12/03 17:03:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 67.0


2025/12/03 17:03:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==



Average Metric: 34.00 / 35 (97.1%): 100%|██████████| 35/35 [00:15<00:00,  2.22it/s] 

2025/12/03 17:04:07 INFO dspy.evaluate.evaluate: Average Metric: 34 / 35 (97.1%)
2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14]
2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0]
2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 67.0


2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/03 17:04:07 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 97.14) from minibatch trials...



Average Metric: 98.00 / 100 (98.0%): 100%|██████████| 100/100 [01:43<00:00,  1.03s/it]

2025/12/03 17:05:51 INFO dspy.evaluate.evaluate: Average Metric: 98 / 100 (98.0%)
2025/12/03 17:05:51 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 98.0
2025/12/03 17:05:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:05:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0
2025/12/03 17:05:51 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/03 17:05:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==



Average Metric: 34.00 / 35 (97.1%): 100%|██████████| 35/35 [00:46<00:00,  1.32s/it] 

2025/12/03 17:06:37 INFO dspy.evaluate.evaluate: Average Metric: 34 / 35 (97.1%)
2025/12/03 17:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/03 17:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14, 97.14]
2025/12/03 17:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0


2025/12/03 17:06:37 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==



Average Metric: 34.00 / 35 (97.1%): 100%|██████████| 35/35 [00:25<00:00,  1.35it/s] 

2025/12/03 17:07:03 INFO dspy.evaluate.evaluate: Average Metric: 34 / 35 (97.1%)
2025/12/03 17:07:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/03 17:07:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14, 97.14, 97.14]
2025/12/03 17:07:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:07:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0


2025/12/03 17:07:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:17<00:00,  1.99it/s]

2025/12/03 17:07:21 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/03 17:07:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/03 17:07:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14, 97.14, 97.14, 80.0]
2025/12/03 17:07:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:07:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0


2025/12/03 17:07:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==



Average Metric: 35.00 / 35 (100.0%): 100%|██████████| 35/35 [00:17<00:00,  2.00it/s]

2025/12/03 17:07:39 INFO dspy.evaluate.evaluate: Average Metric: 35 / 35 (100.0%)
2025/12/03 17:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 2'].
2025/12/03 17:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14, 97.14, 97.14, 80.0, 100.0]
2025/12/03 17:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0


2025/12/03 17:07:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==



Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:15<00:00,  2.28it/s] 

2025/12/03 17:07:54 INFO dspy.evaluate.evaluate: Average Metric: 33 / 35 (94.3%)
2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 2'].
2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [91.43, 97.14, 94.29, 97.14, 97.14, 97.14, 97.14, 80.0, 100.0, 94.29]
2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0]
2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0


2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/03 17:07:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 97.14500000000001) from minibatch trials...



Average Metric: 98.00 / 100 (98.0%): 100%|██████████| 100/100 [00:42<00:00,  2.38it/s]

2025/12/03 17:08:36 INFO dspy.evaluate.evaluate: Average Metric: 98 / 100 (98.0%)
2025/12/03 17:08:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [67.0, 98.0, 98.0]





2025/12/03 17:08:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 98.0
2025/12/03 17:08:36 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/03 17:08:36 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 98.0!


Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [01:52<00:00,  1.11s/it]

2025/12/03 17:10:29 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)




OPTIMIZER: MIPROv2
Baseline Accuracy:   72.3%
Optimized Accuracy:  99.0%
Accuracy Uplift:     +26.7%
------------------------------------------------------------
Total Tokens:        1,217,881
  - Prompt:          700,424
  - Completion:      517,457
Estimated Cost:      $1.5882
Time Taken:          1445.0s
Notes:               Uses gpt-4o for instruction generation. auto='light' preset.



---
## 6. GEPA

Evolutionary optimizer that uses reflection to evolve instructions based on feedback.

In [35]:
# Benchmark cell: optimize AIDetector with GEPA and record accuracy, token
# usage, estimated cost, and wall-clock time as a BenchmarkResult.
from dspy.teleprompt import GEPA

print("Running GEPA optimizer...")

# Resolve the reflection-model name before the (long, paid) run so the notes
# field reports the model that is actually configured instead of a hard-coded
# name. The fallback keeps the cell from failing if teacher_lm has no `model`
# attribute.
reflection_model_name = getattr(teacher_lm, "model", "the teacher model")

start_time = time.time()
with dspy.track_usage() as usage:
    optimizer = GEPA(
        metric=exact_match_with_feedback,
        auto="light",
        reflection_lm=teacher_lm,
        # NOTE(review): hard-coded to 4 rather than NUM_THREADS — confirm this
        # lower parallelism is intentional for GEPA.
        num_threads=4,
        track_stats=True,
        use_merge=False,  # Disable merge for faster benchmarking
    )
    optimized_detector = optimizer.compile(
        AIDetector(),
        trainset=trainset,
        valset=valset,
    )

    # Evaluate inside the tracked region: the reported tokens, cost, and time
    # therefore cover both optimization and this final evaluation pass.
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

elapsed = time.time() - start_time
# Per-model usage mapping; each value is read below as a dict of token counters.
total_usage = usage.get_total_tokens()

# Aggregate the counters across every model that was called during the run.
total_tokens = sum(d.get("total_tokens", 0) for d in total_usage.values())
prompt_tokens = sum(d.get("prompt_tokens", 0) for d in total_usage.values())
completion_tokens = sum(d.get("completion_tokens", 0) for d in total_usage.values())

result = BenchmarkResult(
    optimizer_name="GEPA",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    notes=f"Evolutionary optimizer with reflection. Uses {reflection_model_name} for reflection.",
    usage_by_model=total_usage,
)

print_benchmark_result(result)
all_results.append(result)

2025/12/03 17:15:15 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 784 metric calls of the program. This amounts to 3.88 full evals on the train+val set.
2025/12/03 17:15:15 INFO dspy.teleprompt.gepa.gepa: Using 101 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.


Running GEPA optimizer...


GEPA Optimization:   0%|          | 0/784 [00:00<?, ?rollouts/s]2025/12/03 17:18:10 INFO dspy.evaluate.evaluate: Average Metric: 69 / 101 (68.3%)
2025/12/03 17:18:10 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.6831683168316832
GEPA Optimization:  13%|█▎        | 101/784 [02:54<19:42,  1.73s/rollouts]2025/12/03 17:18:10 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.6831683168316832


Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/12/03 17:18:18 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:18:18 INFO dspy.teleprompt.gepa.gepa: Iteration 1: All subsample scores perfect. Skipping.
2025/12/03 17:18:18 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
GEPA Optimization:  13%|█▎        | 104/784 [03:03<20:08,  1.78s/rollouts]2025/12/03 17:18:18 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.6831683168316832



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.53s/it]

2025/12/03 17:18:26 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:18:26 INFO dspy.teleprompt.gepa.gepa: Iteration 2: All subsample scores perfect. Skipping.
2025/12/03 17:18:26 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Reflective mutation did not propose a new candidate
GEPA Optimization:  14%|█▎        | 107/784 [03:11<20:31,  1.82s/rollouts]2025/12/03 17:18:26 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 0 score: 0.6831683168316832



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:11<00:00,  3.71s/it] 

2025/12/03 17:18:37 INFO dspy.evaluate.evaluate: Average Metric: 2 / 3 (66.7%)





2025/12/03 17:19:20 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for detect.predict: You are given a single piece of text and must judge whether it is more likely AI-generated. Produce two fields:
- reasoning: 1–4 concise sentences explaining the key signals you used. Reference concrete features of the input (you may quote short phrases). Calibrate uncertainty briefly if the sample is very short, but avoid long disclaimers.
- is_ai: a boolean (True if more likely AI-generated, False otherwise).

Base your decision only on the provided text. Do not use external tools or metadata. This is a probabilistic judgment; weigh multiple signals:

Signals that increase likelihood of AI-generated:
- Generic, “textbook” informational tone or common-sense advice with no concrete context.
- Enumerative or list-like phrasing, even within one sentence (e.g., “like/such as A and B”), covering multiple recommendations succinctly.
- Overly neutral, balanced, or polished style without idi

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.61s/it]

2025/12/03 17:23:22 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:23:22 INFO dspy.teleprompt.gepa.gepa: Iteration 4: All subsample scores perfect. Skipping.
2025/12/03 17:23:22 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Reflective mutation did not propose a new candidate
GEPA Optimization:  28%|██▊       | 217/784 [08:07<23:20,  2.47s/rollouts]2025/12/03 17:23:22 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 1 score: 0.9207920792079208



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:10<00:00,  3.44s/it]

2025/12/03 17:23:33 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:23:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: All subsample scores perfect. Skipping.
2025/12/03 17:23:33 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Reflective mutation did not propose a new candidate
GEPA Optimization:  28%|██▊       | 220/784 [08:17<23:34,  2.51s/rollouts]2025/12/03 17:23:33 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Selected program 1 score: 0.9207920792079208



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:11<00:00,  3.80s/it]

2025/12/03 17:23:44 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:23:44 INFO dspy.teleprompt.gepa.gepa: Iteration 6: All subsample scores perfect. Skipping.
2025/12/03 17:23:44 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Reflective mutation did not propose a new candidate
GEPA Optimization:  28%|██▊       | 223/784 [08:29<24:06,  2.58s/rollouts]2025/12/03 17:23:44 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 1 score: 0.9207920792079208



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:20<00:00,  6.79s/it]

2025/12/03 17:24:05 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:24:05 INFO dspy.teleprompt.gepa.gepa: Iteration 7: All subsample scores perfect. Skipping.
2025/12/03 17:24:05 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Reflective mutation did not propose a new candidate
GEPA Optimization:  29%|██▉       | 226/784 [08:49<26:44,  2.88s/rollouts]2025/12/03 17:24:05 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Selected program 1 score: 0.9207920792079208



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.91s/it]

2025/12/03 17:24:13 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:24:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: All subsample scores perfect. Skipping.
2025/12/03 17:24:13 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Reflective mutation did not propose a new candidate
GEPA Optimization:  29%|██▉       | 229/784 [08:58<26:38,  2.88s/rollouts]2025/12/03 17:24:13 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Selected program 0 score: 0.6831683168316832



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:10<00:00,  3.56s/it]

2025/12/03 17:24:24 INFO dspy.evaluate.evaluate: Average Metric: 1 / 3 (33.3%)





2025/12/03 17:25:11 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for detect.predict: Task
- Determine whether a given text snippet is likely AI-generated.

Input format
- You will receive an object with a single field:
  - text: the snippet to analyze (often a single sentence).

Output format
- Return an object with exactly these fields:
  - reasoning: 2–5 sentences explaining the key signals you used, grounded in the text.
  - is_ai: a boolean (True/False) indicating whether the text is likely AI-generated.

Decision rubric
Weigh concrete, text-internal signals rather than generic “AI fingerprint” claims. For short snippets, be explicit about uncertainty in your reasoning, but still choose a side.

AI-likely signals (heavier weight for short, context-free lines):
- Generic, abstract, or “timeless” statements with vague terms (e.g., broad psychological explanations like “often appears when,” “can help,” “benefit from”) and no concrete details.
- Polished, neutral, “co

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:07<00:00,  2.39s/it]

2025/12/03 17:28:13 INFO dspy.evaluate.evaluate: Average Metric: 2 / 3 (66.7%)





2025/12/03 17:29:15 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for detect.predict: You are given a short piece of text and must judge whether it was likely written by an AI system or by a human. Return a concise rationale and a boolean classification.

Input
- A single field:
  - text: the text snippet to evaluate (often just one sentence).

Output
- reasoning: 1–4 sentences explaining the most diagnostic cues in the text and your uncertainty. Refer to stylistic and structural features, not topic correctness.
- is_ai: True if you judge the text likely AI-generated; False if you judge it likely human-authored.

General approach
1) Note the length and register. For very short snippets, certainty is inherently low; still provide a decision and state low/medium/high confidence in your reasoning text.
2) Examine voice, specificity, and style:
   - Human-leaning cues:
     - First-person perspective with specific, concrete details (habits, triggers, time, place).
     - 

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.95s/it]

2025/12/03 17:32:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:32:45 INFO dspy.teleprompt.gepa.gepa: Iteration 11: All subsample scores perfect. Skipping.
2025/12/03 17:32:45 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Reflective mutation did not propose a new candidate
GEPA Optimization:  57%|█████▋    | 446/784 [17:29<13:41,  2.43s/rollouts]2025/12/03 17:32:45 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.73s/it]

2025/12/03 17:32:53 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:32:53 INFO dspy.teleprompt.gepa.gepa: Iteration 12: All subsample scores perfect. Skipping.
2025/12/03 17:32:53 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Reflective mutation did not propose a new candidate
GEPA Optimization:  57%|█████▋    | 449/784 [17:37<13:37,  2.44s/rollouts]2025/12/03 17:32:53 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.22s/it]

2025/12/03 17:33:03 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:03 INFO dspy.teleprompt.gepa.gepa: Iteration 13: All subsample scores perfect. Skipping.
2025/12/03 17:33:03 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Reflective mutation did not propose a new candidate
GEPA Optimization:  58%|█████▊    | 452/784 [17:47<13:40,  2.47s/rollouts]2025/12/03 17:33:03 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.33s/it]

2025/12/03 17:33:10 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:10 INFO dspy.teleprompt.gepa.gepa: Iteration 14: All subsample scores perfect. Skipping.
2025/12/03 17:33:10 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Reflective mutation did not propose a new candidate
GEPA Optimization:  58%|█████▊    | 455/784 [17:54<13:30,  2.46s/rollouts]2025/12/03 17:33:10 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.27s/it]

2025/12/03 17:33:16 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:16 INFO dspy.teleprompt.gepa.gepa: Iteration 15: All subsample scores perfect. Skipping.
2025/12/03 17:33:16 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Reflective mutation did not propose a new candidate
GEPA Optimization:  58%|█████▊    | 458/784 [18:01<13:19,  2.45s/rollouts]2025/12/03 17:33:16 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:04<00:00,  1.56s/it]

2025/12/03 17:33:21 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:21 INFO dspy.teleprompt.gepa.gepa: Iteration 16: All subsample scores perfect. Skipping.
2025/12/03 17:33:21 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Reflective mutation did not propose a new candidate
GEPA Optimization:  59%|█████▉    | 461/784 [18:06<12:45,  2.37s/rollouts]2025/12/03 17:33:21 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.13s/it]

2025/12/03 17:33:31 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:31 INFO dspy.teleprompt.gepa.gepa: Iteration 17: All subsample scores perfect. Skipping.
2025/12/03 17:33:31 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Reflective mutation did not propose a new candidate
GEPA Optimization:  59%|█████▉    | 464/784 [18:15<13:06,  2.46s/rollouts]2025/12/03 17:33:31 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.41s/it]

2025/12/03 17:33:38 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:38 INFO dspy.teleprompt.gepa.gepa: Iteration 18: All subsample scores perfect. Skipping.
2025/12/03 17:33:38 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Reflective mutation did not propose a new candidate
GEPA Optimization:  60%|█████▉    | 467/784 [18:22<12:57,  2.45s/rollouts]2025/12/03 17:33:38 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.07s/it]

2025/12/03 17:33:47 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:47 INFO dspy.teleprompt.gepa.gepa: Iteration 19: All subsample scores perfect. Skipping.
2025/12/03 17:33:47 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Reflective mutation did not propose a new candidate
GEPA Optimization:  60%|█████▉    | 470/784 [18:32<13:23,  2.56s/rollouts]2025/12/03 17:33:47 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.42s/it]

2025/12/03 17:33:54 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:33:54 INFO dspy.teleprompt.gepa.gepa: Iteration 20: All subsample scores perfect. Skipping.
2025/12/03 17:33:54 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Reflective mutation did not propose a new candidate
GEPA Optimization:  60%|██████    | 473/784 [18:39<13:08,  2.54s/rollouts]2025/12/03 17:33:54 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.87s/it]

2025/12/03 17:34:03 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:03 INFO dspy.teleprompt.gepa.gepa: Iteration 21: All subsample scores perfect. Skipping.
2025/12/03 17:34:03 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Reflective mutation did not propose a new candidate
GEPA Optimization:  61%|██████    | 476/784 [18:48<13:24,  2.61s/rollouts]2025/12/03 17:34:03 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.51s/it]

2025/12/03 17:34:11 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:11 INFO dspy.teleprompt.gepa.gepa: Iteration 22: All subsample scores perfect. Skipping.
2025/12/03 17:34:11 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Reflective mutation did not propose a new candidate
GEPA Optimization:  61%|██████    | 479/784 [18:55<13:09,  2.59s/rollouts]2025/12/03 17:34:11 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:11<00:00,  3.77s/it]

2025/12/03 17:34:22 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:22 INFO dspy.teleprompt.gepa.gepa: Iteration 23: All subsample scores perfect. Skipping.
2025/12/03 17:34:22 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Reflective mutation did not propose a new candidate
GEPA Optimization:  61%|██████▏   | 482/784 [19:06<14:33,  2.89s/rollouts]2025/12/03 17:34:22 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:05<00:00,  1.93s/it]

2025/12/03 17:34:28 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:28 INFO dspy.teleprompt.gepa.gepa: Iteration 24: All subsample scores perfect. Skipping.
2025/12/03 17:34:28 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Reflective mutation did not propose a new candidate
GEPA Optimization:  62%|██████▏   | 485/784 [19:12<13:09,  2.64s/rollouts]2025/12/03 17:34:28 INFO dspy.teleprompt.gepa.gepa: Iteration 25: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:10<00:00,  3.44s/it]

2025/12/03 17:34:38 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:38 INFO dspy.teleprompt.gepa.gepa: Iteration 25: All subsample scores perfect. Skipping.
2025/12/03 17:34:38 INFO dspy.teleprompt.gepa.gepa: Iteration 25: Reflective mutation did not propose a new candidate
GEPA Optimization:  62%|██████▏   | 488/784 [19:23<14:07,  2.86s/rollouts]2025/12/03 17:34:38 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.66s/it]

2025/12/03 17:34:46 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:46 INFO dspy.teleprompt.gepa.gepa: Iteration 26: All subsample scores perfect. Skipping.
2025/12/03 17:34:46 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Reflective mutation did not propose a new candidate
GEPA Optimization:  63%|██████▎   | 491/784 [19:31<13:42,  2.81s/rollouts]2025/12/03 17:34:46 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.92s/it]

2025/12/03 17:34:55 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:34:55 INFO dspy.teleprompt.gepa.gepa: Iteration 27: All subsample scores perfect. Skipping.
2025/12/03 17:34:55 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Reflective mutation did not propose a new candidate
GEPA Optimization:  63%|██████▎   | 494/784 [19:39<13:44,  2.84s/rollouts]2025/12/03 17:34:55 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.57s/it]

2025/12/03 17:35:03 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:03 INFO dspy.teleprompt.gepa.gepa: Iteration 28: All subsample scores perfect. Skipping.
2025/12/03 17:35:03 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Reflective mutation did not propose a new candidate
GEPA Optimization:  63%|██████▎   | 497/784 [19:47<13:14,  2.77s/rollouts]2025/12/03 17:35:03 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.13s/it]

2025/12/03 17:35:09 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:09 INFO dspy.teleprompt.gepa.gepa: Iteration 29: All subsample scores perfect. Skipping.
2025/12/03 17:35:09 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Reflective mutation did not propose a new candidate
GEPA Optimization:  64%|██████▍   | 500/784 [19:54<12:14,  2.59s/rollouts]2025/12/03 17:35:09 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.32s/it]

2025/12/03 17:35:16 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:16 INFO dspy.teleprompt.gepa.gepa: Iteration 30: All subsample scores perfect. Skipping.
2025/12/03 17:35:16 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Reflective mutation did not propose a new candidate
GEPA Optimization:  64%|██████▍   | 503/784 [20:01<11:45,  2.51s/rollouts]2025/12/03 17:35:16 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.37s/it]

2025/12/03 17:35:23 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:23 INFO dspy.teleprompt.gepa.gepa: Iteration 31: All subsample scores perfect. Skipping.
2025/12/03 17:35:23 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Reflective mutation did not propose a new candidate
GEPA Optimization:  65%|██████▍   | 506/784 [20:08<11:27,  2.47s/rollouts]2025/12/03 17:35:23 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.24s/it]

2025/12/03 17:35:30 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:30 INFO dspy.teleprompt.gepa.gepa: Iteration 32: All subsample scores perfect. Skipping.
2025/12/03 17:35:30 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Reflective mutation did not propose a new candidate
GEPA Optimization:  65%|██████▍   | 509/784 [20:15<11:02,  2.41s/rollouts]2025/12/03 17:35:30 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.55s/it]

2025/12/03 17:35:38 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:38 INFO dspy.teleprompt.gepa.gepa: Iteration 33: All subsample scores perfect. Skipping.
2025/12/03 17:35:38 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Reflective mutation did not propose a new candidate
GEPA Optimization:  65%|██████▌   | 512/784 [20:22<11:07,  2.45s/rollouts]2025/12/03 17:35:38 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.87s/it]

2025/12/03 17:35:46 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:46 INFO dspy.teleprompt.gepa.gepa: Iteration 34: All subsample scores perfect. Skipping.
2025/12/03 17:35:46 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Reflective mutation did not propose a new candidate
GEPA Optimization:  66%|██████▌   | 515/784 [20:31<11:33,  2.58s/rollouts]2025/12/03 17:35:46 INFO dspy.teleprompt.gepa.gepa: Iteration 35: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.35s/it]

2025/12/03 17:35:53 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:35:53 INFO dspy.teleprompt.gepa.gepa: Iteration 35: All subsample scores perfect. Skipping.
2025/12/03 17:35:53 INFO dspy.teleprompt.gepa.gepa: Iteration 35: Reflective mutation did not propose a new candidate
GEPA Optimization:  66%|██████▌   | 518/784 [20:38<11:08,  2.51s/rollouts]2025/12/03 17:35:53 INFO dspy.teleprompt.gepa.gepa: Iteration 36: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.36s/it]

2025/12/03 17:36:01 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:01 INFO dspy.teleprompt.gepa.gepa: Iteration 36: All subsample scores perfect. Skipping.
2025/12/03 17:36:01 INFO dspy.teleprompt.gepa.gepa: Iteration 36: Reflective mutation did not propose a new candidate
GEPA Optimization:  66%|██████▋   | 521/784 [20:45<10:50,  2.47s/rollouts]2025/12/03 17:36:01 INFO dspy.teleprompt.gepa.gepa: Iteration 37: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.86s/it]

2025/12/03 17:36:09 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:09 INFO dspy.teleprompt.gepa.gepa: Iteration 37: All subsample scores perfect. Skipping.
2025/12/03 17:36:09 INFO dspy.teleprompt.gepa.gepa: Iteration 37: Reflective mutation did not propose a new candidate
GEPA Optimization:  67%|██████▋   | 524/784 [20:54<11:13,  2.59s/rollouts]2025/12/03 17:36:09 INFO dspy.teleprompt.gepa.gepa: Iteration 38: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.73s/it]

2025/12/03 17:36:17 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:17 INFO dspy.teleprompt.gepa.gepa: Iteration 38: All subsample scores perfect. Skipping.
2025/12/03 17:36:17 INFO dspy.teleprompt.gepa.gepa: Iteration 38: Reflective mutation did not propose a new candidate
GEPA Optimization:  67%|██████▋   | 527/784 [21:02<11:17,  2.64s/rollouts]2025/12/03 17:36:17 INFO dspy.teleprompt.gepa.gepa: Iteration 39: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.11s/it]

2025/12/03 17:36:24 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:24 INFO dspy.teleprompt.gepa.gepa: Iteration 39: All subsample scores perfect. Skipping.
2025/12/03 17:36:24 INFO dspy.teleprompt.gepa.gepa: Iteration 39: Reflective mutation did not propose a new candidate
GEPA Optimization:  68%|██████▊   | 530/784 [21:08<10:30,  2.48s/rollouts]2025/12/03 17:36:24 INFO dspy.teleprompt.gepa.gepa: Iteration 40: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.47s/it]

2025/12/03 17:36:31 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:31 INFO dspy.teleprompt.gepa.gepa: Iteration 40: All subsample scores perfect. Skipping.
2025/12/03 17:36:31 INFO dspy.teleprompt.gepa.gepa: Iteration 40: Reflective mutation did not propose a new candidate
GEPA Optimization:  68%|██████▊   | 533/784 [21:16<10:22,  2.48s/rollouts]2025/12/03 17:36:31 INFO dspy.teleprompt.gepa.gepa: Iteration 41: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.12s/it]

2025/12/03 17:36:38 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:38 INFO dspy.teleprompt.gepa.gepa: Iteration 41: All subsample scores perfect. Skipping.
2025/12/03 17:36:38 INFO dspy.teleprompt.gepa.gepa: Iteration 41: Reflective mutation did not propose a new candidate
GEPA Optimization:  68%|██████▊   | 536/784 [21:22<09:48,  2.37s/rollouts]2025/12/03 17:36:38 INFO dspy.teleprompt.gepa.gepa: Iteration 42: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.53s/it]

2025/12/03 17:36:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:45 INFO dspy.teleprompt.gepa.gepa: Iteration 42: All subsample scores perfect. Skipping.
2025/12/03 17:36:45 INFO dspy.teleprompt.gepa.gepa: Iteration 42: Reflective mutation did not propose a new candidate
GEPA Optimization:  69%|██████▉   | 539/784 [21:30<09:54,  2.43s/rollouts]2025/12/03 17:36:45 INFO dspy.teleprompt.gepa.gepa: Iteration 43: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.16s/it]

2025/12/03 17:36:55 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:36:55 INFO dspy.teleprompt.gepa.gepa: Iteration 43: All subsample scores perfect. Skipping.
2025/12/03 17:36:55 INFO dspy.teleprompt.gepa.gepa: Iteration 43: Reflective mutation did not propose a new candidate
GEPA Optimization:  69%|██████▉   | 542/784 [21:39<10:40,  2.65s/rollouts]2025/12/03 17:36:55 INFO dspy.teleprompt.gepa.gepa: Iteration 44: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.11s/it]

2025/12/03 17:37:01 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:01 INFO dspy.teleprompt.gepa.gepa: Iteration 44: All subsample scores perfect. Skipping.
2025/12/03 17:37:01 INFO dspy.teleprompt.gepa.gepa: Iteration 44: Reflective mutation did not propose a new candidate
GEPA Optimization:  70%|██████▉   | 545/784 [21:46<09:54,  2.49s/rollouts]2025/12/03 17:37:01 INFO dspy.teleprompt.gepa.gepa: Iteration 45: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.48s/it]

2025/12/03 17:37:09 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:09 INFO dspy.teleprompt.gepa.gepa: Iteration 45: All subsample scores perfect. Skipping.
2025/12/03 17:37:09 INFO dspy.teleprompt.gepa.gepa: Iteration 45: Reflective mutation did not propose a new candidate
GEPA Optimization:  70%|██████▉   | 548/784 [21:53<09:47,  2.49s/rollouts]2025/12/03 17:37:09 INFO dspy.teleprompt.gepa.gepa: Iteration 46: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.31s/it]

2025/12/03 17:37:15 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:15 INFO dspy.teleprompt.gepa.gepa: Iteration 46: All subsample scores perfect. Skipping.
2025/12/03 17:37:15 INFO dspy.teleprompt.gepa.gepa: Iteration 46: Reflective mutation did not propose a new candidate
GEPA Optimization:  70%|███████   | 551/784 [22:00<09:28,  2.44s/rollouts]2025/12/03 17:37:15 INFO dspy.teleprompt.gepa.gepa: Iteration 47: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:10<00:00,  3.33s/it]

2025/12/03 17:37:26 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:26 INFO dspy.teleprompt.gepa.gepa: Iteration 47: All subsample scores perfect. Skipping.
2025/12/03 17:37:26 INFO dspy.teleprompt.gepa.gepa: Iteration 47: Reflective mutation did not propose a new candidate
GEPA Optimization:  71%|███████   | 554/784 [22:10<10:23,  2.71s/rollouts]2025/12/03 17:37:26 INFO dspy.teleprompt.gepa.gepa: Iteration 48: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/12/03 17:37:34 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:34 INFO dspy.teleprompt.gepa.gepa: Iteration 48: All subsample scores perfect. Skipping.
2025/12/03 17:37:34 INFO dspy.teleprompt.gepa.gepa: Iteration 48: Reflective mutation did not propose a new candidate
GEPA Optimization:  71%|███████   | 557/784 [22:19<10:24,  2.75s/rollouts]2025/12/03 17:37:34 INFO dspy.teleprompt.gepa.gepa: Iteration 49: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.48s/it]

2025/12/03 17:37:42 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:42 INFO dspy.teleprompt.gepa.gepa: Iteration 49: All subsample scores perfect. Skipping.
2025/12/03 17:37:42 INFO dspy.teleprompt.gepa.gepa: Iteration 49: Reflective mutation did not propose a new candidate
GEPA Optimization:  71%|███████▏  | 560/784 [22:26<09:59,  2.67s/rollouts]2025/12/03 17:37:42 INFO dspy.teleprompt.gepa.gepa: Iteration 50: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]

2025/12/03 17:37:51 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:51 INFO dspy.teleprompt.gepa.gepa: Iteration 50: All subsample scores perfect. Skipping.
2025/12/03 17:37:51 INFO dspy.teleprompt.gepa.gepa: Iteration 50: Reflective mutation did not propose a new candidate
GEPA Optimization:  72%|███████▏  | 563/784 [22:35<10:19,  2.80s/rollouts]2025/12/03 17:37:51 INFO dspy.teleprompt.gepa.gepa: Iteration 51: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.25s/it]

2025/12/03 17:37:58 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:37:58 INFO dspy.teleprompt.gepa.gepa: Iteration 51: All subsample scores perfect. Skipping.
2025/12/03 17:37:58 INFO dspy.teleprompt.gepa.gepa: Iteration 51: Reflective mutation did not propose a new candidate
GEPA Optimization:  72%|███████▏  | 566/784 [22:42<09:35,  2.64s/rollouts]2025/12/03 17:37:58 INFO dspy.teleprompt.gepa.gepa: Iteration 52: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.64s/it]

2025/12/03 17:38:06 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:06 INFO dspy.teleprompt.gepa.gepa: Iteration 52: All subsample scores perfect. Skipping.
2025/12/03 17:38:06 INFO dspy.teleprompt.gepa.gepa: Iteration 52: Reflective mutation did not propose a new candidate
GEPA Optimization:  73%|███████▎  | 569/784 [22:50<09:28,  2.64s/rollouts]2025/12/03 17:38:06 INFO dspy.teleprompt.gepa.gepa: Iteration 53: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.46s/it]

2025/12/03 17:38:13 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:13 INFO dspy.teleprompt.gepa.gepa: Iteration 53: All subsample scores perfect. Skipping.
2025/12/03 17:38:13 INFO dspy.teleprompt.gepa.gepa: Iteration 53: Reflective mutation did not propose a new candidate
GEPA Optimization:  73%|███████▎  | 572/784 [22:58<09:09,  2.59s/rollouts]2025/12/03 17:38:13 INFO dspy.teleprompt.gepa.gepa: Iteration 54: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.36s/it]

2025/12/03 17:38:20 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:20 INFO dspy.teleprompt.gepa.gepa: Iteration 54: All subsample scores perfect. Skipping.
2025/12/03 17:38:20 INFO dspy.teleprompt.gepa.gepa: Iteration 54: Reflective mutation did not propose a new candidate
GEPA Optimization:  73%|███████▎  | 575/784 [23:05<08:47,  2.53s/rollouts]2025/12/03 17:38:20 INFO dspy.teleprompt.gepa.gepa: Iteration 55: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.23s/it]

2025/12/03 17:38:27 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:27 INFO dspy.teleprompt.gepa.gepa: Iteration 55: All subsample scores perfect. Skipping.
2025/12/03 17:38:27 INFO dspy.teleprompt.gepa.gepa: Iteration 55: Reflective mutation did not propose a new candidate
GEPA Optimization:  74%|███████▎  | 578/784 [23:11<08:22,  2.44s/rollouts]2025/12/03 17:38:27 INFO dspy.teleprompt.gepa.gepa: Iteration 56: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.70s/it]

2025/12/03 17:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:35 INFO dspy.teleprompt.gepa.gepa: Iteration 56: All subsample scores perfect. Skipping.
2025/12/03 17:38:35 INFO dspy.teleprompt.gepa.gepa: Iteration 56: Reflective mutation did not propose a new candidate
GEPA Optimization:  74%|███████▍  | 581/784 [23:19<08:31,  2.52s/rollouts]2025/12/03 17:38:35 INFO dspy.teleprompt.gepa.gepa: Iteration 57: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]

2025/12/03 17:38:43 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:43 INFO dspy.teleprompt.gepa.gepa: Iteration 57: All subsample scores perfect. Skipping.
2025/12/03 17:38:43 INFO dspy.teleprompt.gepa.gepa: Iteration 57: Reflective mutation did not propose a new candidate
GEPA Optimization:  74%|███████▍  | 584/784 [23:28<08:42,  2.61s/rollouts]2025/12/03 17:38:43 INFO dspy.teleprompt.gepa.gepa: Iteration 58: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.15s/it]

2025/12/03 17:38:50 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:50 INFO dspy.teleprompt.gepa.gepa: Iteration 58: All subsample scores perfect. Skipping.
2025/12/03 17:38:50 INFO dspy.teleprompt.gepa.gepa: Iteration 58: Reflective mutation did not propose a new candidate
GEPA Optimization:  75%|███████▍  | 587/784 [23:34<08:07,  2.48s/rollouts]2025/12/03 17:38:50 INFO dspy.teleprompt.gepa.gepa: Iteration 59: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.31s/it]

2025/12/03 17:38:57 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:38:57 INFO dspy.teleprompt.gepa.gepa: Iteration 59: All subsample scores perfect. Skipping.
2025/12/03 17:38:57 INFO dspy.teleprompt.gepa.gepa: Iteration 59: Reflective mutation did not propose a new candidate
GEPA Optimization:  75%|███████▌  | 590/784 [23:41<07:51,  2.43s/rollouts]2025/12/03 17:38:57 INFO dspy.teleprompt.gepa.gepa: Iteration 60: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.18s/it]

2025/12/03 17:39:06 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:06 INFO dspy.teleprompt.gepa.gepa: Iteration 60: All subsample scores perfect. Skipping.
2025/12/03 17:39:06 INFO dspy.teleprompt.gepa.gepa: Iteration 60: Reflective mutation did not propose a new candidate
GEPA Optimization:  76%|███████▌  | 593/784 [23:51<08:27,  2.66s/rollouts]2025/12/03 17:39:06 INFO dspy.teleprompt.gepa.gepa: Iteration 61: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.54s/it]

2025/12/03 17:39:14 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:14 INFO dspy.teleprompt.gepa.gepa: Iteration 61: All subsample scores perfect. Skipping.
2025/12/03 17:39:14 INFO dspy.teleprompt.gepa.gepa: Iteration 61: Reflective mutation did not propose a new candidate
GEPA Optimization:  76%|███████▌  | 596/784 [23:59<08:13,  2.63s/rollouts]2025/12/03 17:39:14 INFO dspy.teleprompt.gepa.gepa: Iteration 62: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.64s/it]

2025/12/03 17:39:22 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:22 INFO dspy.teleprompt.gepa.gepa: Iteration 62: All subsample scores perfect. Skipping.
2025/12/03 17:39:22 INFO dspy.teleprompt.gepa.gepa: Iteration 62: Reflective mutation did not propose a new candidate
GEPA Optimization:  76%|███████▋  | 599/784 [24:07<08:07,  2.63s/rollouts]2025/12/03 17:39:22 INFO dspy.teleprompt.gepa.gepa: Iteration 63: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.36s/it]

2025/12/03 17:39:29 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:29 INFO dspy.teleprompt.gepa.gepa: Iteration 63: All subsample scores perfect. Skipping.
2025/12/03 17:39:29 INFO dspy.teleprompt.gepa.gepa: Iteration 63: Reflective mutation did not propose a new candidate
GEPA Optimization:  77%|███████▋  | 602/784 [24:14<07:45,  2.56s/rollouts]2025/12/03 17:39:29 INFO dspy.teleprompt.gepa.gepa: Iteration 64: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.25s/it]

2025/12/03 17:39:36 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:36 INFO dspy.teleprompt.gepa.gepa: Iteration 64: All subsample scores perfect. Skipping.
2025/12/03 17:39:36 INFO dspy.teleprompt.gepa.gepa: Iteration 64: Reflective mutation did not propose a new candidate
GEPA Optimization:  77%|███████▋  | 605/784 [24:21<07:21,  2.47s/rollouts]2025/12/03 17:39:36 INFO dspy.teleprompt.gepa.gepa: Iteration 65: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.05s/it]

2025/12/03 17:39:42 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:42 INFO dspy.teleprompt.gepa.gepa: Iteration 65: All subsample scores perfect. Skipping.
2025/12/03 17:39:42 INFO dspy.teleprompt.gepa.gepa: Iteration 65: Reflective mutation did not propose a new candidate
GEPA Optimization:  78%|███████▊  | 608/784 [24:27<06:52,  2.35s/rollouts]2025/12/03 17:39:42 INFO dspy.teleprompt.gepa.gepa: Iteration 66: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.27s/it]

2025/12/03 17:39:49 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:49 INFO dspy.teleprompt.gepa.gepa: Iteration 66: All subsample scores perfect. Skipping.
2025/12/03 17:39:49 INFO dspy.teleprompt.gepa.gepa: Iteration 66: Reflective mutation did not propose a new candidate
GEPA Optimization:  78%|███████▊  | 611/784 [24:34<06:42,  2.33s/rollouts]2025/12/03 17:39:49 INFO dspy.teleprompt.gepa.gepa: Iteration 67: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]

2025/12/03 17:39:58 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:39:58 INFO dspy.teleprompt.gepa.gepa: Iteration 67: All subsample scores perfect. Skipping.
2025/12/03 17:39:58 INFO dspy.teleprompt.gepa.gepa: Iteration 67: Reflective mutation did not propose a new candidate
GEPA Optimization:  78%|███████▊  | 614/784 [24:43<07:14,  2.56s/rollouts]2025/12/03 17:39:58 INFO dspy.teleprompt.gepa.gepa: Iteration 68: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.28s/it]

2025/12/03 17:40:05 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:05 INFO dspy.teleprompt.gepa.gepa: Iteration 68: All subsample scores perfect. Skipping.
2025/12/03 17:40:05 INFO dspy.teleprompt.gepa.gepa: Iteration 68: Reflective mutation did not propose a new candidate
GEPA Optimization:  79%|███████▊  | 617/784 [24:50<06:53,  2.48s/rollouts]2025/12/03 17:40:05 INFO dspy.teleprompt.gepa.gepa: Iteration 69: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:05<00:00,  1.90s/it]

2025/12/03 17:40:11 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:11 INFO dspy.teleprompt.gepa.gepa: Iteration 69: All subsample scores perfect. Skipping.
2025/12/03 17:40:11 INFO dspy.teleprompt.gepa.gepa: Iteration 69: Reflective mutation did not propose a new candidate
GEPA Optimization:  79%|███████▉  | 620/784 [24:55<06:18,  2.31s/rollouts]2025/12/03 17:40:11 INFO dspy.teleprompt.gepa.gepa: Iteration 70: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.61s/it]

2025/12/03 17:40:19 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:19 INFO dspy.teleprompt.gepa.gepa: Iteration 70: All subsample scores perfect. Skipping.
2025/12/03 17:40:19 INFO dspy.teleprompt.gepa.gepa: Iteration 70: Reflective mutation did not propose a new candidate
GEPA Optimization:  79%|███████▉  | 623/784 [25:03<06:26,  2.40s/rollouts]2025/12/03 17:40:19 INFO dspy.teleprompt.gepa.gepa: Iteration 71: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.34s/it]

2025/12/03 17:40:26 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:26 INFO dspy.teleprompt.gepa.gepa: Iteration 71: All subsample scores perfect. Skipping.
2025/12/03 17:40:26 INFO dspy.teleprompt.gepa.gepa: Iteration 71: Reflective mutation did not propose a new candidate
GEPA Optimization:  80%|███████▉  | 626/784 [25:10<06:16,  2.39s/rollouts]2025/12/03 17:40:26 INFO dspy.teleprompt.gepa.gepa: Iteration 72: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.73s/it]

2025/12/03 17:40:34 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:34 INFO dspy.teleprompt.gepa.gepa: Iteration 72: All subsample scores perfect. Skipping.
2025/12/03 17:40:34 INFO dspy.teleprompt.gepa.gepa: Iteration 72: Reflective mutation did not propose a new candidate
GEPA Optimization:  80%|████████  | 629/784 [25:19<06:26,  2.49s/rollouts]2025/12/03 17:40:34 INFO dspy.teleprompt.gepa.gepa: Iteration 73: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:11<00:00,  3.79s/it]

2025/12/03 17:40:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:45 INFO dspy.teleprompt.gepa.gepa: Iteration 73: All subsample scores perfect. Skipping.
2025/12/03 17:40:45 INFO dspy.teleprompt.gepa.gepa: Iteration 73: Reflective mutation did not propose a new candidate
GEPA Optimization:  81%|████████  | 632/784 [25:30<07:18,  2.88s/rollouts]2025/12/03 17:40:45 INFO dspy.teleprompt.gepa.gepa: Iteration 74: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/12/03 17:40:54 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:40:54 INFO dspy.teleprompt.gepa.gepa: Iteration 74: All subsample scores perfect. Skipping.
2025/12/03 17:40:54 INFO dspy.teleprompt.gepa.gepa: Iteration 74: Reflective mutation did not propose a new candidate
GEPA Optimization:  81%|████████  | 635/784 [25:39<07:08,  2.87s/rollouts]2025/12/03 17:40:54 INFO dspy.teleprompt.gepa.gepa: Iteration 75: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]

2025/12/03 17:41:02 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:02 INFO dspy.teleprompt.gepa.gepa: Iteration 75: All subsample scores perfect. Skipping.
2025/12/03 17:41:02 INFO dspy.teleprompt.gepa.gepa: Iteration 75: Reflective mutation did not propose a new candidate
GEPA Optimization:  81%|████████▏ | 638/784 [25:47<06:52,  2.82s/rollouts]2025/12/03 17:41:02 INFO dspy.teleprompt.gepa.gepa: Iteration 76: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.88s/it]

2025/12/03 17:41:11 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:11 INFO dspy.teleprompt.gepa.gepa: Iteration 76: All subsample scores perfect. Skipping.
2025/12/03 17:41:11 INFO dspy.teleprompt.gepa.gepa: Iteration 76: Reflective mutation did not propose a new candidate
GEPA Optimization:  82%|████████▏ | 641/784 [25:55<06:46,  2.84s/rollouts]2025/12/03 17:41:11 INFO dspy.teleprompt.gepa.gepa: Iteration 77: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:11<00:00,  3.78s/it]

2025/12/03 17:41:22 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:22 INFO dspy.teleprompt.gepa.gepa: Iteration 77: All subsample scores perfect. Skipping.
2025/12/03 17:41:22 INFO dspy.teleprompt.gepa.gepa: Iteration 77: Reflective mutation did not propose a new candidate
GEPA Optimization:  82%|████████▏ | 644/784 [26:07<07:18,  3.13s/rollouts]2025/12/03 17:41:22 INFO dspy.teleprompt.gepa.gepa: Iteration 78: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.91s/it]

2025/12/03 17:41:31 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:31 INFO dspy.teleprompt.gepa.gepa: Iteration 78: All subsample scores perfect. Skipping.
2025/12/03 17:41:31 INFO dspy.teleprompt.gepa.gepa: Iteration 78: Reflective mutation did not propose a new candidate
GEPA Optimization:  83%|████████▎ | 647/784 [26:15<07:00,  3.07s/rollouts]2025/12/03 17:41:31 INFO dspy.teleprompt.gepa.gepa: Iteration 79: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.52s/it]

2025/12/03 17:41:39 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:39 INFO dspy.teleprompt.gepa.gepa: Iteration 79: All subsample scores perfect. Skipping.
2025/12/03 17:41:39 INFO dspy.teleprompt.gepa.gepa: Iteration 79: Reflective mutation did not propose a new candidate
GEPA Optimization:  83%|████████▎ | 650/784 [26:23<06:29,  2.91s/rollouts]2025/12/03 17:41:39 INFO dspy.teleprompt.gepa.gepa: Iteration 80: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.32s/it]

2025/12/03 17:41:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:45 INFO dspy.teleprompt.gepa.gepa: Iteration 80: All subsample scores perfect. Skipping.
2025/12/03 17:41:45 INFO dspy.teleprompt.gepa.gepa: Iteration 80: Reflective mutation did not propose a new candidate
GEPA Optimization:  83%|████████▎ | 653/784 [26:30<05:58,  2.73s/rollouts]2025/12/03 17:41:45 INFO dspy.teleprompt.gepa.gepa: Iteration 81: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:05<00:00,  1.97s/it]

2025/12/03 17:41:51 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:51 INFO dspy.teleprompt.gepa.gepa: Iteration 81: All subsample scores perfect. Skipping.
2025/12/03 17:41:51 INFO dspy.teleprompt.gepa.gepa: Iteration 81: Reflective mutation did not propose a new candidate
GEPA Optimization:  84%|████████▎ | 656/784 [26:36<05:20,  2.51s/rollouts]2025/12/03 17:41:51 INFO dspy.teleprompt.gepa.gepa: Iteration 82: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.62s/it]

2025/12/03 17:41:59 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:41:59 INFO dspy.teleprompt.gepa.gepa: Iteration 82: All subsample scores perfect. Skipping.
2025/12/03 17:41:59 INFO dspy.teleprompt.gepa.gepa: Iteration 82: Reflective mutation did not propose a new candidate
GEPA Optimization:  84%|████████▍ | 659/784 [26:44<05:18,  2.54s/rollouts]2025/12/03 17:41:59 INFO dspy.teleprompt.gepa.gepa: Iteration 83: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.27s/it]

2025/12/03 17:42:06 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:06 INFO dspy.teleprompt.gepa.gepa: Iteration 83: All subsample scores perfect. Skipping.
2025/12/03 17:42:06 INFO dspy.teleprompt.gepa.gepa: Iteration 83: Reflective mutation did not propose a new candidate
GEPA Optimization:  84%|████████▍ | 662/784 [26:51<05:00,  2.47s/rollouts]2025/12/03 17:42:06 INFO dspy.teleprompt.gepa.gepa: Iteration 84: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.58s/it]

2025/12/03 17:42:14 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:14 INFO dspy.teleprompt.gepa.gepa: Iteration 84: All subsample scores perfect. Skipping.
2025/12/03 17:42:14 INFO dspy.teleprompt.gepa.gepa: Iteration 84: Reflective mutation did not propose a new candidate
GEPA Optimization:  85%|████████▍ | 665/784 [26:59<04:57,  2.50s/rollouts]2025/12/03 17:42:14 INFO dspy.teleprompt.gepa.gepa: Iteration 85: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.47s/it]

2025/12/03 17:42:21 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:21 INFO dspy.teleprompt.gepa.gepa: Iteration 85: All subsample scores perfect. Skipping.
2025/12/03 17:42:21 INFO dspy.teleprompt.gepa.gepa: Iteration 85: Reflective mutation did not propose a new candidate
GEPA Optimization:  85%|████████▌ | 668/784 [27:06<04:49,  2.50s/rollouts]2025/12/03 17:42:21 INFO dspy.teleprompt.gepa.gepa: Iteration 86: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.66s/it]

2025/12/03 17:42:29 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:29 INFO dspy.teleprompt.gepa.gepa: Iteration 86: All subsample scores perfect. Skipping.
2025/12/03 17:42:29 INFO dspy.teleprompt.gepa.gepa: Iteration 86: Reflective mutation did not propose a new candidate
GEPA Optimization:  86%|████████▌ | 671/784 [27:14<04:48,  2.55s/rollouts]2025/12/03 17:42:29 INFO dspy.teleprompt.gepa.gepa: Iteration 87: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]

2025/12/03 17:42:37 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:37 INFO dspy.teleprompt.gepa.gepa: Iteration 87: All subsample scores perfect. Skipping.
2025/12/03 17:42:37 INFO dspy.teleprompt.gepa.gepa: Iteration 87: Reflective mutation did not propose a new candidate
GEPA Optimization:  86%|████████▌ | 674/784 [27:22<04:45,  2.59s/rollouts]2025/12/03 17:42:37 INFO dspy.teleprompt.gepa.gepa: Iteration 88: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.45s/it]

2025/12/03 17:42:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:45 INFO dspy.teleprompt.gepa.gepa: Iteration 88: All subsample scores perfect. Skipping.
2025/12/03 17:42:45 INFO dspy.teleprompt.gepa.gepa: Iteration 88: Reflective mutation did not propose a new candidate
GEPA Optimization:  86%|████████▋ | 677/784 [27:29<04:33,  2.55s/rollouts]2025/12/03 17:42:45 INFO dspy.teleprompt.gepa.gepa: Iteration 89: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.62s/it]

2025/12/03 17:42:53 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:42:53 INFO dspy.teleprompt.gepa.gepa: Iteration 89: All subsample scores perfect. Skipping.
2025/12/03 17:42:53 INFO dspy.teleprompt.gepa.gepa: Iteration 89: Reflective mutation did not propose a new candidate
GEPA Optimization:  87%|████████▋ | 680/784 [27:37<04:27,  2.58s/rollouts]2025/12/03 17:42:53 INFO dspy.teleprompt.gepa.gepa: Iteration 90: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.56s/it]

2025/12/03 17:43:00 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:00 INFO dspy.teleprompt.gepa.gepa: Iteration 90: All subsample scores perfect. Skipping.
2025/12/03 17:43:00 INFO dspy.teleprompt.gepa.gepa: Iteration 90: Reflective mutation did not propose a new candidate
GEPA Optimization:  87%|████████▋ | 683/784 [27:45<04:20,  2.57s/rollouts]2025/12/03 17:43:00 INFO dspy.teleprompt.gepa.gepa: Iteration 91: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.44s/it]

2025/12/03 17:43:08 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:08 INFO dspy.teleprompt.gepa.gepa: Iteration 91: All subsample scores perfect. Skipping.
2025/12/03 17:43:08 INFO dspy.teleprompt.gepa.gepa: Iteration 91: Reflective mutation did not propose a new candidate
GEPA Optimization:  88%|████████▊ | 686/784 [27:52<04:08,  2.54s/rollouts]2025/12/03 17:43:08 INFO dspy.teleprompt.gepa.gepa: Iteration 92: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:18<00:00,  6.14s/it]

2025/12/03 17:43:26 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:26 INFO dspy.teleprompt.gepa.gepa: Iteration 92: All subsample scores perfect. Skipping.
2025/12/03 17:43:26 INFO dspy.teleprompt.gepa.gepa: Iteration 92: Reflective mutation did not propose a new candidate
GEPA Optimization:  88%|████████▊ | 689/784 [28:11<05:44,  3.62s/rollouts]2025/12/03 17:43:26 INFO dspy.teleprompt.gepa.gepa: Iteration 93: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.65s/it]

2025/12/03 17:43:34 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:34 INFO dspy.teleprompt.gepa.gepa: Iteration 93: All subsample scores perfect. Skipping.
2025/12/03 17:43:34 INFO dspy.teleprompt.gepa.gepa: Iteration 93: Reflective mutation did not propose a new candidate
GEPA Optimization:  88%|████████▊ | 692/784 [28:19<05:06,  3.33s/rollouts]2025/12/03 17:43:34 INFO dspy.teleprompt.gepa.gepa: Iteration 94: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.29s/it]

2025/12/03 17:43:41 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:41 INFO dspy.teleprompt.gepa.gepa: Iteration 94: All subsample scores perfect. Skipping.
2025/12/03 17:43:41 INFO dspy.teleprompt.gepa.gepa: Iteration 94: Reflective mutation did not propose a new candidate
GEPA Optimization:  89%|████████▊ | 695/784 [28:26<04:29,  3.02s/rollouts]2025/12/03 17:43:41 INFO dspy.teleprompt.gepa.gepa: Iteration 95: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.32s/it]

2025/12/03 17:43:51 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:43:51 INFO dspy.teleprompt.gepa.gepa: Iteration 95: All subsample scores perfect. Skipping.
2025/12/03 17:43:51 INFO dspy.teleprompt.gepa.gepa: Iteration 95: Reflective mutation did not propose a new candidate
GEPA Optimization:  89%|████████▉ | 698/784 [28:36<04:28,  3.12s/rollouts]2025/12/03 17:43:51 INFO dspy.teleprompt.gepa.gepa: Iteration 96: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:11<00:00,  3.74s/it]

2025/12/03 17:44:02 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:02 INFO dspy.teleprompt.gepa.gepa: Iteration 96: All subsample scores perfect. Skipping.
2025/12/03 17:44:02 INFO dspy.teleprompt.gepa.gepa: Iteration 96: Reflective mutation did not propose a new candidate
GEPA Optimization:  89%|████████▉ | 701/784 [28:47<04:34,  3.31s/rollouts]2025/12/03 17:44:02 INFO dspy.teleprompt.gepa.gepa: Iteration 97: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.29s/it]

2025/12/03 17:44:12 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:12 INFO dspy.teleprompt.gepa.gepa: Iteration 97: All subsample scores perfect. Skipping.
2025/12/03 17:44:12 INFO dspy.teleprompt.gepa.gepa: Iteration 97: Reflective mutation did not propose a new candidate
GEPA Optimization:  90%|████████▉ | 704/784 [28:57<04:24,  3.31s/rollouts]2025/12/03 17:44:12 INFO dspy.teleprompt.gepa.gepa: Iteration 98: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.12s/it]

2025/12/03 17:44:19 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:19 INFO dspy.teleprompt.gepa.gepa: Iteration 98: All subsample scores perfect. Skipping.
2025/12/03 17:44:19 INFO dspy.teleprompt.gepa.gepa: Iteration 98: Reflective mutation did not propose a new candidate
GEPA Optimization:  90%|█████████ | 707/784 [29:03<03:47,  2.95s/rollouts]2025/12/03 17:44:19 INFO dspy.teleprompt.gepa.gepa: Iteration 99: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.35s/it]

2025/12/03 17:44:26 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:26 INFO dspy.teleprompt.gepa.gepa: Iteration 99: All subsample scores perfect. Skipping.
2025/12/03 17:44:26 INFO dspy.teleprompt.gepa.gepa: Iteration 99: Reflective mutation did not propose a new candidate
GEPA Optimization:  91%|█████████ | 710/784 [29:10<03:25,  2.78s/rollouts]2025/12/03 17:44:26 INFO dspy.teleprompt.gepa.gepa: Iteration 100: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:15<00:00,  5.17s/it]

2025/12/03 17:44:41 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:41 INFO dspy.teleprompt.gepa.gepa: Iteration 100: All subsample scores perfect. Skipping.
2025/12/03 17:44:41 INFO dspy.teleprompt.gepa.gepa: Iteration 100: Reflective mutation did not propose a new candidate
GEPA Optimization:  91%|█████████ | 713/784 [29:26<04:08,  3.50s/rollouts]2025/12/03 17:44:41 INFO dspy.teleprompt.gepa.gepa: Iteration 101: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:09<00:00,  3.31s/it]

2025/12/03 17:44:51 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:44:51 INFO dspy.teleprompt.gepa.gepa: Iteration 101: All subsample scores perfect. Skipping.
2025/12/03 17:44:51 INFO dspy.teleprompt.gepa.gepa: Iteration 101: Reflective mutation did not propose a new candidate
GEPA Optimization:  91%|█████████▏| 716/784 [29:36<03:54,  3.44s/rollouts]2025/12/03 17:44:51 INFO dspy.teleprompt.gepa.gepa: Iteration 102: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.86s/it]

2025/12/03 17:45:00 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:00 INFO dspy.teleprompt.gepa.gepa: Iteration 102: All subsample scores perfect. Skipping.
2025/12/03 17:45:00 INFO dspy.teleprompt.gepa.gepa: Iteration 102: Reflective mutation did not propose a new candidate
GEPA Optimization:  92%|█████████▏| 719/784 [29:44<03:32,  3.27s/rollouts]2025/12/03 17:45:00 INFO dspy.teleprompt.gepa.gepa: Iteration 103: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.94s/it]

2025/12/03 17:45:09 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:09 INFO dspy.teleprompt.gepa.gepa: Iteration 103: All subsample scores perfect. Skipping.
2025/12/03 17:45:09 INFO dspy.teleprompt.gepa.gepa: Iteration 103: Reflective mutation did not propose a new candidate
GEPA Optimization:  92%|█████████▏| 722/784 [29:53<03:16,  3.18s/rollouts]2025/12/03 17:45:09 INFO dspy.teleprompt.gepa.gepa: Iteration 104: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.43s/it]

2025/12/03 17:45:16 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:16 INFO dspy.teleprompt.gepa.gepa: Iteration 104: All subsample scores perfect. Skipping.
2025/12/03 17:45:16 INFO dspy.teleprompt.gepa.gepa: Iteration 104: Reflective mutation did not propose a new candidate
GEPA Optimization:  92%|█████████▏| 725/784 [30:01<02:54,  2.95s/rollouts]2025/12/03 17:45:16 INFO dspy.teleprompt.gepa.gepa: Iteration 105: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.21s/it]

2025/12/03 17:45:23 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:23 INFO dspy.teleprompt.gepa.gepa: Iteration 105: All subsample scores perfect. Skipping.
2025/12/03 17:45:23 INFO dspy.teleprompt.gepa.gepa: Iteration 105: Reflective mutation did not propose a new candidate
GEPA Optimization:  93%|█████████▎| 728/784 [30:07<02:33,  2.73s/rollouts]2025/12/03 17:45:23 INFO dspy.teleprompt.gepa.gepa: Iteration 106: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.40s/it]

2025/12/03 17:45:30 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:30 INFO dspy.teleprompt.gepa.gepa: Iteration 106: All subsample scores perfect. Skipping.
2025/12/03 17:45:30 INFO dspy.teleprompt.gepa.gepa: Iteration 106: Reflective mutation did not propose a new candidate
GEPA Optimization:  93%|█████████▎| 731/784 [30:15<02:19,  2.64s/rollouts]2025/12/03 17:45:30 INFO dspy.teleprompt.gepa.gepa: Iteration 107: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.91s/it]

2025/12/03 17:45:39 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:39 INFO dspy.teleprompt.gepa.gepa: Iteration 107: All subsample scores perfect. Skipping.
2025/12/03 17:45:39 INFO dspy.teleprompt.gepa.gepa: Iteration 107: Reflective mutation did not propose a new candidate
GEPA Optimization:  94%|█████████▎| 734/784 [30:23<02:16,  2.72s/rollouts]2025/12/03 17:45:39 INFO dspy.teleprompt.gepa.gepa: Iteration 108: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.39s/it]

2025/12/03 17:45:46 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:46 INFO dspy.teleprompt.gepa.gepa: Iteration 108: All subsample scores perfect. Skipping.
2025/12/03 17:45:46 INFO dspy.teleprompt.gepa.gepa: Iteration 108: Reflective mutation did not propose a new candidate
GEPA Optimization:  94%|█████████▍| 737/784 [30:31<02:03,  2.63s/rollouts]2025/12/03 17:45:46 INFO dspy.teleprompt.gepa.gepa: Iteration 109: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/12/03 17:45:55 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:45:55 INFO dspy.teleprompt.gepa.gepa: Iteration 109: All subsample scores perfect. Skipping.
2025/12/03 17:45:55 INFO dspy.teleprompt.gepa.gepa: Iteration 109: Reflective mutation did not propose a new candidate
GEPA Optimization:  94%|█████████▍| 740/784 [30:39<01:58,  2.70s/rollouts]2025/12/03 17:45:55 INFO dspy.teleprompt.gepa.gepa: Iteration 110: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.20s/it]

2025/12/03 17:46:01 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:01 INFO dspy.teleprompt.gepa.gepa: Iteration 110: All subsample scores perfect. Skipping.
2025/12/03 17:46:01 INFO dspy.teleprompt.gepa.gepa: Iteration 110: Reflective mutation did not propose a new candidate
GEPA Optimization:  95%|█████████▍| 743/784 [30:46<01:44,  2.55s/rollouts]2025/12/03 17:46:01 INFO dspy.teleprompt.gepa.gepa: Iteration 111: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.43s/it]

2025/12/03 17:46:08 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:08 INFO dspy.teleprompt.gepa.gepa: Iteration 111: All subsample scores perfect. Skipping.
2025/12/03 17:46:08 INFO dspy.teleprompt.gepa.gepa: Iteration 111: Reflective mutation did not propose a new candidate
GEPA Optimization:  95%|█████████▌| 746/784 [30:53<01:35,  2.52s/rollouts]2025/12/03 17:46:08 INFO dspy.teleprompt.gepa.gepa: Iteration 112: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:05<00:00,  1.98s/it]

2025/12/03 17:46:14 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:14 INFO dspy.teleprompt.gepa.gepa: Iteration 112: All subsample scores perfect. Skipping.
2025/12/03 17:46:14 INFO dspy.teleprompt.gepa.gepa: Iteration 112: Reflective mutation did not propose a new candidate
GEPA Optimization:  96%|█████████▌| 749/784 [30:59<01:22,  2.36s/rollouts]2025/12/03 17:46:14 INFO dspy.teleprompt.gepa.gepa: Iteration 113: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.29s/it]

2025/12/03 17:46:21 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:21 INFO dspy.teleprompt.gepa.gepa: Iteration 113: All subsample scores perfect. Skipping.
2025/12/03 17:46:21 INFO dspy.teleprompt.gepa.gepa: Iteration 113: Reflective mutation did not propose a new candidate
GEPA Optimization:  96%|█████████▌| 752/784 [31:06<01:14,  2.34s/rollouts]2025/12/03 17:46:21 INFO dspy.teleprompt.gepa.gepa: Iteration 114: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.77s/it]

2025/12/03 17:46:30 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:30 INFO dspy.teleprompt.gepa.gepa: Iteration 114: All subsample scores perfect. Skipping.
2025/12/03 17:46:30 INFO dspy.teleprompt.gepa.gepa: Iteration 114: Reflective mutation did not propose a new candidate
GEPA Optimization:  96%|█████████▋| 755/784 [31:14<01:11,  2.47s/rollouts]2025/12/03 17:46:30 INFO dspy.teleprompt.gepa.gepa: Iteration 115: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.60s/it]

2025/12/03 17:46:38 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:38 INFO dspy.teleprompt.gepa.gepa: Iteration 115: All subsample scores perfect. Skipping.
2025/12/03 17:46:38 INFO dspy.teleprompt.gepa.gepa: Iteration 115: Reflective mutation did not propose a new candidate
GEPA Optimization:  97%|█████████▋| 758/784 [31:22<01:05,  2.51s/rollouts]2025/12/03 17:46:38 INFO dspy.teleprompt.gepa.gepa: Iteration 116: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:12<00:00,  4.31s/it]

2025/12/03 17:46:50 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:50 INFO dspy.teleprompt.gepa.gepa: Iteration 116: All subsample scores perfect. Skipping.
2025/12/03 17:46:50 INFO dspy.teleprompt.gepa.gepa: Iteration 116: Reflective mutation did not propose a new candidate
GEPA Optimization:  97%|█████████▋| 761/784 [31:35<01:10,  3.06s/rollouts]2025/12/03 17:46:50 INFO dspy.teleprompt.gepa.gepa: Iteration 117: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.21s/it]

2025/12/03 17:46:57 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:46:57 INFO dspy.teleprompt.gepa.gepa: Iteration 117: All subsample scores perfect. Skipping.
2025/12/03 17:46:57 INFO dspy.teleprompt.gepa.gepa: Iteration 117: Reflective mutation did not propose a new candidate
GEPA Optimization:  97%|█████████▋| 764/784 [31:42<00:56,  2.80s/rollouts]2025/12/03 17:46:57 INFO dspy.teleprompt.gepa.gepa: Iteration 118: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.28s/it]

2025/12/03 17:47:04 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:04 INFO dspy.teleprompt.gepa.gepa: Iteration 118: All subsample scores perfect. Skipping.
2025/12/03 17:47:04 INFO dspy.teleprompt.gepa.gepa: Iteration 118: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 767/784 [31:49<00:45,  2.65s/rollouts]2025/12/03 17:47:04 INFO dspy.teleprompt.gepa.gepa: Iteration 119: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.30s/it]

2025/12/03 17:47:11 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:11 INFO dspy.teleprompt.gepa.gepa: Iteration 119: All subsample scores perfect. Skipping.
2025/12/03 17:47:11 INFO dspy.teleprompt.gepa.gepa: Iteration 119: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 770/784 [31:55<00:35,  2.55s/rollouts]2025/12/03 17:47:11 INFO dspy.teleprompt.gepa.gepa: Iteration 120: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.65s/it]

2025/12/03 17:47:19 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:19 INFO dspy.teleprompt.gepa.gepa: Iteration 120: All subsample scores perfect. Skipping.
2025/12/03 17:47:19 INFO dspy.teleprompt.gepa.gepa: Iteration 120: Reflective mutation did not propose a new candidate
GEPA Optimization:  99%|█████████▊| 773/784 [32:03<00:28,  2.58s/rollouts]2025/12/03 17:47:19 INFO dspy.teleprompt.gepa.gepa: Iteration 121: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.15s/it]

2025/12/03 17:47:25 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:25 INFO dspy.teleprompt.gepa.gepa: Iteration 121: All subsample scores perfect. Skipping.
2025/12/03 17:47:25 INFO dspy.teleprompt.gepa.gepa: Iteration 121: Reflective mutation did not propose a new candidate
GEPA Optimization:  99%|█████████▉| 776/784 [32:10<00:19,  2.46s/rollouts]2025/12/03 17:47:25 INFO dspy.teleprompt.gepa.gepa: Iteration 122: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:13<00:00,  4.53s/it]

2025/12/03 17:47:39 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:39 INFO dspy.teleprompt.gepa.gepa: Iteration 122: All subsample scores perfect. Skipping.
2025/12/03 17:47:39 INFO dspy.teleprompt.gepa.gepa: Iteration 122: Reflective mutation did not propose a new candidate
GEPA Optimization:  99%|█████████▉| 779/784 [32:24<00:15,  3.08s/rollouts]2025/12/03 17:47:39 INFO dspy.teleprompt.gepa.gepa: Iteration 123: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:07<00:00,  2.63s/it]

2025/12/03 17:47:47 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 123: All subsample scores perfect. Skipping.
2025/12/03 17:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 123: Reflective mutation did not propose a new candidate
GEPA Optimization: 100%|█████████▉| 782/784 [32:32<00:05,  2.95s/rollouts]2025/12/03 17:47:47 INFO dspy.teleprompt.gepa.gepa: Iteration 124: Selected program 3 score: 0.9900990099009901



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:08<00:00,  2.75s/it]

2025/12/03 17:47:55 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/12/03 17:47:55 INFO dspy.teleprompt.gepa.gepa: Iteration 124: All subsample scores perfect. Skipping.
2025/12/03 17:47:55 INFO dspy.teleprompt.gepa.gepa: Iteration 124: Reflective mutation did not propose a new candidate
GEPA Optimization: 100%|█████████▉| 782/784 [32:40<00:05,  2.51s/rollouts]



Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:49<00:00,  2.03it/s]

2025/12/03 17:48:45 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)




OPTIMIZER: GEPA
Baseline Accuracy:   72.3%
Optimized Accuracy:  99.0%
Accuracy Uplift:     +26.7%
------------------------------------------------------------
Total Tokens:        983,276
  - Prompt:          598,859
  - Completion:      384,417
Estimated Cost:      $0.9765
Time Taken:          2010.1s
Notes:               Evolutionary optimizer with reflection. Uses gpt-4o for reflection.



---
## 7. SIMBA

Stochastic Introspective Mini-Batch Ascent (SIMBA) - uses an LLM to analyze the program's performance and generate improvement rules.

In [36]:
from dspy.teleprompt import SIMBA

# Benchmark the SIMBA optimizer: compile AIDetector on the training set, score
# the compiled program on the validation set, and record accuracy, token usage,
# estimated cost and wall-clock time as a BenchmarkResult.
print("Running SIMBA optimizer...")

start_time = time.time()
# Both compilation and the validation run happen inside track_usage(), so the
# reported tokens and cost cover optimization plus the final evaluation.
with dspy.track_usage() as usage:
    optimizer = SIMBA(
        metric=exact_match,
        bsize=8,  # Mini-batch size (reduced for small dataset)
        num_candidates=4,  # Number of candidates per iteration
        max_steps=3,  # Number of optimization steps
        max_demos=4,
        prompt_model=teacher_lm,
        num_threads=4,
    )
    optimized_detector = optimizer.compile(
        AIDetector(),
        trainset=trainset,
        seed=42,
    )

    # Evaluate
    optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

# Measured after the block above, so the time includes the validation run.
elapsed = time.time() - start_time
total_usage = usage.get_total_tokens()

# total_usage is keyed by model name; sum each counter across models.
total_tokens = sum(d.get("total_tokens", 0) for d in total_usage.values())
prompt_tokens = sum(d.get("prompt_tokens", 0) for d in total_usage.values())
completion_tokens = sum(d.get("completion_tokens", 0) for d in total_usage.values())

result = BenchmarkResult(
    optimizer_name="SIMBA",
    baseline_accuracy=baseline_accuracy,
    optimized_accuracy=optimized_accuracy,
    accuracy_uplift=optimized_accuracy - baseline_accuracy,
    total_tokens=total_tokens,
    prompt_tokens=prompt_tokens,
    completion_tokens=completion_tokens,
    cost_usd=calculate_cost(total_usage),
    time_seconds=elapsed,
    # The prompt model is teacher_lm (openai/gpt-5 in this notebook), not gpt-4o.
    notes="Mini-batch ascent with self-reflection. Uses gpt-5 for rule generation.",
    usage_by_model=total_usage,
)

print_benchmark_result(result)
all_results.append(result)

2025/12/03 17:48:45 INFO dspy.teleprompt.simba: Starting batch 1 of 3.


Running SIMBA optimizer...


2025/12/03 17:48:47 INFO dspy.teleprompt.simba: Sampling program trajectories on 8 examples x 4 samples.


Processed 32 / 32 examples: 100%|██████████| 32/32 [00:57<00:00,  1.80s/it]

2025/12/03 17:49:44 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.8125

2025/12/03 17:49:44 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.25.
2025/12/03 17:49:44 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule





2025/12/03 17:50:13 INFO dspy.teleprompt.simba_utils: Advice for detect.predict: If the input is a single sentence or very brief and exhibits generic academic collocations (e.g., "nuanced interplay," "far‑reaching implications," "increasingly complex landscape," "holistic approach," "it is imperative to") without concrete anchors (named entities, numbers, dates, citations, examples, quotes), then treat this absence of specificity as positive evidence for AI generation and lean is_ai = True even with low confidence. If the prose is uniformly polished, neutral, and non-committal, with balanced clauses and zero idiosyncrasies (typos, informal turns of phrase, personal voice), weight this further toward AI. Do not require overt AI artifacts (repetition, hallucinations) to vote AI; the lack of human-specific signals (personal anecdotes, situated context, concrete details) should count meaningfully. Conversely, if the short text includes multiple concrete anchors (e.g., “OECD 2019 report,” “

Processed 40 / 40 examples: 100%|██████████| 40/40 [01:10<00:00,  1.76s/it]

2025/12/03 17:51:48 INFO dspy.teleprompt.simba: Scores after 1 batches: [1.0, 0.875, 1.0, 1.0, 0.75], Best: 1.0

2025/12/03 17:51:48 INFO dspy.teleprompt.simba: Starting batch 2 of 3.





2025/12/03 17:51:50 INFO dspy.teleprompt.simba: Sampling program trajectories on 8 examples x 4 samples.


Processed 32 / 32 examples: 100%|██████████| 32/32 [01:10<00:00,  2.21s/it]

2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.71875

2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.5.
2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_demo_
2025/12/03 17:53:01 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/12/03 17:53:01 INFO dspy.teleprompt.simba: 

2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.25.
2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_demo_
2025/12/03 17:53:01 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/12/03 17:53:01 INFO dspy.teleprompt.simba: 

2025/12/03 17:53:01 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #3, with max score 1.0, max-to-min gap 1.0, and max-to-avg 




2025/12/03 17:53:28 INFO dspy.teleprompt.simba_utils: Advice for detect.predict: Adopt a tie-breaker for short, contextless advice. If the input is brief (e.g., a single sentence or < ~25 words), prescriptive, and uniformly polished, then weigh the following subtle AI cues heavily and classify as AI when multiple are present: 1) Generic collocations and safe corporate/academic phrasing (e.g., “benefit from,” “concise subject lines,” “specific requests,” “courteous closing”). 2) Formulaic parallelism or triads with evenly balanced noun phrases. 3) No concrete anchors (no examples, names, numbers, settings) and no experiential/personal voice (no I/we/you with lived detail). 4) Non-idiosyncratic wording with smoothed, error-free cadence. Avoid defaulting to “human” just because the sample is short or lacks overt AI tells. Instead: - If ≥2 of the above AI cues appear and human cues are absent, classify as AI and explain with those cues. - If you see human-leaning signals (idiosyncratic phr

Processed 40 / 40 examples: 100%|██████████| 40/40 [01:24<00:00,  2.10s/it]

2025/12/03 17:55:19 INFO dspy.teleprompt.simba: Scores after 2 batches: [0.75, 1.0, 1.0, 1.0, 0.625], Best: 1.0

2025/12/03 17:55:19 INFO dspy.teleprompt.simba: Starting batch 3 of 3.





2025/12/03 17:55:20 INFO dspy.teleprompt.simba: Sampling program trajectories on 8 examples x 4 samples.


Processed 32 / 32 examples: 100%|██████████| 32/32 [00:58<00:00,  1.83s/it]

2025/12/03 17:56:19 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.875

2025/12/03 17:56:19 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.5.
2025/12/03 17:56:19 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_demo_, having dropped 1 demos per predictor
2025/12/03 17:56:19 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/12/03 17:56:19 INFO dspy.teleprompt.simba: 

2025/12/03 17:56:19 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.25.
2025/12/03 17:56:19 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule





2025/12/03 17:56:48 INFO dspy.teleprompt.simba_utils: Advice for detect.predict: Adopt a checklist aligned to the guideline and apply the tie‑breaker explicitly. If the input is a single sentence or very short (e.g., one clause or a brief declarative) and contains 2+ generic academic collocations (e.g., noun phrases like "cognitive function," "mood regulation," "physical recovery") while lacking concrete anchors (no numbers, dates, citations, named entities, locations, quotes, attributions, or first‑person/experiential voice), then set is_ai = True. In your reasoning, explicitly (a) list the collocations you found, (b) state the absence of anchors, and (c) cite that the decision follows the short‑text tie‑breaker. If the text instead includes multiple concrete anchors or a distinct personal/experiential voice (e.g., "In my fieldwork in Oaxaca in 2019…", "OECD 2019 report", percentages, named institutions), shift weight away from AI and explain those anchors. Do not require overt AI art

Processed 40 / 40 examples: 100%|██████████| 40/40 [01:09<00:00,  1.73s/it]

2025/12/03 17:57:58 INFO dspy.teleprompt.simba: Scores after 3 batches: [1.0, 1.0, 1.0, 0.875, 0.5], Best: 1.0

2025/12/03 17:57:58 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 4 programs on the full trainset.



Processed 404 / 404 examples: 100%|██████████| 404/404 [11:34<00:00,  1.72s/it]

2025/12/03 18:09:32 INFO dspy.teleprompt.simba: Final trainset scores: [0.6732673267326733, 0.9900990099009901, 1.0, 0.9702970297029703], Best: 1.0 (at index 2)






Average Metric: 100.00 / 101 (99.0%): 100%|██████████| 101/101 [00:46<00:00,  2.16it/s]

2025/12/03 18:10:19 INFO dspy.evaluate.evaluate: Average Metric: 100 / 101 (99.0%)




OPTIMIZER: SIMBA
Baseline Accuracy:   72.3%
Optimized Accuracy:  99.0%
Accuracy Uplift:     +26.7%
------------------------------------------------------------
Total Tokens:        607,784
  - Prompt:          307,312
  - Completion:      300,472
Estimated Cost:      $0.7337
Time Taken:          1293.7s
Notes:               Mini-batch ascent with self-reflection. Uses gpt-4o for rule generation.



---
## Fine-tuning Optimizers (Require Training Infrastructure)

The following optimizers require **actual model fine-tuning** (not just inference). They will NOT work with:
- LM Studio (inference only)
- Ollama (inference only)
- Standard OpenAI/Anthropic APIs

---

### Setup Instructions

#### Option A: BootstrapFinetune with SGLang (Local GPU)

**Installation:**
```bash
# SGLang for inference serving
pip install "sglang[all]>=0.4.4.post3" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python

# Fine-tuning dependencies
pip install torch transformers==4.48.3 accelerate trl peft
```

**Usage:**
```python
from dspy.clients.lm_local import LocalProvider

# Initialize local model with SGLang
student_lm = dspy.LM(
    model="openai/local:Qwen/Qwen2.5-1.5B-Instruct",
    provider=LocalProvider(),
    max_tokens=2000
)
student_lm.launch()  # Starts SGLang server

# Configure BootstrapFinetune
optimizer = dspy.BootstrapFinetune(
    metric=your_metric,
    train_kwargs={
        "num_train_epochs": 5,
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 8,
        "learning_rate": 1e-5,
        "bf16": True,
    }
)
```

---

#### Option B: GRPO with Arbor (Recommended for RL Fine-tuning)

GRPO uses [Arbor](https://github.com/Ziems/arbor), a framework for optimizing DSPy programs with reinforcement learning.

**Installation:**
```bash
pip install arbor-ai

# Optional: Flash Attention for performance (may take 15+ min to install)
pip install flash-attn --no-build-isolation
```

**Usage:**
```python
import arbor
from arbor import ArborProvider
from dspy.teleprompt import GRPO
from dspy.clients.utils_finetune import MultiGPUConfig

# Start Arbor server
arbor_server_info = arbor.init()

# Connect DSPy to Arbor
provider = ArborProvider()
lm = dspy.LM(
    model="openai/arbor:Qwen/Qwen2.5-1.5B-Instruct",
    provider=provider,
    api_base=arbor_server_info["base_url"],
    api_key="arbor",
    temperature=1.0,
    max_tokens=2048,
)

# Configure GRPO
gpu_config = MultiGPUConfig(num_inference_gpus=1, num_training_gpus=1)

optimizer = GRPO(
    metric=your_metric,
    num_dspy_examples_per_grpo_step=4,
    num_rollouts_per_grpo_step=8,
    exclude_demos=True,  # Required
    multitask=True,      # Required
    num_train_steps=100,
    gpu_config=gpu_config,
    train_kwargs={
        "temperature": 0.9,
        "beta": 0.04,
        "learning_rate": 1e-6,
        "gradient_checkpointing": True,
    }
)
```

---

#### Hardware Requirements

| Optimizer | Min GPUs | Recommended | VRAM per GPU |
|-----------|----------|-------------|--------------|
| BootstrapFinetune | 1 | 1-2 | 16GB+ |
| GRPO | 2 | 4+ | 24GB+ (H100 ideal) |
| BetterTogether | 2 | 4+ | 24GB+ |

---

#### References

- [DSPy RL Multi-Hop Tutorial](https://dspy.ai/tutorials/rl_multihop/)
- [DSPy RL PAPILLON Tutorial](https://dspy.ai/tutorials/rl_papillon/)
- [Arbor GitHub](https://github.com/Ziems/arbor)
- [DSPy Classification Fine-tuning Tutorial](https://dspy.ai/tutorials/classification_finetuning/)

---

**If you don't have this infrastructure, skip these cells.** The prompt-based optimizers above (LabeledFewShot through SIMBA) work with any LLM provider and are typically sufficient for most use cases.

In [10]:
pip install "sglang[all]>=0.4.4.post3" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python

Looking in links: https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
Collecting sglang>=0.4.4.post3 (from sglang[all]>=0.4.4.post3)
  Downloading sglang-0.5.6-py3-none-any.whl.metadata (25 kB)
Collecting anthropic>=0.20.0 (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  Using cached anthropic-0.75.0-py3-none-any.whl.metadata (28 kB)
Collecting blobfile==3.0.0 (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting build (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting compressed-tensors (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  Downloading compressed_tensors-0.12.2-py3-none-any.whl.metadata (7.0 kB)
Collecting cuda-python (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  Downloading cuda_python-13.0.3-py3-none-any.whl.metadata (4.7 kB)
Collecting decord2 (from sglang>=0.4.4.post3->sglang[all]>=0.4.4.post3)
  D

In [11]:
pip install torch transformers==4.48.3 accelerate trl peft

Collecting transformers==4.48.3
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.48.3)
  Using cached tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting datasets>=3.0.0 (from trl)
  Using cached datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
INFO: pip is looking at multiple versions of trl to determine which version is compatible with other requirements. This could take a while.
Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.23.1-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.22.2

In [17]:
# Probe the environment for what the fine-tuning optimizers need (GPU, Arbor,
# SGLang) and publish the outcome as module-level availability flags.
import subprocess
import shutil

LOCAL_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
# Both flags start off and are only switched on by the GPU probe below.
BOOTSTRAPFINETUNE_AVAILABLE = False
GRPO_AVAILABLE = False

# GPU probe: any CUDA device enables BootstrapFinetune, two or more enable GRPO.
try:
    import torch
    if not torch.cuda.is_available():
        print("No GPU available.")
    else:
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
        # total_memory is divided by 1e9 to report (decimal) gigabytes.
        vram = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU available: {gpu_name}")
        print(f"GPU count: {gpu_count}")
        print(f"VRAM per GPU: {vram:.1f} GB")
        BOOTSTRAPFINETUNE_AVAILABLE = True
        GRPO_AVAILABLE = gpu_count >= 2
except ImportError:
    print("PyTorch not installed.")

# Arbor probe: a missing package vetoes GRPO regardless of the GPU result.
try:
    import arbor
except ImportError:
    print("Arbor installed: No (run: pip install arbor-ai)")
    GRPO_AVAILABLE = False
else:
    print("Arbor installed: Yes")
    if not GRPO_AVAILABLE:
        print("  (but need 2+ GPUs for GRPO)")

# SGLang probe: a missing package vetoes BootstrapFinetune.
try:
    import sglang
except ImportError:
    print("SGLang installed: No (run: pip install 'sglang[all]')")
    BOOTSTRAPFINETUNE_AVAILABLE = False
else:
    print("SGLang installed: Yes")

print(f"\nLocal model: {LOCAL_MODEL}")
print(f"BootstrapFinetune available: {BOOTSTRAPFINETUNE_AVAILABLE}")
print(f"GRPO available: {GRPO_AVAILABLE}")

if not (BOOTSTRAPFINETUNE_AVAILABLE or GRPO_AVAILABLE):
    print("\nFine-tuning optimizers will be skipped. See setup instructions above.")

No GPU available.


  from .autonotebook import tqdm as notebook_tqdm
W1204 12:31:50.437000 73384 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Arbor installed: No (run: pip install arbor-ai)
SGLang installed: No (run: pip install 'sglang[all]')

Local model: Qwen/Qwen2.5-1.5B-Instruct
BootstrapFinetune available: False
GRPO available: False

Fine-tuning optimizers will be skipped. See setup instructions above.


---
## 8. GRPO (Group Relative Policy Optimization)

Fine-tunes the model using reinforcement learning with group relative rewards.

In [None]:
# GRPO benchmark cell. If GRPO_AVAILABLE is false, a SKIPPED placeholder row is
# recorded. Otherwise GRPO is run against an Arbor-served local model, and any
# exception raised along the way is turned into a FAILED placeholder row.

# Token counters are zero in every row this cell records (no API usage is tracked).
grpo_token_fields = {"total_tokens": 0, "prompt_tokens": 0, "completion_tokens": 0}

if not GRPO_AVAILABLE:
    print("Skipping GRPO - requires Arbor + 2 GPUs")
    result = BenchmarkResult(
        optimizer_name="GRPO",
        baseline_accuracy=baseline_accuracy,
        optimized_accuracy=0,
        accuracy_uplift=0,
        cost_usd=0,
        time_seconds=0,
        notes="SKIPPED: Requires Arbor (pip install arbor-ai) + 2 GPUs",
        **grpo_token_fields,
    )
else:
    import arbor
    from arbor import ArborProvider
    from dspy.teleprompt import GRPO
    from dspy.clients.utils_finetune import MultiGPUConfig

    print("Running GRPO optimizer...")

    start_time = time.time()
    try:
        # Start the Arbor server and point a DSPy LM at it.
        arbor_server_info = arbor.init()
        provider = ArborProvider()
        local_lm = dspy.LM(
            model=f"openai/arbor:{LOCAL_MODEL}",
            provider=provider,
            api_base=arbor_server_info["base_url"],
            api_key="arbor",
            temperature=1.0,
            max_tokens=2048,
        )

        # The student program runs on the Arbor-served LM.
        student = AIDetector()
        student.set_lm(local_lm)

        # One GPU for inference, one for training.
        gpu_config = MultiGPUConfig(num_inference_gpus=1, num_training_gpus=1)

        grpo_train_kwargs = {
            "temperature": 0.9,
            "beta": 0.04,
            "learning_rate": 1e-6,
            "gradient_checkpointing": True,
        }
        optimizer = GRPO(
            metric=exact_match,
            num_threads=4,
            num_train_steps=10,  # Reduced for benchmarking
            num_dspy_examples_per_grpo_step=2,
            num_rollouts_per_grpo_step=4,
            exclude_demos=True,  # Required
            multitask=True,      # Required
            gpu_config=gpu_config,
            train_kwargs=grpo_train_kwargs,
        )
        optimized_detector = optimizer.compile(student, trainset=trainset, valset=valset)

        # Score the tuned program on the validation set.
        optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

        elapsed = time.time() - start_time

        result = BenchmarkResult(
            optimizer_name="GRPO",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=optimized_accuracy,
            accuracy_uplift=optimized_accuracy - baseline_accuracy,
            cost_usd=0.0,  # Local model, no API cost
            time_seconds=elapsed,
            notes=f"RL fine-tuning with Arbor + {LOCAL_MODEL}. Local GPU compute.",
            **grpo_token_fields,
        )

        print_benchmark_result(result)

    except Exception as exc:
        # Keep the notebook running: log the failure and record a placeholder row.
        print(f"GRPO failed: {exc}")
        result = BenchmarkResult(
            optimizer_name="GRPO",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=0,
            accuracy_uplift=0,
            cost_usd=0,
            time_seconds=time.time() - start_time,
            notes=f"FAILED: {str(exc)[:100]}",
            **grpo_token_fields,
        )

# Exactly one row is recorded per run, whichever path was taken above.
all_results.append(result)

---
## 9. BootstrapFinetune

Bootstraps successful traces and uses them to fine-tune the model.

In [None]:
# BootstrapFinetune benchmark cell. If BOOTSTRAPFINETUNE_AVAILABLE is false, a
# SKIPPED placeholder row is recorded. Otherwise the local student model is
# fine-tuned with the teacher LM's help, and any exception raised along the way
# is turned into a FAILED placeholder row.

# Token counters are zero in every row this cell records (no API usage is tracked).
bf_token_fields = {"total_tokens": 0, "prompt_tokens": 0, "completion_tokens": 0}

if not BOOTSTRAPFINETUNE_AVAILABLE:
    print("Skipping BootstrapFinetune - requires SGLang + GPU")
    result = BenchmarkResult(
        optimizer_name="BootstrapFinetune",
        baseline_accuracy=baseline_accuracy,
        optimized_accuracy=0,
        accuracy_uplift=0,
        cost_usd=0,
        time_seconds=0,
        notes="SKIPPED: Requires SGLang (pip install 'sglang[all]') + GPU",
        **bf_token_fields,
    )
else:
    from dspy.teleprompt import BootstrapFinetune
    from dspy.clients.lm_local import LocalProvider

    print("Running BootstrapFinetune optimizer...")

    start_time = time.time()
    try:
        # Serve the local model through SGLang.
        provider = LocalProvider()
        local_lm = dspy.LM(
            model=f"openai/local:{LOCAL_MODEL}",
            provider=provider,
            max_tokens=2000,
        )
        local_lm.launch()  # Start SGLang server

        # Student runs on the local LM; teacher runs on the OpenAI teacher LM.
        student = AIDetector()
        student.set_lm(local_lm)

        teacher = AIDetector()
        teacher.set_lm(teacher_lm)

        finetune_kwargs = {
            "num_train_epochs": 3,
            "per_device_train_batch_size": 1,
            "gradient_accumulation_steps": 4,
            "learning_rate": 1e-5,
            "bf16": True,
        }
        optimizer = BootstrapFinetune(
            metric=exact_match,
            multitask=True,
            num_threads=4,
            train_kwargs=finetune_kwargs,
        )
        optimized_detector = optimizer.compile(student, teacher=teacher, trainset=trainset)

        # The fine-tuned model has to be launched before it can be evaluated.
        optimized_detector.get_lm().launch()

        # Score the tuned program on the validation set.
        optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

        elapsed = time.time() - start_time

        result = BenchmarkResult(
            optimizer_name="BootstrapFinetune",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=optimized_accuracy,
            accuracy_uplift=optimized_accuracy - baseline_accuracy,
            cost_usd=0.0,
            time_seconds=elapsed,
            notes=f"SGLang + {LOCAL_MODEL}. Teacher: gpt-5.",
            **bf_token_fields,
        )

        print_benchmark_result(result)

    except Exception as exc:
        # Keep the notebook running: log the failure and record a placeholder row.
        print(f"BootstrapFinetune failed: {exc}")
        result = BenchmarkResult(
            optimizer_name="BootstrapFinetune",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=0,
            accuracy_uplift=0,
            cost_usd=0,
            time_seconds=time.time() - start_time,
            notes=f"FAILED: {str(exc)[:100]}",
            **bf_token_fields,
        )

# Exactly one row is recorded per run, whichever path was taken above.
all_results.append(result)

---
## 10. BetterTogether

Combines prompt optimization and weight optimization in an alternating strategy.

In [None]:
# BetterTogether benchmark cell. It runs only when the SGLang + GPU stack was
# detected (BOOTSTRAPFINETUNE_AVAILABLE); otherwise a SKIPPED placeholder row is
# recorded. Any exception during the run is turned into a FAILED placeholder
# row so the rest of the notebook can continue.
if BOOTSTRAPFINETUNE_AVAILABLE:
    from dspy.teleprompt import BetterTogether
    from dspy.clients.lm_local import LocalProvider

    print("Running BetterTogether optimizer...")
    print("Note: BetterTogether is experimental and requires dspy.settings.experimental = True")

    start_time = time.time()
    try:
        # Initialize local model with SGLang
        provider = LocalProvider()
        local_lm = dspy.LM(
            model=f"openai/local:{LOCAL_MODEL}",
            provider=provider,
            max_tokens=2000,
        )
        local_lm.launch()

        # Create student with local LM
        student = AIDetector()
        student.set_lm(local_lm)

        # The message above says the experimental flag is required, but nothing
        # in this cell used to turn it on. Enable it for this run only, via a
        # scoped context, so global settings are left untouched.
        # NOTE(review): assumes dspy.context accepts the `experimental` key on
        # the installed DSPy version - confirm.
        with dspy.context(experimental=True):
            optimizer = BetterTogether(
                metric=exact_match,
                seed=42,
            )
            optimized_detector = optimizer.compile(
                student,
                trainset=trainset,
                strategy="p -> w",  # Prompt optimization then weight optimization
            )

        # Evaluate
        optimized_accuracy = evaluate_optimized(optimized_detector, valset, exact_match)

        # Wall-clock time covers server launch, optimization and evaluation.
        elapsed = time.time() - start_time

        result = BenchmarkResult(
            optimizer_name="BetterTogether",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=optimized_accuracy,
            accuracy_uplift=optimized_accuracy - baseline_accuracy,
            total_tokens=0,
            prompt_tokens=0,
            completion_tokens=0,
            cost_usd=0.0,
            time_seconds=elapsed,
            notes=f"SGLang + {LOCAL_MODEL}. Strategy: p -> w. Experimental.",
        )

        print_benchmark_result(result)
        all_results.append(result)

    except Exception as e:
        # Deliberate best-effort: log the failure and record a placeholder row.
        print(f"BetterTogether failed: {e}")
        result = BenchmarkResult(
            optimizer_name="BetterTogether",
            baseline_accuracy=baseline_accuracy,
            optimized_accuracy=0,
            accuracy_uplift=0,
            total_tokens=0,
            prompt_tokens=0,
            completion_tokens=0,
            cost_usd=0,
            time_seconds=time.time() - start_time,
            notes=f"FAILED: {str(e)[:100]}",
        )
        all_results.append(result)
else:
    print("Skipping BetterTogether - requires SGLang + GPU")
    result = BenchmarkResult(
        optimizer_name="BetterTogether",
        baseline_accuracy=baseline_accuracy,
        optimized_accuracy=0,
        accuracy_uplift=0,
        total_tokens=0,
        prompt_tokens=0,
        completion_tokens=0,
        cost_usd=0,
        time_seconds=0,
        notes="SKIPPED: Requires SGLang + GPU. Experimental.",
    )
    all_results.append(result)

---
## Summary: Optimizer Comparison

Final comparison of all optimizers across all metrics.

In [None]:
# Build the final comparison table: one display row per recorded
# BenchmarkResult, covering accuracy, token usage, cost and time.
summary_data = []
for r in all_results:
    # Skipped and failed runs are recorded with optimized_accuracy == 0, so a
    # non-positive score is shown as "N/A" rather than as a real measurement.
    has_score = r.optimized_accuracy > 0
    # Long notes are cut to 50 characters to keep the table readable.
    short_notes = r.notes[:50] + "..." if len(r.notes) > 50 else r.notes
    summary_data.append({
        "Optimizer": r.optimizer_name,
        "Baseline Acc": f"{r.baseline_accuracy*100:.1f}%",
        "Optimized Acc": f"{r.optimized_accuracy*100:.1f}%" if has_score else "N/A",
        "Uplift": f"{r.accuracy_uplift*100:+.1f}%" if has_score else "N/A",
        # Tokens and cost are headline metrics of this benchmark; rows with no
        # recorded usage (skipped, failed or local runs) show "N/A".
        "Total Tokens": f"{r.total_tokens:,}" if r.total_tokens > 0 else "N/A",
        "Cost (USD)": f"${r.cost_usd:.4f}" if r.cost_usd > 0 else "N/A",
        "Time (s)": f"{r.time_seconds:.1f}" if r.time_seconds > 0 else "N/A",
        "Notes": short_notes,
    })

summary_df = pd.DataFrame(summary_data)

print("\n" + "="*100)
print("OPTIMIZER BENCHMARK SUMMARY")
print("="*100)
print("\nBaseline Model: openai/gpt-5-mini")
print("Teacher/Reflection Model: openai/gpt-5")
print(f"Fine-tuning Model: {LOCAL_MODEL}")
print(f"\nTraining Examples: {len(trainset)}")
print(f"Validation Examples: {len(valset)}")
print("\n")

# Display with pandas
pd.set_option('display.max_colwidth', 60)
print(summary_df.to_string(index=False))

# Find best optimizer (only runs that actually produced a score compete)
valid_results = [r for r in all_results if r.optimized_accuracy > 0]
if valid_results:
    best_by_accuracy = max(valid_results, key=lambda r: r.accuracy_uplift)

    print("\n" + "-"*100)
    print(f"Best by Accuracy Uplift: {best_by_accuracy.optimizer_name} ({best_by_accuracy.accuracy_uplift*100:+.1f}%)")
    print("-"*100)

In [None]:
# Per-model token report: for every result that carries a usage_by_model
# mapping, print prompt, completion and total token counts for each model.
rule = "=" * 80
print("\n" + rule)
print("DETAILED TOKEN USAGE BY MODEL")
print(rule)

for res in all_results:
    # Results without per-model usage are left out of the report.
    if not res.usage_by_model:
        continue
    print(f"\n{res.optimizer_name}:")
    for model_name, counts in res.usage_by_model.items():
        n_prompt = counts.get('prompt_tokens', 0)
        n_completion = counts.get('completion_tokens', 0)
        # Fall back to prompt + completion when no total is recorded.
        n_total = counts.get('total_tokens', n_prompt + n_completion)
        print(f"  {model_name}:")
        print(f"    Prompt tokens:     {n_prompt:,}")
        print(f"    Completion tokens: {n_completion:,}")
        print(f"    Total tokens:      {n_total:,}")

In [None]:
# Export every benchmark result as one row of a CSV file.
# Each CSV column name maps to the BenchmarkResult attribute that fills it;
# the insertion order here is the column order in the file.
csv_columns = {
    "optimizer": "optimizer_name",
    "baseline_accuracy": "baseline_accuracy",
    "optimized_accuracy": "optimized_accuracy",
    "accuracy_uplift": "accuracy_uplift",
    "total_tokens": "total_tokens",
    "prompt_tokens": "prompt_tokens",
    "completion_tokens": "completion_tokens",
    "cost_usd": "cost_usd",
    "time_seconds": "time_seconds",
    "notes": "notes",
}

csv_rows = [
    {column: getattr(r, attr) for column, attr in csv_columns.items()}
    for r in all_results
]
results_df = pd.DataFrame(csv_rows)

# index=False keeps the DataFrame's row index out of the file.
results_df.to_csv("optimizer_benchmark_results.csv", index=False)
print("Results saved to optimizer_benchmark_results.csv")