# Optimizing Language Models with DSPy GEPA: From 42% to 64% Accuracy

This notebook demonstrates how to use DSPy's GEPA (Genetic-Pareto) optimizer, a reflective prompt optimizer, to improve language model performance on mathematical reasoning tasks. We'll work with the NuminaMath-1.5 dataset and show how GEPA can boost accuracy from 42% to 64% through automated prompt optimization.

**What you'll learn:**
- Setting up DSPy with local (Ollama) or cloud (OpenRouter) language models
- Processing and filtering mathematical problem datasets
- Building a baseline Chain-of-Thought reasoning program
- Optimizing prompts with GEPA using error-driven feedback
- Evaluating improvements in model accuracy

**Key Results:**
- Baseline accuracy: 42.3% (569/1344 correct)
- Optimized accuracy: 64.0% (860/1344 correct)
- **+21.7 percentage points** (roughly a 51% relative improvement) through automated prompt engineering

GEPA works by analyzing errors, generating targeted feedback, and automatically refining prompts to address common failure patterns. This makes it particularly effective for complex reasoning tasks where prompt quality significantly impacts performance.

In [None]:
import dspy
from datasets import load_dataset
import os

In [None]:
# Configure a local Ollama language model for DSPy.
# Prerequisites:
# 1. Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# 2. Run the model configured below: ollama run gemma3:4b
#    (change OLLAMA_MODEL to use a different model)

import dspy

# Single source of truth for the model tag, so the LM configuration and the
# reminder printed at the end of this cell cannot drift apart.
OLLAMA_MODEL = 'gemma3:4b'

# Configure Ollama LM using DSPy's official format
ollama_llm = dspy.LM(
    model=f'ollama_chat/{OLLAMA_MODEL}',  # Format: ollama_chat/{model_name}
    api_base='http://localhost:11434',  # Ollama default endpoint
    api_key='',  # Empty string for local Ollama
    max_tokens=65536,
    temperature=1.0
)

# Set as default LM
dspy.configure(lm=ollama_llm)

print("✅ Ollama LM configured successfully!")
print(f"Model: {ollama_llm.model}")
print(f"🔄 Make sure Ollama is running: ollama run {OLLAMA_MODEL}")

In [None]:
# Connection and sampling settings shared by both OpenRouter-hosted models.
openrouter_settings = dict(
    api_key=os.getenv('openrouter_api_key'),
    api_base='https://openrouter.ai/api/v1',
    max_tokens=65536,
    temperature=1.0,
)

# Task model: becomes the DSPy default LM (this replaces any LM configured
# by an earlier dspy.configure call).
open_router_lm = dspy.LM('openrouter/openai/gpt-4.1-nano', **openrouter_settings)
dspy.configure(lm=open_router_lm)

# Reflection model: handed to the GEPA optimizer as `reflection_lm` further down.
reflection_lm = dspy.LM('openrouter/meta-llama/llama-4-scout', **openrouter_settings)

In [None]:
# Load the 'train' split of the NuminaMath-1.5 dataset for the exploratory cells below.
train_split = load_dataset("AI-MO/NuminaMath-1.5")['train']

In [None]:
def is_numeric_answer(answer):
    """Return True when ``answer`` can be converted with ``int()``.

    Despite the name, only integer-convertible values pass: decimal strings
    such as "3.5" and symbolic answers are rejected. This matches the
    metrics, which parse the reference answer with ``int()``.

    Args:
        answer: The raw answer value of a dataset sample (typically a string).

    Returns:
        bool: True if ``int(answer)`` succeeds, False otherwise. The function
        never raises for unparsable input.
    """
    try:
        int(answer)
    except (ValueError, TypeError, OverflowError):
        # ValueError: text that is not an integer literal (or a NaN float).
        # TypeError: None or another object int() cannot convert.
        # OverflowError: an infinite float.
        return False
    return True

In [None]:
# Keep only the samples whose 'answer' parses as an integer. is_numeric_answer
# relies on int(), so decimal strings such as "3.5" are filtered out as well.
train_split = train_split.filter(lambda x: is_numeric_answer(x['answer']))

In [None]:
# Spot-check the 'answer' field of the sample at index 12 after filtering.
print(train_split[12]['answer'])

In [None]:
def init_dataset(train_split_ratio=None, test_split_ratio=None, val_split_ratio=None, sample_fraction=1.0):
    """Load NuminaMath-1.5, keep integer-answer problems, and split them.

    The 'train' split of the dataset is filtered with ``is_numeric_answer``,
    converted to ``dspy.Example`` objects whose only input field is
    ``problem``, shuffled with a fixed seed (0), optionally truncated, and
    then cut into three consecutive slices.

    Args:
        train_split_ratio: Fraction used for training (default 0.5).
        test_split_ratio: Fraction used for testing (default 0.45).
        val_split_ratio: Fraction used for validation (default 0.05).
        sample_fraction: When below 1.0, only this fraction of the shuffled
            examples is kept before splitting; values of 1.0 or more keep
            everything.

    Returns:
        tuple: ``(train_set, val_set, test_set)`` lists of ``dspy.Example``.
        Note the return order differs from the parameter order.

    Raises:
        ValueError: If the three ratios do not sum to 1.0 (within floating
            point tolerance).
    """
    import math
    import random

    if train_split_ratio is None:
        train_split_ratio = 0.5
    if test_split_ratio is None:
        test_split_ratio = 0.45
    if val_split_ratio is None:
        val_split_ratio = 0.05

    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999. An explicit
    # raise (rather than assert) keeps the check active under `python -O`.
    ratio_sum = train_split_ratio + test_split_ratio + val_split_ratio
    if not math.isclose(ratio_sum, 1.0, rel_tol=1e-9, abs_tol=1e-9):
        raise ValueError(f"Ratios must sum to 1.0 (got {ratio_sum})")

    dataset = load_dataset("AI-MO/NuminaMath-1.5")['train']
    # Keep only the samples whose 'answer' is parsable as an integer.
    dataset = dataset.filter(lambda x: is_numeric_answer(x['answer']))
    examples = [
        dspy.Example({
            "problem": x['problem'],
            'solution': x['solution'],
            'answer': x['answer'],
        }).with_inputs("problem")
        for x in dataset
    ]
    # Fixed seed so the same examples land in the same split on every run.
    random.Random(0).shuffle(examples)
    tot_num = len(examples)
    print(f"Total number of examples after filtering: {tot_num}")

    if sample_fraction < 1.0:
        sample_num = int(tot_num * sample_fraction)
        examples = examples[:sample_num]
        tot_num = sample_num
        print(f"Sampled down to {sample_num} examples.")

    # Consecutive slices of the shuffled list: train, then validation, then test.
    train_end = int(train_split_ratio * tot_num)
    val_end = int((train_split_ratio + val_split_ratio) * tot_num)
    train_set = examples[:train_end]
    val_set = examples[train_end:val_end]
    test_set = examples[val_end:]

    return train_set, val_set, test_set

In [None]:
# Build the three splits from a 1% sample of the filtered dataset.
train_set, val_set, test_set = init_dataset(sample_fraction=0.01)

# Notebook display: sizes of the train / validation / test splits.
len(train_set), len(val_set), len(test_set)

In [None]:
# Show the first training example: problem text, reference solution, answer.
first_train_example = train_set[0]
print("Problem:")
print(first_train_example['problem'])
for heading, field in (("Solution", 'solution'), ("Answer", 'answer')):
    print(f"\n\n{heading}:")
    print(first_train_example[field])

In [None]:
# Show the first test example: problem text followed by its reference answer.
first_test_example = test_set[0]
print(first_test_example['problem'])
print("\n\nAnswer:")
print(first_test_example['answer'])

In [None]:
# Signature for the math task: one input field (`problem`) and one output
# field (`answer`).
# NOTE: per DSPy's signature documentation, the class docstring is used as the
# task instruction sent to the LM, so treat it as runtime text: editing it
# changes the prompt.
class GenerateResponse(dspy.Signature):
    """Solve the problem and provide the answer in the correct format."""
    problem = dspy.InputField()  # problem statement; matches with_inputs("problem") on the examples
    answer = dspy.OutputField()  # final answer; the metrics parse it with int()

# Baseline program: a Chain-of-Thought module built on the signature above.
program = dspy.ChainOfThought(GenerateResponse)

In [None]:
def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Score a prediction: 1 if its answer equals the reference integer, else 0.

    Args:
        example: Mapping-like sample whose ``'answer'`` is int-parsable.
        prediction: Object exposing the model's output as ``.answer``.
        trace, pred_name, pred_trace: Accepted for compatibility with the
            metric call signature; unused here.

    Returns:
        int: 1 when ``int(prediction.answer)`` equals ``int(example['answer'])``,
        0 when it differs or cannot be parsed as an integer.
    """
    correct_answer = int(example['answer'])
    try:
        llm_answer = int(prediction.answer)
    except (ValueError, TypeError):
        # ValueError: the answer contains non-integer text.
        # TypeError: the answer is missing (None) or not convertible at all;
        # score it as wrong instead of aborting the evaluation.
        return 0
    return int(correct_answer == llm_answer)

In [None]:
import dspy

# Evaluation harness over the held-out test split, scored with the plain 0/1 metric.
evaluation_options = {
    "devset": test_set,
    "metric": metric,
    "num_threads": 32,
    "display_table": True,
    "display_progress": True,
}
evaluate = dspy.Evaluate(**evaluation_options)

# Baseline run: evaluate the unoptimized program.
evaluate(program)

In [None]:
# Sanity check: run the baseline program on one test example and print what
# comes back, so call or parsing problems surface before a full evaluation.
print("=== STEP 1: Testing program on single example ===")
test_example = test_set[0]
print(f"Input problem: {test_example.problem[:100]}...")
print(f"Expected answer: {test_example.answer}")

try:
    # Call with a keyword argument named after the signature's input field.
    prediction = program(problem=test_example.problem)
    print(f"Program prediction: {prediction}")
    print(f"Prediction answer: {prediction.answer}")
    print(f"Prediction type: {type(prediction.answer)}")
    print("✅ Program works!")
except Exception as e:
    # Deliberately broad: this cell is a diagnostic, so report any failure
    # together with its full traceback instead of stopping the notebook.
    print(f"❌ Program failed: {e}")
    import traceback
    traceback.print_exc()

In [None]:
def metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Score a prediction and attach textual feedback for the optimizer.

    Args:
        example: Sample providing an int-parsable ``'answer'`` and, optionally,
            a worked ``'solution'`` (read via ``example.get``).
        prediction: Object exposing the model's output as ``.answer``.
        trace, pred_name, pred_trace: Accepted for compatibility with the
            metric call signature; unused here.

    Returns:
        dspy.Prediction: ``score`` is 1 for a matching integer answer and 0
        otherwise; ``feedback`` states whether the answer was correct, gives
        the correct answer, and appends the reference solution when present.
    """
    correct_answer = int(example['answer'])
    written_solution = example.get('solution', '')
    try:
        llm_answer = int(prediction.answer)
    except (ValueError, TypeError):
        # ValueError: non-integer text. TypeError: missing (None) or otherwise
        # non-convertible answer. Both are reported as a formatting failure
        # rather than raised out of the metric.
        feedback_text = f"The final answer must be a valid integer and nothing else. You responded with '{prediction.answer}', which couldn't be parsed as a python integer. Please ensure your answer is a valid integer without any additional text or formatting."
        feedback_text += f" The correct answer is '{correct_answer}'."
        if written_solution:
            feedback_text += f" Here's the full step-by-step solution:\n{written_solution}\n\nThink about what takeaways you can learn from this solution to improve your future answers and approach to similar problems and ensure your final answer is a valid integer."
        return dspy.Prediction(score=0, feedback=feedback_text)

    score = int(correct_answer == llm_answer)

    if score == 1:
        feedback_text = f"Your answer is correct. The correct answer is '{correct_answer}'."
    else:
        feedback_text = f"Your answer is incorrect. The correct answer is '{correct_answer}'."

    # Include the reference solution in both cases when one is available.
    if written_solution:
        feedback_text += f" Here's the full step-by-step solution:\n{written_solution}\n\nThink about what takeaways you can learn from this solution to improve your future answers and approach to similar problems."

    return dspy.Prediction(score=score, feedback=feedback_text)

In [None]:
from dspy import GEPA

# GEPA settings: scored by the feedback-producing metric, with the separate
# reflection LM configured earlier in the notebook.
gepa_options = {
    "metric": metric_with_feedback,
    "auto": "heavy",
    "num_threads": 32,
    "track_stats": True,
    "reflection_minibatch_size": 16,
    "track_best_outputs": True,
    "add_format_failure_as_feedback": True,
    "reflection_lm": reflection_lm,
}
optimizer = GEPA(**gepa_options)


In [None]:
# Run the GEPA optimizer on the baseline program, passing the training split
# as `trainset` and the validation split as `valset`.
optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

In [None]:
# Inspect the instruction text attached to the optimized program's predictor signature.
print(optimized_program.predict.signature.instructions)

In [None]:
# Re-run the same evaluator on the optimized program for comparison with the baseline run.
evaluate(optimized_program)