# DSPy in 8 Steps

In [2]:
# Install DSPy, pandas, mlflow, and python-dotenv
!uv pip install dspy pandas python-dotenv mlflow -q

In [3]:
# Import necessary libraries
import dspy
import mlflow
import os
from dotenv import load_dotenv

# Load environment variables from .env file (contains API keys - see env.example)
load_dotenv()

True

In [4]:
# Build the default language-model client; the API key is read from the
# OPENAI_API_KEY environment variable.
lm = dspy.LM(
    "openai/gpt-5-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=1,
    max_tokens=32000,
)

# Make this client the globally configured LM for DSPy
dspy.configure(lm=lm)

**(recommended) Store traces in a SQL database when using MLflow tracing**

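Run the following in your terminal before executing the next cell, and keep it running (the next cell points MLflow at `http://127.0.0.1:5000`):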

`mlflow server --backend-store-uri sqlite:///mydb.sqlite`

In [None]:
# Address of the MLflow tracking server this notebook logs to.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Experiment under which this notebook's runs and traces are grouped.
MLFLOW_EXPERIMENT_NAME = "DSPy in 8 Steps"
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Enable autologging so DSPy traces are captured automatically.
mlflow.autolog()

## 1. Specify your Signatures

In [5]:
# Signature for the humanizer: one input field (`ai_text`) and one output
# field (`human_text`), both typed as `str`.
# NOTE(review): DSPy is documented to use a Signature's docstring as the task
# instruction sent to the model, so the docstring below is prompt text rather
# than ordinary documentation - confirm before rewording it.
class HumanizeAIText(dspy.Signature):
    """
    Transforms AI-generated text into natural, human-like writing.
    """
    ai_text: str = dspy.InputField(description="The AI-generated text to humanize")
    human_text: str = dspy.OutputField(description="The humanized text")

# Signature for the detector: one `str` input field (`text`) and one `bool`
# output field (`is_ai`).
# NOTE(review): DSPy is documented to use a Signature's docstring as the task
# instruction sent to the model, so the docstring below is prompt text rather
# than ordinary documentation - confirm before rewording it.
class DetectAIText(dspy.Signature):
    """
    Detects whether text is AI-generated.
    """
    text: str = dspy.InputField(description="The text to analyze")
    is_ai: bool = dspy.OutputField(description="Whether the text is AI-generated")

## 2. Build your Modules

In [6]:
class TextTransformer(dspy.Module):
    """Module that rewrites text through a single `HumanizeAIText` predictor."""

    def __init__(self):
        super().__init__()
        # Keep the attribute name: GEPA's log refers to this predictor as `humanize`.
        self.humanize = dspy.Predict(HumanizeAIText)

    def forward(self, text: str):
        """Pass `text` to the predictor as `ai_text` and return its prediction."""
        prediction = self.humanize(ai_text=text)
        return prediction


# Module-level instance reused by later cells
transformer = TextTransformer()

class AIDetector(dspy.Module):
    """Module that classifies text through a `DetectAIText` chain-of-thought predictor."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor; GEPA's log refers to it as `detect.predict`.
        self.detect = dspy.ChainOfThought(DetectAIText)

    def forward(self, text: str):
        """Run the detector on `text` and return its prediction."""
        verdict = self.detect(text=text)
        return verdict


# Module-level instance reused by later cells (including the `llm_judge` metric)
detector = AIDetector()

## 3. Explore a few Examples

In [8]:
# Sample AI-generated texts that show typical tells
texts = [
    "The city's architecture reflects a rich tapestry of influences—from classical design to modern minimalism.",
    "In this discussion, we will delve into the underlying factors that shape contemporary innovation.",
    "You're absolutely right. From now on, I'll avoid such phrasing. Thank you for your strictness — it makes me better, clearer",
]

# Separator lines reused for every example
heavy_rule = "=" * 60
light_rule = "-" * 60

# Humanize each sample, then ask the detector to judge the rewritten text
for i, text in enumerate(texts, start=1):
    print(f"\n{heavy_rule}")
    print(f"Example {i}:")
    print(heavy_rule)
    print(f"Text:\n{text}")
    print(light_rule)
    response = transformer(text=text)
    print(f"Humanized:\n{response.human_text}")
    judgement = detector(text=response.human_text)
    print(f"Is AI?: {judgement.is_ai}")
    print(f"{heavy_rule}\n")

# Show the most recent LM call
lm.inspect_history(n=1)


Example 1:
Text:
The city's architecture reflects a rich tapestry of influences—from classical design to modern minimalism.
------------------------------------------------------------
Humanized:
The city's architecture is a rich mix of influences, with classical styles sitting alongside sleek modern minimalism.
Is AI?: False


Example 2:
Text:
In this discussion, we will delve into the underlying factors that shape contemporary innovation.
------------------------------------------------------------
Humanized:
Let's explore the key forces shaping today's innovation.
Is AI?: False


Example 3:
Text:
You're absolutely right. From now on, I'll avoid such phrasing. Thank you for your strictness — it makes me better, clearer
------------------------------------------------------------
Humanized:
You're absolutely right. I'll avoid that phrasing from now on. Thanks for being strict — it helps me improve and be clearer.
Is AI?: True





[34m[2025-11-03T22:33:52.262740][0m

[31mSystem me

## 4. Collect your Dataset

In [None]:
# Read the labelled examples from CSV and wrap them as DSPy Examples
import pandas as pd
import random

csv_path = 'ai_vs_human.csv'

df = pd.read_csv(csv_path)
examples = df.to_dict(orient='records')

# One Example per CSV row; only `text` is marked as an input field
dataset = [dspy.Example(**row).with_inputs("text") for row in examples]

# Fixed seed so the shuffle, and therefore the split, is reproducible
random.seed(42)
random.shuffle(dataset)

# First half becomes the training set, second half the validation set
midpoint = len(dataset) // 2
trainset = dataset[:midpoint]
valset = dataset[midpoint:]

print("Loaded:", csv_path)
print("Training set:", len(trainset))
print("Validation set:", len(valset))


Loaded: ai_vs_human.csv
Training set: 10
Validation set: 10


## 5. Define your Metrics

In [None]:
# Exact-match metric used to score the detector
def exact_match(example, response, trace=None, pred_name=None, pred_trace=None):
    """Score 1 when the predicted `is_ai` equals the labelled `is_ai`, else 0.

    When `pred_name` is truthy the score is wrapped in a `dspy.Prediction`
    together with `example.notes` as feedback; otherwise the bare score is
    returned. `trace` and `pred_trace` are accepted but not used.
    """
    labels_agree = example.is_ai == response.is_ai
    score = 1 if labels_agree else 0

    # Plain evaluation path: no feedback requested
    if not pred_name:
        return score

    return dspy.Prediction(score=score, feedback=example.notes)

# An LLM-as-a-judge metric for testing the transformer
# Reward is 1 when the judge thinks the output is human (i.e., not AI)

feedback_string = """{ai_detected}!!!
Reasoning: {judgement.reasoning}.
---
Similar Example ({ai_or_human}):  
{example.text}
Notes: {example.notes}"""

def llm_judge(example, response, trace=None, pred_name=None, pred_trace=None):
    """Score a humanized response by asking the module-level `detector` to judge it.

    The score is `not judgement.is_ai`, so a rewrite judged as human scores
    True. When `pred_name` is truthy, the score is returned inside a
    `dspy.Prediction` with feedback rendered from `feedback_string`;
    otherwise the bare score is returned. `trace` and `pred_trace` are
    accepted but not used.
    """
    # Ask the detector about the rewritten text, not the original input
    judgement = detector(text=response.human_text)

    # "Tricking the judge" is the goal, so the verdict is inverted
    score = not judgement.is_ai

    # Plain evaluation path: no feedback requested
    if not pred_name:
        return score

    if judgement.is_ai:
        ai_detected = "AI Detected"
    else:
        ai_detected = "Judged as Human"

    if example.is_ai:
        ai_or_human = "AI generated"
    else:
        ai_or_human = "Human written"

    feedback = feedback_string.format(
        ai_detected=ai_detected,
        judgement=judgement,
        example=example,
        ai_or_human=ai_or_human,
    )
    return dspy.Prediction(score=score, feedback=feedback)


# Smoke-test the metric on the first example of the shuffled dataset
example = dataset[0]
response = transformer(text=example.text)
# A truthy pred_name makes llm_judge return score plus feedback
result = llm_judge(example=example, response=response, pred_name="detect")

heavy_rule = "=" * 60
light_rule = "-" * 60

print(f"\n{heavy_rule}")
print(f"Example:\n{example.text}")
print(light_rule)
print(f"Humanized:\n{response.human_text}")
print(f"{heavy_rule}\n")
print(f"Score: {result.score}")
print(f"{result.feedback}")


Example:
Basically, working together is still the thing that makes real progress happen.
------------------------------------------------------------
Humanized:
At the end of the day, it's collaboration that truly drives meaningful progress.

Score: False
AI Detected!!!
Reasoning: The sentence "At the end of the day, it's collaboration that truly drives meaningful progress." is a common, generic statement that could easily be produced by either a human or an AI. It lacks specific personal details, unique stylistic elements, or complex structure that might strongly indicate human authorship. However, it also does not contain any typical AI-generated text markers such as overly formal phrasing, unnatural word choices, or repetitive patterns. Given the simplicity and general nature of the sentence, it is difficult to definitively classify it as AI-generated or human-written based solely on this short excerpt. Nonetheless, such generic motivational or philosophical statements are often ge

## 6. Establish a Baseline

In [None]:
# Baseline: score the unoptimized detector with exact_match over the full
# dataset (training and validation halves together)
evaluate_judge = dspy.Evaluate(
    metric=exact_match,
    devset=dataset,
    display_progress=True,
    display_table=True,
    num_threads=4,
)

evaluate_judge(detector)

Average Metric: 17.00 / 20 (85.0%): 100%|██████████| 20/20 [01:44<00:00,  5.20s/it]

2025/10/28 17:48:25 INFO dspy.evaluate.evaluate: Average Metric: 17 / 20 (85.0%)





Unnamed: 0,text,example_is_ai,notes,reasoning,pred_is_ai,exact_match
0,"Basically, working together is still the thing that makes real pro...",False,Paraphrased summary in simple language; conversational 'basically'...,"The sentence is simple, clear, and somewhat generic. It does not c...",False,✔️ [1]
1,"Yeah, fair point. I’ll stop wording it that way. Appreciate you ca...",False,Casual acknowledgement with self-reflection; colloquial verbs and ...,"The text is a casual, conversational response that includes inform...",False,✔️ [1]
2,"That’s an excellent point, and it highlights the need for further ...",True,Polite praise formula and 'further research' cliché; generic scaff...,"The sentence is generic, polite, and somewhat formal, which is typ...",True,✔️ [1]
3,"You're absolutely right. From now on, I'll avoid such phrasing. Th...",True,"Overly deferential, templated apology; polished cadence; em dash a...","The text is concise, polite, and contextually appropriate, resembl...",False,
4,"Also, it just shows why ethics still matter when you’re building n...",False,"Uses simple connective ('Also'); concrete, reader-facing phrasing;...","The text is a short, informal sentence that discusses the importan...",False,✔️ [1]
5,"Money and social stuff always mix in weird ways, and that’s what e...",False,Concrete nouns and informal word choice; admits uncertainty; conve...,"The text is a casual, conversational statement with a somewhat inf...",False,✔️ [1]
6,"Good point, it’s definitely something people should look into more.",False,"Short, natural affirmation; everyday phrasing; concrete suggestion...","The text is a short, casual, and somewhat generic statement that c...",False,✔️ [1]
7,"In conclusion, collaboration remains a pivotal factor in driving s...",True,Template opener ('In conclusion'); 'pivotal' corporate cadence; ab...,"The sentence ""In conclusion, collaboration remains a pivotal facto...",True,✔️ [1]
8,This framework provides a holistic approach to optimizing performa...,True,"Jargon stack ('framework', 'holistic', 'optimizing'); vague abstra...","The sentence ""This framework provides a holistic approach to optim...",True,✔️ [1]
9,The nuanced interplay of social and economic factors has far-reach...,True,"Abstract academic phrasing ('nuanced interplay', 'far-reaching imp...","The sentence is well-structured, formal, and uses sophisticated vo...",True,✔️ [1]


EvaluationResult(score=85.0, results=<list of 20 results>)

In [None]:
# Baseline: score the unoptimized humanizer with the llm_judge metric over
# the full dataset (training and validation halves together)
evaluate_task = dspy.Evaluate(
    metric=llm_judge,
    devset=dataset,
    display_progress=True,
    display_table=True,
    num_threads=4,
)

evaluate_task(transformer)

Average Metric: 12.00 / 20 (60.0%): 100%|██████████| 20/20 [00:00<00:00, 182.60it/s]

2025/10/28 17:45:09 INFO dspy.evaluate.evaluate: Average Metric: 12 / 20 (60.0%)





Unnamed: 0,text,is_ai,notes,human_text,llm_judge
0,"Basically, working together is still the thing that makes real pro...",False,Paraphrased summary in simple language; conversational 'basically'...,"At the end of the day, it's collaboration that truly drives meanin...",
1,"Yeah, fair point. I’ll stop wording it that way. Appreciate you ca...",False,Casual acknowledgement with self-reflection; colloquial verbs and ...,"You're right, I see what you mean. I'll stop phrasing it like that...",✔️ [True]
2,"That’s an excellent point, and it highlights the need for further ...",True,Polite praise formula and 'further research' cliché; generic scaff...,"You’ve made a great point, and it really shows why we need to dig ...",
3,"You're absolutely right. From now on, I'll avoid such phrasing. Th...",True,"Overly deferential, templated apology; polished cadence; em dash a...",You're absolutely right. I'll make sure to avoid that kind of phra...,✔️ [True]
4,"Also, it just shows why ethics still matter when you’re building n...",False,"Uses simple connective ('Also'); concrete, reader-facing phrasing;...",This really highlights why ethics remain so important when develop...,✔️ [True]
5,"Money and social stuff always mix in weird ways, and that’s what e...",False,Concrete nouns and informal word choice; admits uncertainty; conve...,"Money and social issues are often intertwined in complex ways, and...",
6,"Good point, it’s definitely something people should look into more.",False,"Short, natural affirmation; everyday phrasing; concrete suggestion...",That's a great point—it's definitely an area worth exploring further.,
7,"In conclusion, collaboration remains a pivotal factor in driving s...",True,Template opener ('In conclusion'); 'pivotal' corporate cadence; ab...,"To sum up, working together is key to making lasting progress.",✔️ [True]
8,This framework provides a holistic approach to optimizing performa...,True,"Jargon stack ('framework', 'holistic', 'optimizing'); vague abstra...",This framework offers a comprehensive way to enhance performance i...,
9,The nuanced interplay of social and economic factors has far-reach...,True,"Abstract academic phrasing ('nuanced interplay', 'far-reaching imp...",The complex relationship between social and economic factors great...,✔️ [True]


EvaluationResult(score=60.0, results=<list of 20 results>)

## 7. Optimize your Program

In [None]:
# More capable model, handed to GEPA as its reflection LM
smart_lm = dspy.LM(
    model='openai/gpt-5',
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=1,
    max_tokens=32000,
)

# GEPA optimizer for the judge (AIDetector), scored with exact_match
judge_optimizer = dspy.GEPA(
    metric=exact_match,
    reflection_lm=smart_lm,
    max_full_evals=3,
    num_threads=4,
    track_stats=True,
    use_merge=False,
)

# Optimize a fresh AIDetector on the training half; the validation half is
# passed as valset
optimized_judge = judge_optimizer.compile(
    AIDetector(),
    trainset=trainset,
    valset=valset,
)



2025/10/28 18:06:19 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 60 metric calls of the program. This amounts to 3.00 full evals on the train+val set.
2025/10/28 18:06:19 INFO dspy.teleprompt.gepa.gepa: Using 10 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/60 [00:00<?, ?rollouts/s]2025/10/28 18:06:20 INFO dspy.evaluate.evaluate: Average Metric: 8 / 10 (80.0%)
2025/10/28 18:06:20 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.8
2025/10/28 18:06:20 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.8


Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 573.38it/s]

2025/10/28 18:06:20 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:06:20 INFO dspy.teleprompt.gepa.gepa: Iteration 1: All subsample scores perfect. Skipping.
2025/10/28 18:06:20 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
2025/10/28 18:06:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.8



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:00<00:00, 378.73it/s]

2025/10/28 18:06:20 INFO dspy.evaluate.evaluate: Average Metric: 2 / 3 (66.7%)





2025/10/28 18:06:57 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for detect.predict: Task: Given an input with a single field `text`, decide whether the text is likely AI-generated and explain your decision briefly.

Input format:
- text: a single string of prose to evaluate.

Output format:
- reasoning: 1–3 concise sentences citing concrete cues from the text that informed your decision. Quote or paraphrase short spans when helpful. Acknowledge uncertainty when the cues are weak.
- is_ai: True or False (Boolean literal, capitalized), reflecting your best-leaning judgment.

How to decide (use stylistic and pragmatic cues, not topic knowledge):
- Signals that lean AI (use when multiple appear together):
  - Polite praise formulas and templated acknowledgments (e.g., “That’s an excellent point,” “You’re absolutely right.”).
  - Generic scaffolding that avoids specifics (e.g., “it highlights the need for further research,” “moving forward,” “From now on, I’ll avoid such 

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [02:02<00:00, 40.92s/it]

2025/10/28 18:09:37 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:37 INFO dspy.teleprompt.gepa.gepa: Iteration 3: All subsample scores perfect. Skipping.
2025/10/28 18:09:37 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Reflective mutation did not propose a new candidate
GEPA Optimization:  53%|█████▎    | 32/60 [03:17<03:30,  7.52s/rollouts]2025/10/28 18:09:37 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:02<00:00,  1.23it/s]

2025/10/28 18:09:40 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 4: All subsample scores perfect. Skipping.
2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Reflective mutation did not propose a new candidate
GEPA Optimization:  58%|█████▊    | 35/60 [03:20<02:41,  6.48s/rollouts]2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 574.96it/s] 

2025/10/28 18:09:40 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 5: All subsample scores perfect. Skipping.
2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Reflective mutation did not propose a new candidate
2025/10/28 18:09:40 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:02<00:00,  1.26it/s]  

2025/10/28 18:09:42 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:42 INFO dspy.teleprompt.gepa.gepa: Iteration 6: All subsample scores perfect. Skipping.
2025/10/28 18:09:42 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Reflective mutation did not propose a new candidate
GEPA Optimization:  68%|██████▊   | 41/60 [03:22<01:27,  4.61s/rollouts]2025/10/28 18:09:42 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:02<00:00,  1.19it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 7: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Reflective mutation did not propose a new candidate
GEPA Optimization:  73%|███████▎  | 44/60 [03:25<01:02,  3.93s/rollouts]2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 2804.30it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 8: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Reflective mutation did not propose a new candidate
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 4410.41it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 9: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Reflective mutation did not propose a new candidate
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 4537.65it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 10: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Reflective mutation did not propose a new candidate
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 4830.29it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 11: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Reflective mutation did not propose a new candidate
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 5405.03it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 12: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Reflective mutation did not propose a new candidate
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Selected program 1 score: 1.0



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:00<00:00, 5545.58it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 3 / 3 (100.0%)
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 13: All subsample scores perfect. Skipping.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 59/60 [03:25<00:03,  3.48s/rollouts]







In [None]:
# Directory the optimized judge is written to and read back from
judge_dir = "./ai_detector/"

# Persist the program state together with its architecture
optimized_judge.save(judge_dir, save_program=True)

# Reload the saved judge from the same directory
loaded_optimized_judge = dspy.load(judge_dir)

In [47]:

# Score the optimized judge with the same metric and data as the baseline.
# NOTE(review): `dataset` contains the trainset/valset that GEPA optimized
# on, so this score is not a held-out estimate.
evaluate_judge = dspy.Evaluate(
    metric=exact_match,
    devset=dataset,
    display_progress=True,
    display_table=True,
    num_threads=4,
)

evaluate_judge(optimized_judge)

Average Metric: 20.00 / 20 (100.0%): 100%|██████████| 20/20 [00:00<00:00, 951.66it/s]

2025/10/28 18:09:45 INFO dspy.evaluate.evaluate: Average Metric: 20 / 20 (100.0%)





Unnamed: 0,text,example_is_ai,notes,reasoning,pred_is_ai,exact_match
0,"Basically, working together is still the thing that makes real pro...",False,Paraphrased summary in simple language; conversational 'basically'...,"The phrase ""Basically, working together is still the thing that ma...",False,✔️ [1]
1,"Yeah, fair point. I’ll stop wording it that way. Appreciate you ca...",False,Casual acknowledgement with self-reflection; colloquial verbs and ...,"The text uses informal, conversational diction such as ""Yeah, fair...",False,✔️ [1]
2,"That’s an excellent point, and it highlights the need for further ...",True,Polite praise formula and 'further research' cliché; generic scaff...,"The phrase ""That’s an excellent point, and it highlights the need ...",True,✔️ [1]
3,"You're absolutely right. From now on, I'll avoid such phrasing. Th...",True,"Overly deferential, templated apology; polished cadence; em dash a...","The text contains several strong signals that lean AI, including t...",True,✔️ [1]
4,"Also, it just shows why ethics still matter when you’re building n...",False,"Uses simple connective ('Also'); concrete, reader-facing phrasing;...","The phrase ""Also, it just shows why ethics still matter when you’r...",False,✔️ [1]
5,"Money and social stuff always mix in weird ways, and that’s what e...",False,Concrete nouns and informal word choice; admits uncertainty; conve...,"The text uses informal, conversational diction such as ""money and ...",False,✔️ [1]
6,"Good point, it’s definitely something people should look into more.",False,"Short, natural affirmation; everyday phrasing; concrete suggestion...","The phrase ""Good point, it’s definitely something people should lo...",False,✔️ [1]
7,"In conclusion, collaboration remains a pivotal factor in driving s...",True,Template opener ('In conclusion'); 'pivotal' corporate cadence; ab...,"The phrase ""In conclusion, collaboration remains a pivotal factor ...",True,✔️ [1]
8,This framework provides a holistic approach to optimizing performa...,True,"Jargon stack ('framework', 'holistic', 'optimizing'); vague abstra...","The phrase ""This framework provides a holistic approach to optimiz...",True,✔️ [1]
9,The nuanced interplay of social and economic factors has far-reach...,True,"Abstract academic phrasing ('nuanced interplay', 'far-reaching imp...","The sentence uses formal, abstract language (""nuanced interplay,"" ...",True,✔️ [1]


EvaluationResult(score=100.0, results=<list of 20 results>)

In [None]:

# Build an LLM-judge metric that uses the optimized judge
feedback_string = """{ai_detected}!!!
Reasoning: {judgement.reasoning}.
---
Similar Example ({ai_or_human}):  
{example.text}
Notes: {example.notes}"""

def make_llm_judge(judge_module):
    """Build a metric that scores humanized text with the given judge.

    `judge_module` is called as `judge_module(text=...)` and must return an
    object exposing `is_ai`. The returned metric scores `not is_ai`; when
    `pred_name` is truthy it returns a `dspy.Prediction` carrying that score
    and feedback rendered from `feedback_string`, otherwise the bare score.
    """
    def _metric(example, response, trace=None, pred_name=None, pred_trace=None):
        # Judge the rewritten text, not the original input
        judgement = judge_module(text=response.human_text)
        score = not judgement.is_ai

        # Plain evaluation path: no feedback requested
        if not pred_name:
            return score

        if judgement.is_ai:
            ai_detected = "AI Detected"
        else:
            ai_detected = "Judged as Human"

        if example.is_ai:
            ai_or_human = "AI generated"
        else:
            ai_or_human = "Human written"

        feedback = feedback_string.format(
            ai_detected=ai_detected,
            judgement=judgement,
            example=example,
            ai_or_human=ai_or_human,
        )
        return dspy.Prediction(score=score, feedback=feedback)

    return _metric

# Metric bound to the optimized judge
optimized_llm_judge = make_llm_judge(optimized_judge)

# GEPA optimizer for the transformer, scored against the optimized judge
transformer_optimizer = dspy.GEPA(
    metric=optimized_llm_judge,
    reflection_lm=smart_lm,
    max_full_evals=3,
    num_threads=4,
    track_stats=True,
    use_merge=False,
)

# Optimize the module-level transformer on the training half; the
# validation half is passed as valset
optimized_transformer = transformer_optimizer.compile(
    transformer,
    trainset=trainset,
    valset=valset,
)



2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 60 metric calls of the program. This amounts to 3.00 full evals on the train+val set.
2025/10/28 18:09:45 INFO dspy.teleprompt.gepa.gepa: Using 10 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/60 [00:00<?, ?rollouts/s]2025/10/28 18:09:52 INFO dspy.evaluate.evaluate: Average Metric: 3 / 10 (30.0%)
2025/10/28 18:09:52 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.3
GEPA Optimization:  17%|█▋        | 10/60 [00:07<00:35,  1.42rollouts/s]2025/10/28 18:09:52 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.3


Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:02<00:00,  1.26it/s]

2025/10/28 18:09:54 INFO dspy.evaluate.evaluate: Average Metric: 0 / 3 (0.0%)





2025/10/28 18:10:38 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for humanize: Task: Rewrite AI-style text into natural, human-sounding prose that reads like something a person would actually say or write, without adding new facts.

Input format:
- You will receive a single field: ai_text.
Output format:
- Return only one field: human_text containing the rewritten text.
- No extra commentary, labels, or formatting.

Core principles:
1) Preserve meaning, facts, and stance
   - Keep names, figures, dates, and claims intact.
   - Do not invent new information. If you add an illustrative bit, make it clearly hypothetical (e.g., “say,” “for example”) and low-stakes.
   - Match the original register (formal vs. casual) but make it sound lived-in and specific.

2) Reduce abstract, generic, or “AI-sounding” phrasing
   - Replace vague Latinate/jargon with plain, concrete words.
   - Introduce concrete actors or contexts when safe (who does what, where).
   - Avoid all-purpose

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:04<00:00,  1.50s/it] 

2025/10/28 18:11:44 INFO dspy.evaluate.evaluate: Average Metric: 1 / 3 (33.3%)





2025/10/28 18:12:32 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for humanize: Task
Rewrite AI-style text into natural, human-sounding prose that reads like something a person would actually say or write—without adding new facts.

Input format
- One field: ai_text

Output format
- Return only one field: human_text containing the rewritten text
- No extra commentary, labels, or formatting beyond human_text: <...>

Goal
- Keep the original meaning, facts, and overall sentiment, but remove AI-ish scaffolding and make it sound like a person wrote it.

Core principles
1) Preserve substance, not fluff
   - Keep names, figures, dates, claims, and the basic stance.
   - Do not invent information. If you add a tiny illustrative bit, label it clearly as hypothetical (e.g., “for example,” “say”) and keep it low-stakes.
   - Match the original register (formal vs. casual) and point of view.

2) Replace vague/jargony phrasing with plain, concrete language
   - Prefer short, direct

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:03<00:00,  1.25s/it]

2025/10/28 18:13:32 INFO dspy.evaluate.evaluate: Average Metric: 0 / 3 (0.0%)





2025/10/28 18:14:24 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for humanize: Task
Rewrite AI-ish text into natural, human-sounding prose that a person might actually say or write—without adding new facts or changing the stance.

Input format
- One field: ai_text

Output format
- Return only one field: human_text with the rewrite.
- No extra commentary, headings, or formatting. Just: human_text: <text>

Core principles
1) Preserve meaning, stance, and strength
   - Keep names, figures, dates, and claims intact.
   - Don’t upgrade or downgrade certainty. If the source says “shows,” don’t change it to “proves.” Keep hedges like “basically,” “just,” “still,” “might,” “often” if present.
   - Match the original register (formal vs. casual), point of view, and sentiment.

2) Keep it plain and concrete
   - Swap vague/jargony phrasing for simple words when safe (e.g., “pivotal factor” → “big reason,” “sustainable progress” → “progress that lasts”).
   - Use verbs instead o

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]

2025/10/28 18:19:23 INFO dspy.evaluate.evaluate: Average Metric: 0 / 3 (0.0%)





2025/10/28 18:20:04 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for humanize: Task: Rewrite AI-ish text into natural, human-sounding prose that a person might actually say or write—without adding new facts or changing the stance.

Input format:
- You will receive one field: ai_text.

Output format:
- Return exactly one field: human_text: <the rewrite>
- No extra commentary, labels, quotes, or formatting.

Core rules:
1) Preserve meaning, facts, and stance
   - Keep names, figures, dates, claims, and qualifiers intact.
   - Don’t intensify or weaken the point (avoid adding “only,” “always,” “never,” etc. unless already there).
   - Do not invent information. If you add a tiny illustrative bit, mark it clearly as hypothetical (“for example,” “say”) and keep it low-stakes.

2) Be plain and concrete
   - Swap vague or padded phrasing for simple, direct words.
   - Prefer verbs over abstract nouns.
   - Replace or trim filler and buzzwords (e.g., “comprehensive,” “variety

In [None]:
# Directory the optimized transformer is written to and read back from
transformer_dir = "./text_transformer/"

# Persist the program state together with its architecture
optimized_transformer.save(transformer_dir, save_program=True)

# Reload the saved transformer from the same directory
loaded_optimized_transformer = dspy.load(transformer_dir)

In [49]:
# Score the optimized transformer against the optimized judge.
# NOTE(review): `dataset` contains the trainset/valset that GEPA optimized
# on, so this score is not a held-out estimate.
evaluate_transformer = dspy.Evaluate(
    metric=optimized_llm_judge,
    devset=dataset,
    display_progress=True,
    display_table=True,
    num_threads=4,
)

evaluate_transformer(optimized_transformer)

Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:00<00:00, 178.15it/s]

2025/10/28 18:20:24 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)





Unnamed: 0,text,is_ai,notes,human_text,_metric
0,"Basically, working together is still the thing that makes real pro...",False,Paraphrased summary in simple language; conversational 'basically'...,"At the end of the day, real progress only happens when people work...",
1,"Yeah, fair point. I’ll stop wording it that way. Appreciate you ca...",False,Casual acknowledgement with self-reflection; colloquial verbs and ...,Good point. I’ll stop saying it like that. Thanks for pointing it ...,
2,"That’s an excellent point, and it highlights the need for further ...",True,Polite praise formula and 'further research' cliché; generic scaff...,You’re right—that really shows why we need to dig deeper into this...,
3,"You're absolutely right. From now on, I'll avoid such phrasing. Th...",True,"Overly deferential, templated apology; polished cadence; em dash a...",You're right. I'll steer clear of that kind of wording from now on...,
4,"Also, it just shows why ethics still matter when you’re building n...",False,"Uses simple connective ('Also'); concrete, reader-facing phrasing;...",It also proves why ethics still matter when creating new technology.,
5,"Money and social stuff always mix in weird ways, and that’s what e...",False,Concrete nouns and informal word choice; admits uncertainty; conve...,"Money and social issues often get tangled together, and that’s usu...",✔️ [True]
6,"Good point, it’s definitely something people should look into more.",False,"Short, natural affirmation; everyday phrasing; concrete suggestion...",That’s a good point—definitely worth digging into more.,
7,"In conclusion, collaboration remains a pivotal factor in driving s...",True,Template opener ('In conclusion'); 'pivotal' corporate cadence; ab...,"To sum up, working together is key to making lasting progress.",
8,This framework provides a holistic approach to optimizing performa...,True,"Jargon stack ('framework', 'holistic', 'optimizing'); vague abstra...","This approach helps improve performance in different settings, whe...",
9,The nuanced interplay of social and economic factors has far-reach...,True,"Abstract academic phrasing ('nuanced interplay', 'far-reaching imp...",How social and economic factors affect each other plays a big role...,✔️ [True]


EvaluationResult(score=40.0, results=<list of 20 results>)

## 8. Test and Iterate

In [None]:
# Best-of-N humanizer: sample a humanizer module several times and keep the
# attempt selected by a reward function (by default, the optimized judge metric
# from Section 7).
class MultiHumanizer(dspy.Module):
    """Select the best of `N` humanizations via `dspy.BestOfN`.

    Args:
        N: Number of attempts passed to `dspy.BestOfN`.
        threshold: Reward threshold passed to `dspy.BestOfN`.
        module: Module to sample from. When omitted, the notebook-level
            `optimized_transformer` (Section 7) is used, matching the
            previous hard-coded behavior.
        reward_fn: Callable used to score each attempt. When omitted, the
            notebook-level `optimized_llm_judge` (Section 7) is used,
            matching the previous hard-coded behavior.
    """

    def __init__(self, N: int = 5, threshold: float = 1.0, module=None, reward_fn=None):
        super().__init__()
        # Fall back to the Section 7 globals only when the caller did not
        # supply replacements, so MultiHumanizer() behaves as before.
        if module is None:
            module = optimized_transformer
        if reward_fn is None:
            reward_fn = optimized_llm_judge

        self.selector = dspy.BestOfN(
            module=module,
            N=N,
            reward_fn=reward_fn,
            threshold=threshold,
        )

    def forward(self, text: str):
        """Run the best-of-N selector on `text` and return its result."""
        return self.selector(text=text)

# Build the Best-of-5 humanizer (defaults spelled out) and evaluate it across the dataset
bf_humanizer = MultiHumanizer(N=5, threshold=1.0)

evaluate_transformer(bf_humanizer)


Average Metric: 10.00 / 20 (50.0%): 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]

2025/10/28 21:19:26 INFO dspy.evaluate.evaluate: Average Metric: 10 / 20 (50.0%)





Unnamed: 0,text,is_ai,notes,human_text,_metric
0,"Basically, working together is still the thing that makes real pro...",False,Paraphrased summary in simple language; conversational 'basically'...,"At the end of the day, it’s teamwork that really drives progress.",✔️ [True]
1,"Yeah, fair point. I’ll stop wording it that way. Appreciate you ca...",False,Casual acknowledgement with self-reflection; colloquial verbs and ...,"You're right, I'll stop saying it like that. Thanks for pointing i...",
2,"That’s an excellent point, and it highlights the need for further ...",True,Polite praise formula and 'further research' cliché; generic scaff...,"You’re right, and it really shows we need to dig deeper into this ...",
3,"You're absolutely right. From now on, I'll avoid such phrasing. Th...",True,"Overly deferential, templated apology; polished cadence; em dash a...",You're right. I’ll steer clear of that kind of wording from now on...,
4,"Also, it just shows why ethics still matter when you’re building n...",False,"Uses simple connective ('Also'); concrete, reader-facing phrasing;...",It also highlights why ethics are still important when creating ne...,
5,"Money and social stuff always mix in weird ways, and that’s what e...",False,Concrete nouns and informal word choice; admits uncertainty; conve...,"Money and social issues often get tangled together, and that’s usu...",✔️ [True]
6,"Good point, it’s definitely something people should look into more.",False,"Short, natural affirmation; everyday phrasing; concrete suggestion...","You're right, people really ought to dig deeper into this.",✔️ [True]
7,"In conclusion, collaboration remains a pivotal factor in driving s...",True,Template opener ('In conclusion'); 'pivotal' corporate cadence; ab...,"In short, working together is key to making lasting progress.",
8,This framework provides a holistic approach to optimizing performa...,True,"Jargon stack ('framework', 'holistic', 'optimizing'); vague abstra...","This method helps improve performance in different settings, wheth...",
9,The nuanced interplay of social and economic factors has far-reach...,True,"Abstract academic phrasing ('nuanced interplay', 'far-reaching imp...",How social and economic factors push on each other shapes how poli...,✔️ [True]


EvaluationResult(score=50.0, results=<list of 20 results>)

In [None]:
# Try a model from a different provider
different_lm = dspy.LM(
    "gemini/gemini-2.5-flash",
    api_key=os.getenv("GEMINI_API_KEY"),
    temperature=1,
    max_tokens=32000,
)

# Work on a copy rather than calling set_lm() on optimized_transformer directly:
# set_lm() rebinds the LM in place, and optimized_transformer is also the module
# wrapped by bf_humanizer's BestOfN selector. Mutating it would silently change
# which model the earlier evaluation cells use when they are re-run.
gemini_transformer = optimized_transformer.deepcopy()
gemini_transformer.set_lm(different_lm)

# Run an evaluation with the new model
evaluate_transformer(gemini_transformer)