# Prompt Optimization

In this notebook, we will use `mlflow.genai.optimize_prompts` to optimize our lease extraction prompt. After optimization, we will evaluate the performance using the held-out evaluation set.

In [None]:
import mlflow
from mlflow.genai.scorers import Correctness
from mlflow.genai.optimize import GepaPromptOptimizer
import pandas as pd
from utils import setup_mlflow, load_config, create_predict_fn

# Project helpers imported from utils: setup_mlflow() is called only for its side
# effects (presumably MLflow tracking/experiment setup — confirm in utils), and
# load_config() returns the settings mapping that every later cell indexes by key.
setup_mlflow()
config = load_config()

# Echo the user and experiment name held in the loaded configuration.
print(f"User: {config['user']}")
print(f"Experiment: {config['mlflow_experiment_name']}")

## Load data

In [None]:
# Training split: read the Delta table named by the configured catalog and schema
# through the notebook's `spark` session, and report its row count.
train_table_name = f"{config['catalog']}.{config['schema']}.prompt_opt_training"
train_sdf = spark.read.table(train_table_name)
print(f"Total training records: {train_sdf.count()}")

# Evaluation split: fetch the MLflow dataset and keep only the two columns that
# are forwarded to mlflow.genai.evaluate later, as a list of per-row dicts.
eval_dataset_name = f"{config['catalog']}.{config['schema']}.prompt_opt_eval"
eval_dataset = mlflow.genai.datasets.get_dataset(eval_dataset_name)
eval_columns = ["inputs", "expectations"]
eval_records = eval_dataset.to_df().loc[:, eval_columns].to_dict(orient="records")
print(f"Total eval records: {len(eval_records)}")

## Optimizing prompt with GEPA

In [None]:
# Materialise the Spark training table as a pandas DataFrame.
train_pdf = train_sdf.toPandas()

# Reshape every row into the {"inputs": ..., "expectations": ...} record layout
# that is handed to optimize_prompts as train_data. A column that is absent from
# the row falls back to an empty string.
train_records = [
    {
        "inputs": {"query": row.get("request", "")},
        "expectations": {"expected_response": row.get("labels", "")},
    }
    for _, row in train_pdf.iterrows()
]

print(f"Records for optimization: {len(train_records)}")

In [None]:
# Starting point for optimization: version 1 of lease_extraction_prompt under the
# configured catalog and schema.
base_prompt_uri = (
    f"prompts:/{config['catalog']}.{config['schema']}.lease_extraction_prompt/1"
)

# Prediction callable built from that prompt by the utils helper; it is supplied
# to optimize_prompts as predict_fn.
predict_fn_base = create_predict_fn(base_prompt_uri)


# Aggregation function to convert Correctness feedback to numerical score
def aggregation_fn(scores: dict) -> float:
    """Collapse the Correctness scorer's result into a binary numeric objective.

    Args:
        scores: Mapping of scorer name to that scorer's result. Only the
            "correctness" entry is read. The entry may be a Feedback-like
            object exposing ``.value`` or the bare value itself.

    Returns:
        1.0 when the correctness value denotes a pass ("yes" or "pass" in any
        letter case, ``True``, or numeric 1); 0.0 otherwise, including when
        the entry is missing or its value is ``None``.
    """
    correctness_feedback = scores.get("correctness")
    if correctness_feedback is None:
        return 0.0

    # Unwrap Feedback-like objects, but fall back to the object itself so that a
    # scorer reporting a plain str/bool does not raise AttributeError.
    value = getattr(correctness_feedback, "value", correctness_feedback)

    if isinstance(value, str):
        # Judge output casing is not guaranteed ("yes", "Yes", "PASS", "pass"),
        # so compare case-insensitively rather than against two exact spellings.
        return 1.0 if value.strip().lower() in ("yes", "pass") else 0.0

    # Non-string values: True (and numeric 1, which equals True) count as a pass.
    return 1.0 if value == 1 else 0.0


# Run GEPA prompt optimization inside a dedicated MLflow run named "Optimization".
with mlflow.start_run(run_name="Optimization"):
    # The reflection model is the Databricks endpoint named in the config; the
    # metric-call budget is three times the number of training records.
    gepa_optimizer = GepaPromptOptimizer(
        reflection_model=f"databricks:/{config['optimizer_endpoint']}",
        max_metric_calls=len(train_records) * 3,
    )

    # Optimize the single base prompt, scoring candidates with Correctness and
    # reducing each score dict to a float through aggregation_fn.
    optimization_first_half = mlflow.genai.optimize_prompts(
        predict_fn=predict_fn_base,
        train_data=train_records,
        prompt_uris=[base_prompt_uri],
        optimizer=gepa_optimizer,
        scorers=[Correctness()],
        aggregation=aggregation_fn,
    )

## Evaluate optimized prompt

In [None]:
# Take the prompt version produced by the optimization run instead of a
# hard-coded version number: the number assigned depends on how many versions
# already exist in the registry, so a fixed "/3" goes stale (or does not exist)
# after a re-run or in a fresh workspace. Exactly one prompt URI was passed to
# optimize_prompts, so its optimized counterpart is the first entry.
optimized_prompt_uri = optimization_first_half.optimized_prompts[0].uri
print(f"Optimized prompt: {optimized_prompt_uri}")

# Prediction callable backed by the optimized prompt, built by the same utils
# helper used for the base prompt.
predict_fn_new = create_predict_fn(optimized_prompt_uri)

In [None]:
# Score the optimized prompt's predictions on the eval records with the same
# Correctness scorer used during optimization, inside an MLflow run named
# "Eval Round".
with mlflow.start_run(run_name="Eval Round"):
    eval_results = mlflow.genai.evaluate(
        data=eval_records,
        predict_fn=predict_fn_new,
        scorers=[Correctness()],
    )

# Report the aggregate stored under the "correctness/mean" metric key.
print(f"New Prompt's Correctness: {eval_results.metrics['correctness/mean']}")

With 70 training samples, the GEPA-optimized prompt achieved an increase in correctness of nearly 10% over the base prompt!