# Establish baseline evaluation

In [1]:
import pandas as pd
import mlflow
from utils import setup_mlflow, load_config, create_predict_fn, create_fuzzy_scorers

# Configure MLflow through the project helper, then load the notebook settings.
setup_mlflow()
config = load_config()

# Echo the active user and experiment so the run context is visible in the output.
active_user = config["user"]
experiment_name = config["mlflow_experiment_name"]
print(f"User: {active_user}")
print(f"Experiment: {experiment_name}")

Set to existing MLflow experiment: /Users/david.huang@databricks.com/experiments/gepa-demo
User: david.huang@databricks.com
Experiment: /Users/david.huang@databricks.com/experiments/gepa-demo


## Split dataset into training and eval sets

In [None]:
# Load the lease documents from the Delta table named in the config
# (catalog.schema.table).
source_table = f"{config['catalog']}.{config['schema']}.{config['table']}"
lease_docs_short_df = spark.read.table(source_table)

In [None]:
# Take 70 documents as the training split and persist them to Unity Catalog,
# replacing any previous contents of the training table.
# NOTE(review): limit() is applied without an explicit ordering, so which 70
# rows are selected is up to Spark — confirm this is acceptable for the split.
training_table = f"{config['catalog']}.{config['schema']}.prompt_opt_training"
train_sdf = lease_docs_short_df.limit(70)
train_sdf.write.mode("overwrite").saveAsTable(training_table)

In [None]:
# Remaining documents become the eval set.
#
# Subtract the *persisted* training table rather than the lazy `train_sdf`.
# `train_sdf` is an unordered `limit(70)` that Spark re-evaluates on every
# action, so it may select a different 70 rows here than the ones that were
# written to `prompt_opt_training`, leaking training documents into the eval
# set. Reading the saved table back pins the split to what was actually stored.
train_saved_sdf = spark.read.table(
    f"{config['catalog']}.{config['schema']}.prompt_opt_training"
)

# subtract() is a set difference: exact duplicate rows in the source collapse
# to a single row in the result.
eval_sdf = lease_docs_short_df.subtract(train_saved_sdf)

# Bring the eval split to the driver as a pandas DataFrame for record building.
eval_pdf = eval_sdf.toPandas()

In [None]:
# Reshape every eval row into an {"inputs", "expectations"} record.
# Missing "request" / "labels" columns fall back to an empty string.
eval_records = [
    {
        "inputs": {"query": row.get("request", "")},
        "expectations": {"expected_response": row.get("labels", "")},
    }
    for _, row in eval_pdf.iterrows()
]

## Create an MLflow evaluation set

In [2]:
# Create the MLflow eval dataset, or fall back to the existing one.
# The name is built once so the create and get paths can never drift apart.
eval_dataset_name = f"{config['catalog']}.{config['schema']}.prompt_opt_eval"

try:
    eval_dataset = mlflow.genai.datasets.create_dataset(name=eval_dataset_name)
    eval_dataset.merge_records(eval_records)
except Exception as e:
    # Only the "already exists" failure is recoverable; anything else is
    # re-raised with a bare `raise` so the original traceback is kept intact.
    if "TABLE_ALREADY_EXISTS" not in str(e):
        raise
    # The existing dataset is reused as-is: `eval_records` is NOT merged into
    # it on this path, so its contents are whatever was stored previously.
    eval_dataset = mlflow.genai.datasets.get_dataset(name=eval_dataset_name)

In [3]:
# Flatten the MLflow dataset into a list of plain dicts, keeping only the
# "inputs" and "expectations" columns.
eval_dataset_df = eval_dataset.to_df()
eval_records = eval_dataset_df[["inputs", "expectations"]].to_dict(orient="records")

## Run baseline evaluation with fuzzy match scorers

In [4]:
# Build the prediction function around the base prompt (URI suffix `/1`).
base_prompt_uri = (
    f"prompts:/{config['catalog']}.{config['schema']}.lease_extraction_prompt/1"
)
lease_extraction_predict_fn = create_predict_fn(prompt_uri=base_prompt_uri)

In [5]:
# Score the eval records with the fuzzy match scorers inside a named MLflow run.
fuzzy_scorers = create_fuzzy_scorers(threshold=0.7)

evaluation_kwargs = {
    "data": eval_records,
    "predict_fn": lease_extraction_predict_fn,
    "scorers": fuzzy_scorers,
}

with mlflow.start_run(run_name="Baseline eval run"):
    results = mlflow.genai.evaluate(**evaluation_kwargs)

2025/11/25 10:28:56 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/11/25 10:28:56 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset. To disable this check, set the MLFLOW_GENAI_EVAL_SKIP_TRACE_VALIDATION environment variable to True.
  from .autonotebook import tqdm as notebook_tqdm
Evaluating: 100%|██████████| 30/30 [Elapsed: 01:58, Remaining: 00:00] 


In [6]:
# Gather the per-field mean fuzzy match scores (metric names containing "/mean").
mean_by_metric = {
    name: score for name, score in results.metrics.items() if "/mean" in name
}

# Report each field's mean as a percentage.
for name, score in mean_by_metric.items():
    print(f"{name}: {score:.2%}")

mean_scores = list(mean_by_metric.values())

# Report the unweighted average across fields; skipped when no mean metrics exist.
if mean_scores:
    overall_average = sum(mean_scores) / len(mean_scores)
    print(f"\nOverall average: {overall_average:.2%}")

signing_date/mean: 53.11%
designated_use/mean: 76.44%
extension_period/mean: 33.05%
lessee/mean: 91.75%
end_date/mean: 56.41%
start_date/mean: 60.50%
leased_space/mean: 62.83%
lessor/mean: 93.60%
term_of_payment/mean: 30.15%
expiration_date_of_lease/mean: 56.69%

Overall average: 61.45%


The base prompt on GPT-OSS 20B achieved an average accuracy of 61.5%, measured with fuzzy match scorers (each scoring 0.0–1.0) for each extraction field. Next, we will use GEPA to optimize the prompt.