# Establish baseline evaluation

In [None]:
import pandas as pd
import mlflow
from utils import setup_mlflow, load_config, create_predict_fn, create_fuzzy_scorers

# Configure MLflow and load the shared notebook configuration
setup_mlflow()
config = load_config()

# Echo who is running the notebook and which experiment the runs are logged to
print(
    f"User: {config['user']}",
    f"Experiment: {config['mlflow_experiment_name']}",
    sep="\n",
)

## Split dataset into training and eval sets

In [None]:
# Load the lease documents from the Delta table named in the config
lease_docs_table = f"{config['catalog']}.{config['schema']}.{config['table']}"
lease_docs_short_df = spark.read.table(lease_docs_table)

In [None]:
# Take up to 70 documents as the training set and persist them to Unity Catalog,
# replacing any earlier copy of the training table.
# NOTE(review): no ordering is applied before limit(), so which 70 rows are
# selected is left to Spark.
training_table = f"{config['catalog']}.{config['schema']}.prompt_opt_training"
train_sdf = lease_docs_short_df.limit(70)
train_writer = train_sdf.write.mode("overwrite")
train_writer.saveAsTable(training_table)

In [None]:
# Remaining documents become the eval set.
#
# `train_sdf` is a lazy `limit(70)` with no ordering, so Spark may pick a
# different set of rows each time the plan is re-evaluated. Subtracting it
# directly could therefore leave rows in the eval set that were actually written
# to the training table. Subtract the persisted training table instead: it is a
# fixed snapshot of exactly the rows that were saved.
persisted_train_sdf = spark.read.table(
    f"{config['catalog']}.{config['schema']}.prompt_opt_training"
)

# Note: subtract() is a set difference (EXCEPT DISTINCT), so duplicate rows in
# the source table collapse to a single eval row.
eval_sdf = lease_docs_short_df.subtract(persisted_train_sdf)

# Collect the eval rows to the driver as a pandas DataFrame
eval_pdf = eval_sdf.toPandas()

In [None]:
# Build one evaluation record per row of the eval DataFrame: the "request"
# value becomes the query input and the "labels" value becomes the expected
# response. A column that is absent falls back to an empty string.
eval_records = [
    {
        "inputs": {"query": doc.get("request", "")},
        "expectations": {"expected_response": doc.get("labels", "")},
    }
    for _, doc in eval_pdf.iterrows()
]

## Create an MLflow evaluation set

In [None]:
# Create the MLflow eval dataset and merge the eval records into it.
# If that fails with a TABLE_ALREADY_EXISTS error, load the existing dataset
# instead; in that case the records built above are NOT merged into it.
eval_dataset_name = f"{config['catalog']}.{config['schema']}.prompt_opt_eval"

try:
    eval_dataset = mlflow.genai.datasets.create_dataset(name=eval_dataset_name)
    eval_dataset.merge_records(eval_records)
except Exception as create_error:
    # Any other failure is unexpected: let it propagate unchanged
    if "TABLE_ALREADY_EXISTS" not in str(create_error):
        raise
    eval_dataset = mlflow.genai.datasets.get_dataset(name=eval_dataset_name)

In [None]:
# Read the records back from the eval dataset, keeping only the "inputs" and
# "expectations" columns, as a list of per-row dicts
eval_dataset_df = eval_dataset.to_df()
eval_columns_df = eval_dataset_df[["inputs", "expectations"]]
eval_records = eval_columns_df.to_dict(orient="records")

## Run baseline evaluation with fuzzy match scorers

In [None]:
# Build the prediction function around the base lease extraction prompt
# (the prompt URI below ends in "/1")
base_prompt_uri = (
    f"prompts:/{config['catalog']}.{config['schema']}.lease_extraction_prompt/1"
)
lease_extraction_predict_fn = create_predict_fn(prompt_uri=base_prompt_uri)

In [None]:
# Score the base prompt's predictions on the eval records with the fuzzy match
# scorers, logging everything under a dedicated MLflow run
fuzzy_match_threshold = 0.7
fuzzy_scorers = create_fuzzy_scorers(threshold=fuzzy_match_threshold)

baseline_run = mlflow.start_run(run_name="Baseline eval run")
with baseline_run:
    results = mlflow.genai.evaluate(
        data=eval_records,
        predict_fn=lease_extraction_predict_fn,
        scorers=fuzzy_scorers,
    )

In [None]:
# Pick out the metrics whose name contains "/mean" and report each as a percentage
mean_metrics = {
    metric_name: metric_value
    for metric_name, metric_value in results.metrics.items()
    if "/mean" in metric_name
}
for metric_name, metric_value in mean_metrics.items():
    print(f"{metric_name}: {metric_value:.2%}")

mean_scores = list(mean_metrics.values())

# Report the unweighted average of the mean scores; skipped when none were found
if len(mean_scores) > 0:
    overall_average = sum(mean_scores) / len(mean_scores)
    print(f"\nOverall average: {overall_average:.2%}")

The base prompt on GPT-OSS 20B achieved an average accuracy of 61.5%, measured with fuzzy match scorers (each scoring 0.0–1.0) on every extraction field. Next, we will use GEPA to optimize the prompt.