# Run Regression Detection


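Detects regressions between a candidate and a baseline model version, prints the resulting verdict with per-metric comparisons, and publishes the verdict for downstream job tasks.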

In [None]:
import sys
import os

# Make the repository's src/ directory importable. The notebook is expected
# to run from the notebooks/ directory, so the repository root is taken to be
# the parent of the current working directory.
notebook_dir = os.getcwd()
repo_root = os.path.split(notebook_dir)[0]
src_path = os.path.join(repo_root, 'src')
# Prepend only when absent so re-running the cell does not add duplicates.
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)


In [None]:
import logging
import json
from verdict.regression.regression_detector import RegressionDetector, VerdictLabel

# Logger used for this notebook's own messages, named after the running module.
logger = logging.getLogger(__name__)
# Configure root logging at INFO. basicConfig does nothing if the root logger
# already has handlers installed.
logging.basicConfig(level=logging.INFO)

In [None]:
# Widget parameters
# Each entry is (widget name, default value, label); the widgets are created
# in the order listed.
_WIDGET_SPECS = (
    ("candidate_version", "", "Candidate Version"),
    ("baseline_version", "", "Baseline Version"),
    ("dataset_version", "v1", "Dataset Version"),
    ("eval_run_id", "", "Evaluation Run ID"),
    ("threshold", "5.0", "Regression Threshold %"),
    ("p_value", "0.05", "P-value Threshold"),
    ("catalog_name", "verdict", "Catalog Name"),
)
for _widget_spec in _WIDGET_SPECS:
    dbutils.widgets.text(*_widget_spec)

# Read the current widget values. The two thresholds are converted with
# float(), which raises ValueError if a widget holds a blank or non-numeric
# string. A blank evaluation run ID is normalised to None.
candidate_version = dbutils.widgets.get("candidate_version")
baseline_version = dbutils.widgets.get("baseline_version")
dataset_version = dbutils.widgets.get("dataset_version")
eval_run_id = dbutils.widgets.get("eval_run_id") or None
threshold = float(dbutils.widgets.get("threshold"))
p_value_threshold = float(dbutils.widgets.get("p_value"))
catalog_name = dbutils.widgets.get("catalog_name")

In [None]:
# Record which candidate and baseline versions (read from the widgets) this run compares.
logger.info(f"Comparing candidate {candidate_version} vs baseline {baseline_version}")

In [None]:
# Initialize detector
# The catalog name and both thresholds come from the notebook widgets; the
# experiment path is fixed here rather than exposed as a widget.
detector = RegressionDetector(
    catalog_name=catalog_name,
    threshold_pct=threshold,
    p_value_threshold=p_value_threshold,
    experiment_path="/verdict/experiments"
)

In [None]:
# Run regression detection
# run_id is None when the "eval_run_id" widget was left blank.
# The returned report is later indexed with "verdict" and "comparisons" and
# serialized with json.dumps.
report = detector.detect_regression(
    candidate_version=candidate_version,
    baseline_version=baseline_version,
    run_id=eval_run_id,
    dataset_version=dataset_version
)

In [None]:
# Display verdict
def _format_metric(value, spec, suffix=""):
    """Return ``value`` formatted with ``spec`` plus ``suffix``, or "N/A".

    Comparison entries are read with ``.get`` and may therefore yield a
    placeholder (``None`` or the string "N/A") instead of a number. Applying
    a float format spec to such a value raises, so it is rendered as "N/A"
    rather than aborting the whole summary.
    """
    try:
        return format(value, spec) + suffix
    except (TypeError, ValueError):
        return "N/A"


verdict = report["verdict"]
print("\n" + "=" * 60)
print(f"VERDICT: {verdict}")
print("=" * 60)

# One summary paragraph per metric comparison in the report.
for comp in report["comparisons"]:
    metric_name = comp.get("metric_name", "unknown")
    is_regression = comp.get("is_regression", False)
    candidate_mean = comp.get("candidate_mean", "N/A")
    baseline_mean = comp.get("baseline_mean", "N/A")
    pct_change = comp.get("pct_change", 0)
    p_val = comp.get("p_value", 1)

    status = "⚠️  REGRESSION" if is_regression else "✓  OK"
    print(f"\n{metric_name}: {status}")
    # An explicit None mean is skipped, as before. A missing key falls back
    # to "N/A", which is printed as-is instead of crashing on the ".4f" spec.
    if candidate_mean is not None:
        print(f"  Candidate: {_format_metric(candidate_mean, '.4f')}")
    if baseline_mean is not None:
        print(f"  Baseline:  {_format_metric(baseline_mean, '.4f')}")
    # A None change or p-value is shown as "N/A" rather than raising TypeError.
    print(f"  Change:    {_format_metric(pct_change, '+.2f', '%')}")
    print(f"  P-value:   {_format_metric(p_val, '.4f')}")

print("\n" + "=" * 60 + "\n")

In [None]:
# Return values for downstream tasks
# The verdict is published as-is; the full report is published as a JSON string.
# NOTE(review): json.dumps raises TypeError if the report holds values the json
# module cannot encode — confirm the report contains only JSON-serializable data.
# NOTE(review): presumably the verdict itself must also be JSON-serializable for
# taskValues.set — verify the type returned in report["verdict"].
dbutils.jobs.taskValues.set("verdict", verdict)
dbutils.jobs.taskValues.set("verdict_report", json.dumps(report))

In [None]:
# Get verdict history
# Requests the verdict history for the candidate version with limit=10 and
# renders the result below a short heading.
# NOTE(review): .display() is presumably the Databricks DataFrame rendering
# method — confirm the type returned by get_verdict_history supports it.
history_df = detector.get_verdict_history(model_version=candidate_version, limit=10)
print("Recent verdict history:")
history_df.display()