# Run Evaluation

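Evaluates model responses in three stages: deterministic metrics, MLflow LLM evaluation, and LLM-as-a-judge scoring. The evaluation run ID is published as a task value for downstream tasks.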

In [None]:
%pip install /Workspace/Repos/verdict/dist/verdict-*.whl

In [None]:
import loggingimport uuidfrom verdict.evaluation.mlflow_evaluator import MLflowEvaluatorfrom verdict.evaluation.custom_judges import LLMJudgeEvaluatorfrom verdict.evaluation.deterministic_metrics import DeterministicMetricsCalculatorlogging.basicConfig(level=logging.INFO)logger = logging.getLogger(__name__)

In [None]:
# Notebook parameters, exposed as Databricks text widgets.
# Each entry is (widget name, default value, label shown in the UI).
for _widget_name, _widget_default, _widget_label in (
    ("candidate_version", "", "Candidate Version"),
    ("run_id", "", "Run ID (from inference)"),
    ("judge_endpoint", "databricks-llama-4-maverick", "Judge Model Endpoint"),
    ("catalog_name", "verdict", "Catalog Name"),
):
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

candidate_version = dbutils.widgets.get("candidate_version")

# An empty run_id widget is normalised to None so later cells can test it for truthiness.
run_id = dbutils.widgets.get("run_id")
if not run_id:
    run_id = None

judge_endpoint = dbutils.widgets.get("judge_endpoint")
catalog_name = dbutils.widgets.get("catalog_name")

In [None]:
# Record the run parameters before any work starts.
for _startup_message in (
    f"Starting evaluation for model version: {candidate_version}",
    f"Judge endpoint: {judge_endpoint}",
):
    logger.info(_startup_message)

In [None]:
# Load responses from inference
responses_table = f"{catalog_name}.raw.model_responses"
responses_df = spark.table(responses_table)

if run_id:
    # Compare through a Column expression instead of interpolating the widget
    # value into a SQL string: a quote character in run_id can no longer break
    # or alter the predicate.
    responses_df = responses_df.filter(responses_df["run_id"] == run_id)
# NOTE(review): when run_id is empty every row of the responses table is
# evaluated, and candidate_version is only logged/printed, never used as a
# filter -- confirm that is intended.

# Join with prompts for ground truth. The left join keeps responses whose
# prompt_id has no match in the prompt table (their prompt/ground_truth are null).
prompts_df = spark.table(f"{catalog_name}.raw.prompt_datasets")
responses_df = responses_df.join(
    prompts_df.select("prompt_id", "prompt", "ground_truth"),
    on="prompt_id",
    how="left",
)

response_count = responses_df.count()
logger.info(f"Loaded {response_count} responses for evaluation")
if response_count == 0:
    # Surface an empty selection explicitly; the later cells still run.
    logger.warning(
        "No responses matched (table=%s, run_id=%r); evaluation will have no input",
        responses_table,
        run_id,
    )

In [None]:
# Deterministic metrics stage: compute per-response metrics, then show the
# latency summary derived from them.
logger.info("Computing deterministic metrics...")

det_calculator = DeterministicMetricsCalculator(catalog_name=catalog_name)
metrics_df = det_calculator.calculate_metrics(responses_df)

# Latency summary is derived from the metrics frame and rendered in the cell output.
latency_stats = det_calculator.compute_latency_stats(metrics_df)
logger.info("Latency statistics:")
latency_stats.display()

In [None]:
# MLflow evaluation stage.
logger.info("Running MLflow LLM Evaluate...")

mlflow_evaluator = MLflowEvaluator(
    experiment_path="/verdict/experiments",
    catalog_name=catalog_name,
)

# Fresh identifier for this evaluation run; the judge stage, the summary query
# and the task value published at the end all reuse it.
eval_run_id = str(uuid.uuid4())

mlflow_results = mlflow_evaluator.evaluate_responses(
    metrics=["faithfulness", "answer_relevance", "toxicity"],
    run_id=eval_run_id,
    responses_df=responses_df,
)

In [None]:
# LLM-as-a-judge stage, scored by the endpoint chosen in the widget.
logger.info(f"Running LLM-as-a-judge evaluation with {judge_endpoint}...")

judge_evaluator = LLMJudgeEvaluator(
    judge_endpoint=judge_endpoint,
    catalog_name=catalog_name,
    max_workers=10,  # presumably caps concurrent judge calls -- confirm
)

# Same run id as the MLflow stage, so both result sets share one identifier.
judge_results = judge_evaluator.evaluate(
    run_id=eval_run_id,
    responses_df=responses_df,
)

In [None]:
# Summary header: identifiers for this evaluation run.
print(
    f"\nEvaluation Run ID: {eval_run_id}",
    f"Model Version: {candidate_version}",
    f"Judge Endpoint: {judge_endpoint}",
    sep="\n",
)

# Per-metric aggregate (count / avg / min / max) restricted to this run.
# NOTE(review): presumably the evaluators above write into eval_results -- confirm.
eval_table = f"{catalog_name}.evaluated.eval_results"
_summary_query = f"""
    SELECT metric_name,
           COUNT(*) as count,
           AVG(metric_value) as avg_value,
           MIN(metric_value) as min_value,
           MAX(metric_value) as max_value
    FROM {eval_table}
    WHERE run_id = '{eval_run_id}'
    GROUP BY metric_name
"""
spark.sql(_summary_query).display()

In [None]:
# Publish the evaluation run id as a job task value so downstream tasks can read it.
dbutils.jobs.taskValues.set(key="eval_run_id", value=eval_run_id)