Run this part after training if a data scientist would like to manually review which models are the best.

### Parse MLflow runs to compute scores and get the best models
- extract fold-level metadata (architecture and hyperparameter combination) and filter out runs whose training and validation losses differ too much (i.e. don't keep overfitting ones)
- compute averages across folds
- compute scores
- select the best-scoring configuration for each architecture

In [None]:
from pathlib import Path
import mlflow
import pandas as pd

In [None]:
# Point MLflow at the local file store "experiments/mlruns", located next to
# (i.e. in the parent of) the current working directory.
mlflow_path_dir = Path.cwd().parent / "experiments/mlruns"
mlflow.set_tracking_uri(mlflow_path_dir.as_uri())

In [None]:
def get_all_runs(experiment_name: str) -> pd.DataFrame:
    """Return all finished runs of an MLflow experiment as a DataFrame.

    Args:
        experiment_name: Name of the experiment to look up in the
            currently configured MLflow tracking store.

    Returns:
        The result of ``mlflow.search_runs`` in pandas format, restricted
        to runs whose status is ``FINISHED``.

    Raises:
        ValueError: If no experiment with the given name exists.
    """
    found = mlflow.get_experiment_by_name(experiment_name)
    if found is None:
        raise ValueError(f"Experiment '{experiment_name}' not found.")

    # Only completed runs are requested; anything with another status is
    # excluded by the filter string.
    search_options = {
        "experiment_ids": [found.experiment_id],
        "filter_string": "attributes.status = 'FINISHED'",
        "output_format": "pandas",
    }
    return mlflow.search_runs(**search_options)


In [None]:
# Load every finished run of the chosen experiment and display the table.
runs = get_all_runs("food-101_30%_tr70_va15_te15_2025-08-05_17-22-11")  # change to the experiment you would like to analyze
runs

In [None]:
def filter_by_loss_discrepancy(runs: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame:
    """Keep only runs whose train and validation losses are close together.

    The absolute difference between ``metrics.train_loss`` and
    ``metrics.val_loss`` is stored in a new ``loss_diff`` column of the
    returned frame. Rows whose difference is NaN (a missing loss) are
    dropped, because NaN never satisfies the ``<=`` comparison.

    Args:
        runs: Run table containing ``metrics.train_loss`` and
            ``metrics.val_loss`` columns. It is not modified.
        threshold: Largest allowed absolute loss difference (inclusive).

    Returns:
        A new DataFrame with the surviving rows and the extra
        ``loss_diff`` column.
    """
    loss_gap = (runs["metrics.train_loss"] - runs["metrics.val_loss"]).abs()
    # ``assign`` builds a new frame, so the caller's ``runs`` does not
    # silently gain a ``loss_diff`` column as a side effect.
    with_gap = runs.assign(loss_diff=loss_gap)
    return with_gap[loss_gap <= threshold].copy()


In [None]:
# Drop runs whose absolute train/validation loss difference exceeds the threshold.
filterd_runs = filter_by_loss_discrepancy(runs, threshold=0.25)
filterd_runs

In [None]:
def group_by_arch_and_config(runs: pd.DataFrame, val_metric: str = "val_acc") -> pd.DataFrame:
    """Average validation results over all runs sharing architecture and config.

    Args:
        runs: Run table with ``tags.architecture``, ``tags.config``,
            ``metrics.val_loss`` and ``metrics.<val_metric>`` columns.
        val_metric: Name of the validation metric (without the
            ``metrics.`` prefix) to average alongside the validation loss.

    Returns:
        One row per (architecture, config) pair with the columns
        ``architecture``, ``config``, ``folds`` (number of runs in the
        group), ``avg_val_loss`` and ``avg_<val_metric>``. The columns are
        present even when ``runs`` contains no groups.
    """
    metric_column = f"metrics.{val_metric}"
    avg_metric_name = f"avg_{val_metric}"

    rows = [
        {
            "architecture": arch,
            "config": config,
            "folds": len(fold_runs),
            "avg_val_loss": fold_runs["metrics.val_loss"].mean(),
            avg_metric_name: fold_runs[metric_column].mean(),
        }
        for (arch, config), fold_runs in runs.groupby(["tags.architecture", "tags.config"])
    ]

    # Name the columns explicitly so an empty result (e.g. every run was
    # filtered out earlier) still has the schema downstream code indexes.
    # dict.fromkeys de-duplicates in order, in case val_metric == "val_loss".
    columns = list(dict.fromkeys(
        ["architecture", "config", "folds", "avg_val_loss", avg_metric_name]
    ))
    return pd.DataFrame(rows, columns=columns)


In [None]:
# Average the validation loss and metric over the runs of each (architecture, config) pair.
grouped_runs = group_by_arch_and_config(filterd_runs, val_metric="val_acc")
grouped_runs  # some hyperparameter combinations may be missing if all of their runs were filtered out for too big a loss discrepancy

In [None]:
def _min_max_scale(values: pd.Series, higher_is_better: bool) -> pd.Series:
    """Map *values* onto [0, 1] so that 1.0 is always the best candidate."""
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # Every candidate ties on this criterion (e.g. only one row), so a
        # plain min-max would divide by zero and yield NaN. Give each
        # non-missing entry the full score instead; missing entries stay NaN.
        return pd.Series(1.0, index=values.index).where(values.notna())
    scaled = (values - lowest) / spread
    return scaled if higher_is_better else 1 - scaled


def score_models(df: pd.DataFrame, val_metric: str = "val_acc",
                 acc_weight: float = 0.7, loss_weight: float = 0.3) -> pd.DataFrame:
    """Rank configurations by a weighted mix of validation metric and loss.

    Both criteria are min-max normalised across the rows of ``df``: the
    lowest ``avg_val_loss`` and the highest ``avg_<val_metric>`` each get
    1.0. When all rows share the same value for a criterion, every row
    gets 1.0 for it rather than NaN.

    Args:
        df: Table with ``avg_val_loss`` and ``avg_<val_metric>`` columns.
            It is not modified.
        val_metric: Metric name used to build the ``avg_<val_metric>``
            column name.
        acc_weight: Weight of the normalised metric in the final score.
        loss_weight: Weight of the normalised loss in the final score.

    Returns:
        A copy of ``df`` with ``loss_score``, ``metric_score`` and
        ``score`` columns added, sorted by ``score`` in descending order.
    """
    df = df.copy()

    # Normalize
    df["loss_score"] = _min_max_scale(df["avg_val_loss"], higher_is_better=False)
    df["metric_score"] = _min_max_scale(df[f"avg_{val_metric}"], higher_is_better=True)

    # Weighted score
    df["score"] = acc_weight * df["metric_score"] + loss_weight * df["loss_score"]
    return df.sort_values("score", ascending=False)


In [None]:
# Run this cell to see which configuration the scoring algorithm ranks best (the automated pipeline uses the same algorithm):
scored = score_models(grouped_runs, val_metric="val_acc", acc_weight=0.7, loss_weight=0.3)  # you can modify the weights based on how important you think those metrics should be
scored  # look at scores

In [None]:
def select_best_configs(df_scored: pd.DataFrame) -> pd.DataFrame:
    """Pick, for every architecture, the row with the highest score.

    Args:
        df_scored: Table with at least ``architecture`` and ``score``
            columns.

    Returns:
        One row per architecture (the one whose ``score`` is the maximum
        within that architecture), with a fresh 0..n-1 index.
    """
    scores_by_arch = df_scored.groupby("architecture")["score"]
    # idxmax gives the index label of the top-scoring row in each group.
    winner_labels = scores_by_arch.idxmax()
    winners = df_scored.loc[winner_labels]
    return winners.reset_index(drop=True)

In [None]:
# Keep only the highest-scoring configuration of each architecture.
best_config = select_best_configs(scored)
best_config