In [None]:
# Reload imported modules before each cell execution so that edits to project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

from segmentation_failures.utils import GLOBAL_SEEDS
from segmentation_failures.utils.io import load_expt_config

# Root directory of the experiment group audited by this notebook.
# Other roots that have been used here (swap the string below to switch):
#   /mnt/E132-Projekte/Projects/2023_MaxZ_segmentation_failures/cluster_logs/logs/paper_expts_2403
#   /mnt/E132-Projekte/Projects/2023_MaxZ_segmentation_failures/cluster_logs/revision_newdataset_2407
#   /mnt/E132-Projekte/Projects/2023_MaxZ_segmentation_failures/cluster_logs/revision_newdataset_2408
#   /mnt/E132-Projekte/Projects/2023_MaxZ_segmentation_failures/cluster_logs/revision_architecture_2408/
#   /mnt/cluster_checkpoints_ro/segfail_project/revision_architecture_2408
#   /mnt/cluster_checkpoints_ro/segfail_project/logs/paper_expts_2403
expt_root = Path("/mnt/cluster_checkpoints_ro/segfail_project/revision_newdataset_2408")

In [None]:
# Inverse of GLOBAL_SEEDS: look up the GLOBAL_SEEDS key that belongs to a seed
# value as stored in an experiment config.
seed_inv_mapping = {
    seed_value: seed_key for seed_key, seed_value in GLOBAL_SEEDS.items()
}

# Iterate recursively through all directories in expt_root and find the run
# directories of the requested task (e.g. train_seg or train_image_csf).
# For each run directory, check the seed and whether the run is completed, and
# enter this information in a dataframe (one per task) with the columns
# expt_id, expt_name, dataset, seed, fold, completed, num_checkpoints,
# ckpt_list, test_results_found.
def check_runs(expt_group_dir, task, test_results_path=None):
    """Collect status information for every run of ``task`` below ``expt_group_dir``.

    Run directories are the sub-directories of every path that matches
    ``*/*/*/{task}`` relative to ``expt_group_dir``.

    Args:
        expt_group_dir: ``pathlib.Path`` of the experiment group to scan.
        task: Name of the task directory to look for, e.g. ``"train_seg"``.
        test_results_path: Optional path relative to a run directory. When
            given, ``test_results_found`` records whether that path exists;
            otherwise the column is always ``False``.

    Returns:
        A tuple ``(runs_df, configs)``. ``runs_df`` is a ``pandas.DataFrame``
        with one row per run and the columns ``expt_id``, ``expt_name``,
        ``dataset``, ``seed``, ``fold``, ``completed``, ``num_checkpoints``,
        ``ckpt_list`` and ``test_results_found``. ``configs`` maps ``expt_id``
        (the run directory name) to the loaded config; if two runs share a
        directory name, the one visited last wins.

    Runs whose config cannot be loaded, or whose seed is not a value of
    ``GLOBAL_SEEDS``, are reported via ``print`` and skipped so that a single
    odd run does not abort the whole scan.
    """
    all_results = []
    all_configs = {}
    # Materialise the glob so tqdm knows the total number of task directories.
    for train_dir in tqdm(list(expt_group_dir.glob(f"*/*/*/{task}"))):
        for expt_dir in train_dir.glob("*"):
            if expt_dir.is_file():
                continue
            expt_id = expt_dir.name
            try:
                cfg = load_expt_config(expt_dir)
            except FileNotFoundError:
                print(f"Could not load config for {expt_dir}")
                continue

            # Dataset id and fold are stored in different places depending on
            # whether the config has a top-level "dataset" entry.
            if "dataset" not in cfg:
                dataset_id = int(cfg.datamodule.hparams.dataset_id)
                fold = int(cfg.datamodule.hparams.fold)
            else:
                dataset_id = int(cfg.dataset.dataset_id)
                fold = int(cfg.datamodule.fold)

            # Translate the configured seed value back to its GLOBAL_SEEDS key.
            # Unknown seeds are reported and skipped, mirroring the handling
            # of runs without a loadable config.
            if cfg.seed not in seed_inv_mapping:
                print(f"Unknown seed {cfg.seed} for {expt_dir}")
                continue
            seed = seed_inv_mapping[cfg.seed]

            # Sorted so the reported checkpoint order does not depend on the
            # filesystem's directory iteration order.
            ckpt_dir = expt_dir / "checkpoints"
            ckpt_list = []
            if ckpt_dir.is_dir():
                ckpt_list = sorted(
                    x for x in ckpt_dir.iterdir() if x.suffix == ".ckpt"
                )

            results_found = False
            if test_results_path is not None:
                results_found = (expt_dir / test_results_path).exists()

            all_results.append(
                {
                    "expt_id": expt_id,
                    "expt_name": cfg.expt_name,
                    "dataset": dataset_id,
                    "seed": seed,
                    "fold": fold,
                    # A run counts as completed when it left a COMPLETED marker.
                    "completed": (expt_dir / "COMPLETED").exists(),
                    "num_checkpoints": len(ckpt_list),
                    "ckpt_list": ckpt_list,
                    "test_results_found": results_found,
                }
            )
            all_configs[expt_id] = cfg

    return pd.DataFrame(all_results), all_configs

In [None]:
# Segmentation training: scan every `train_seg` run below expt_root.
# Note that `configs` is reassigned by each check_runs call in this notebook.
train_seg_df, configs = check_runs(expt_group_dir=expt_root, task="train_seg")

In [None]:
# Completed `train_seg` runs, one row per (expt_name, seed) and one column per
# (dataset, fold). Missing combinations show up as NaN.
#
# Variations used before:
#   - single dataset: start from train_seg_df.groupby("dataset").get_group(515)
#     and pivot with columns="fold"
#   - audit checkpoints instead of completion: values="num_checkpoints"
#   - restrict columns: summary_df.loc[:, ([514, 540, 560], slice(None))]
summary_df = (
    train_seg_df.groupby(["dataset", "expt_name", "fold", "seed"])[
        ["completed", "num_checkpoints"]
    ]
    .sum()
    .reset_index()
    .pivot(index=["expt_name", "seed"], columns=["dataset", "fold"], values="completed")
)
summary_df

In [None]:
# Checkpoint files of the train_seg runs with dataset 560, fold 1, seed 1
# (raises KeyError if no such run was found).
train_seg_df.groupby(["dataset", "fold", "seed"]).get_group((560, 1, 1))["ckpt_list"].tolist()

In [None]:
# CSF validation: scan every `validate_pixel_csf` run below expt_root.
df_csf_validate, configs = check_runs(expt_group_dir=expt_root, task="validate_pixel_csf")


In [None]:

# Completed `validate_pixel_csf` runs, one row per (expt_name, seed) and one
# column per (dataset, fold).
# To restrict the columns: summary_df.loc[:, ([514, 540], slice(None))]
summary_df = (
    df_csf_validate.groupby(["dataset", "expt_name", "fold", "seed"])[
        ["completed", "num_checkpoints"]
    ]
    .sum()
    .reset_index()
    .pivot(index=["expt_name", "seed"], columns=["dataset", "fold"], values="completed")
)
summary_df

In [None]:
# CSF training: scan every `train_image_csf` run below expt_root.
df_csf_train, configs = check_runs(expt_group_dir=expt_root, task="train_image_csf")

In [None]:
# No row filtering is applied at the moment; `filtered_df` is kept as the hook
# for restricting rows, e.g. to the mahalanobis / vae experiments via
#   df_csf_train.expt_name.str.contains("mahalanobis|-vae_", regex=True)
# NOTE(review): the filters that used to be here (repeated jobs were selected
# with job_id >= 23960389) referenced `df_csf` and a `job_id` column, neither
# of which appears in the code shown here - confirm before re-enabling.
filtered_df = df_csf_train

# Completed `train_image_csf` runs summed over seeds: one row per expt_name,
# one column per (dataset, fold).
# To restrict the columns to datasets 514 and 540:
#   summary_df.loc[:, ([514, 540], slice(None))]
summary_df = (
    filtered_df.groupby(["dataset", "expt_name", "fold"])[
        ["completed", "num_checkpoints"]
    ]
    .sum()
    .reset_index()
    .pivot(index="expt_name", columns=["dataset", "fold"], values="completed")
)
summary_df

In [None]:
# Pixel CSF testing: scan every `test_pixel_csf` run below expt_root and record
# whether results/metrics.npz exists inside the run directory.
df_csf_pixel, configs = check_runs(
    expt_group_dir=expt_root,
    task="test_pixel_csf",
    test_results_path="results/metrics.npz",
)

In [None]:
# Number of `test_pixel_csf` runs (summed over seeds) whose results file was
# found: one row per expt_name, one column per (dataset, fold).
# To restrict the columns to dataset 511: summary_df.loc[:, (511, slice(None))]
summary_df = (
    df_csf_pixel.groupby(["dataset", "expt_name", "fold"])[
        ["completed", "test_results_found"]
    ]
    .sum()
    .reset_index()
    .pivot(index="expt_name", columns=["dataset", "fold"], values="test_results_found")
)
summary_df

In [None]:
# `test_fd` runs: scan every such run below expt_root and record whether
# analysis/fd_metrics.csv exists inside the run directory.
df_csf_img, configs = check_runs(
    expt_group_dir=expt_root,
    task="test_fd",
    test_results_path="analysis/fd_metrics.csv",
)

In [None]:
# Completed `test_fd` runs summed over seeds: one row per expt_name, one column
# per (dataset, fold).
# NOTE(review): this table shows "completed" although the scan also checks for
# a results file; use values="test_results_found" to audit the results file
# instead - confirm which one is intended.
# To restrict the columns to dataset 514: summary_df.loc[:, (514, slice(None))]
summary_df = (
    df_csf_img.groupby(["dataset", "expt_name", "fold"])[
        ["completed", "test_results_found"]
    ]
    .sum()
    .reset_index()
    .pivot(index="expt_name", columns=["dataset", "fold"], values="completed")
)
summary_df