# experiments variability

I want to have a visual of the variability of the experiments

# setup

In [None]:
# make a cell print all the outputs instead of just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pathlib import Path

Let's first check that I have all the runs. 

## data

I create a link to where the data really is, inside a directory `data` next to the notebook

if things are moved away, make the link point to the right place

In [None]:
%pwd
%ls ../../data/results-report-rc2021/

In [None]:
# create the `data` directory next to the notebook (`-p`: no error if it already exists)
%mkdir -p data
%cd data
# symlink to the real results directory; the link takes the target's name
# NOTE(review): `ln -s` complains if the link already exists, so re-running
# this cell is expected to be noisy but harmless -- confirm
!ln -s ../../../data/results-report-rc2021 
%cd ..

In [None]:
%ls data/results-report-rc2021/

## funcs

In [None]:
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import re
from datetime import timedelta, datetime
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import re
import copy
from typing import List


# Human-readable class names per dataset, indexed by the class number used in
# the experiment directory names (`normal_<index>`).
# copied from the fcdd code: fcdd/python/fcdd/datasets/__init__.py
# inside function `str_labels`
# commit: 9f268d8fd2fee33a5c5f38cdfb781da927bdb614
CLASS_LABELS = {
    'cifar10': ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'],
    'fmnist': [
        't-shirt/top', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'
    ],
    'mvtec': [
        'bottle', 'cable', 'capsule', 'carpet', 'grid', 'hazelnut', 'leather',
        'metal_nut', 'pill', 'screw', 'tile', 'toothbrush', 'transistor',
        'wood', 'zipper'
    ],
    # in the fcdd code this entry is `deepcopy(ADImageNet.ad_classes)`,
    # which forwards to: fcdd/python/fcdd/datasets/imagenet.py
    # in: ADImageNet.ad_classes
    # (same commit as above); the list is inlined here
    'imagenet': ['acorn', 'airliner', 'ambulance', 'American alligator', 'banjo', 'barn', 'bikini', 'digital clock',
                  'dragonfly', 'dumbbell', 'forklift', 'goblet', 'grand piano', 'hotdog', 'hourglass', 'manhole cover',
                  'mosque', 'nail', 'parking meter', 'pillow', 'revolver', 'dial telephone', 'schooner',
                  'snowmobile', 'soccer ball', 'stingray', 'strawberry', 'tank', 'toaster', 'volcano'],
    'pascalvoc': ['horse'],
}


def get_classes_labels_order(dataset: str) -> List[str]:
    """
    Give the class labels of a dataset, ordered by class index.

    :param dataset: a key of `CLASS_LABELS` (a `KeyError` is raised otherwise)
    :return: a fresh copy of the labels, so the caller can modify it
             without touching `CLASS_LABELS`
    """
    labels_in_order = CLASS_LABELS[dataset]
    return copy.deepcopy(labels_in_order)


def get_class_label(class_dirname: str, dataset: str) -> str:
    """
    Translate a class directory name (e.g. 'normal_3') to its label in a dataset.

    :param class_dirname: name of a class directory, i.e. 'normal_<class index>'
    :param dataset: a key of `CLASS_LABELS`
    :return: the label at position <class index> in `CLASS_LABELS[dataset]`
    :raises ValueError: if what follows the 'normal_' prefix is not an integer
    """
    PREFIX = "normal_"
    # `str.lstrip("normal_")` would strip any of the *characters* n/o/r/m/a/l/_
    # rather than the prefix, which only works by accident -- so cut the
    # prefix explicitly (and leave the name untouched when it is absent)
    if class_dirname.startswith(PREFIX):
        class_index_str = class_dirname[len(PREFIX):]
    else:
        class_index_str = class_dirname
    return CLASS_LABELS[dataset][int(class_index_str)]


def get_training_time(path: Path) -> Dict[str, timedelta]:
    """
    Parse the training duration of an experiment out of its `log.txt`.

    The duration is printed in `log.txt` as 
        START: 21-12-2021 22:46:18 
        DURATION: 1:13:29.009893 
    so this function parses that file.

    The value looks like the `str()` of a `datetime.timedelta`, so the
    variants `1 day, 1:13:29.009893` (24 hours or more) and `1:13:29`
    (no fractional part) are accepted as well.

    :param path: points to the 'log.txt' file inside an experiment (dataset + class + iteration)
    :return: `{"training_timedelta": <timedelta>}`, truncated to whole seconds
    :raises AssertionError: if there is no 'DURATION: ...' line or the duration is not positive
    :raises ValueError: if the duration is not in one of the formats above

    # simple test
    TEST_PATH = Path("../../data/results/mvtec/supervised_merged/normal_0/it_0/log.txt")
    get_training_time(TEST_PATH)
    del TEST_PATH
    """
    # the trailing whitespace of the line is optional
    DURATION_LINE_REGEX = r"^DURATION: (.+?)[ \t]*$"
    # [<days> day(s), ]<hours>:<minutes>:<seconds>[.<fraction>]
    DELTA_REGEX = r"(?:(\d+) days?, )?(\d+):(\d{2}):(\d{2})(?:\.\d+)?"

    # src: https://stackoverflow.com/a/1327389/9582881
    search = re.search(DURATION_LINE_REGEX, path.read_text(), re.MULTILINE)
    assert search is not None, f"no 'DURATION: ...' line found in {path}"
    delta_str = search.group(1)

    # `datetime.strptime` cannot be used here: "%H" stops at 23 and "%f" is mandatory
    parsed = re.fullmatch(DELTA_REGEX, delta_str)
    if parsed is None:
        raise ValueError(f"unexpected duration format: {delta_str!r}")
    days, hours, minutes, seconds = (int(group or 0) for group in parsed.groups())

    # the fractional part is dropped on purpose, as before
    delta = timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    assert delta > timedelta(0)
    return {"training_timedelta": delta}


def get_roc(path: Path) -> Dict[str, Union[float, ndarray]]:
    """
    Load the ROC curve stored in an experiment's 'roc.json'.

    The json must be a dict with exactly the keys "tpr", "fpr", "ths"
    (lists, returned as ndarrays) and "auc" (returned as it is in the json).

    :param path: points to the 'roc.json' file inside an experiment (dataset + class + iteration)
    :return: the content of the json, with the lists converted to ndarrays
    """
    ROC_JSON_LISTS = {"tpr", "fpr", "ths"}
    ROC_JSON_EXPECTED_KEYS = ROC_JSON_LISTS | {"auc"}

    payload = json.loads(path.read_text())

    assert type(payload) is dict, f"{type(payload)=}"
    assert set(payload.keys()) == ROC_JSON_EXPECTED_KEYS, f"{set(payload.keys())=}"

    # validate every list first...
    for list_key in ROC_JSON_LISTS:
        list_obj = payload[list_key]
        assert type(list_obj) is list, f"{type(list_obj)=}"

    # ...then swap them for ndarrays (the key order of the json is kept)
    payload.update({list_key: np.array(payload[list_key]) for list_key in ROC_JSON_LISTS})
    return payload


def get_pixelwise_roc(path: Path) -> Dict[str, float]:
    """
    Read the pixel-wise ROC-AUC of an experiment from its log.

    For mvtec the json with the full pixel-wise curve (tpr/fpr/ths) is often
    missing: the images are big, so the file would exceed a size limit
    (10MB, somewhere in the code). The AUC itself is still printed in the log, like
    ##### GTMAP ROC TEST SCORE 0.9562439079836628 #####
    so this function parses that line ("gtmap" = "pixel-wise" here).

    :param path: points to the 'log.txt' file inside an experiment (dataset + class + iteration)
    :return: `{"auc": <float in [0, 1]>}`, taken from the first matching line
    """
    # the floating point pattern comes from stackoverflow
    # src: https://stackoverflow.com/a/12643073/9582881
    GTMAP_ROC_LINE_REGEX = r"##### GTMAP ROC TEST SCORE (([0-9]*[.])?[0-9]+) #####"

    log_text = path.read_text()
    # src: https://stackoverflow.com/a/1327389/9582881
    found = re.compile(GTMAP_ROC_LINE_REGEX).search(log_text)
    assert found is not None

    auc = float(found[1])
    assert 0 <= auc <= 1
    return dict(auc=auc)


def experiment_key(experiment_dir: Path) -> str:
    """
    Build the identifier of an experiment: '<parent dir name>.<dir name>'.

    :param experiment_dir: path to the experiment directory
    """
    class_dirname = experiment_dir.parent.name
    iteration_dirname = experiment_dir.name
    return ".".join((class_dirname, iteration_dirname))


def build_experiment_payload(experiment_dir: Path, dataset: str) -> Dict[str, Union[str, float, ndarray]]:
    """
    Collect everything known about one experiment (class + iteration) in a flat dict.

    Keys: "normal_class", "it", "dir", "roc_json_path", "normal_class_label",
    the training time, the sample-wise ROC as "sample_roc.<key>" and, only when
    'gtmap_roc_curve.pdf' exists in the directory, the pixel-wise ROC as
    "pixel_roc.<key>".

    :param experiment_dir: path to the experiment directory
    :param dataset: a key of `CLASS_LABELS`, used to translate the class directory to a label
    """
    class_dirname = experiment_dir.parent.name
    roc_file = experiment_dir / "roc.json"

    payload = {
        "normal_class": class_dirname,
        "it": experiment_dir.name,
        "dir": experiment_dir,
        "roc_json_path": roc_file,
        "normal_class_label": get_class_label(class_dirname, dataset),
    }
    payload.update(get_training_time(experiment_dir / "log.txt"))

    # the prefixes are a renaming so sample_roc/pixel_roc won't conflict
    for key, val in get_roc(roc_file).items():
        payload[f"sample_roc.{key}"] = val

    # the pixel-wise ROC only exists for some experiments
    if (experiment_dir / "gtmap_roc_curve.pdf").exists():
        # NOTE(review): the pixel-wise AUC is read from 'print.log', not 'log.txt' -- confirm
        for key, val in get_pixelwise_roc(experiment_dir / "print.log").items():
            payload[f"pixel_roc.{key}"] = val

    return payload


def get_all_experiments_data(path: Path, dataset: str) -> Dict[str, Dict[str, Union[str, float, ndarray]]]:
    """
    Gather the payload of every experiment (class + iteration) found under `path`.

    :param path: a folder that looks like 'fcdd_20211220193242_fmnist_' 
                 a whole experiment on a dataset with all iterations and nominal classes inside, 
                 the structure should look like
                 
                path/
                path/normal_0
                path/normal_1
                ...
                path/normal_9/
                path/normal_9/it_0
                ...
                path/normal_9/it_4/roc.json
    :param dataset: a key of `CLASS_LABELS`
    :return: `{experiment key: payload}`, where each payload also carries its own key under "key"
    """
    assert path.is_dir()
    assert dataset in CLASS_LABELS

    experiments = {}

    # first level: classes
    for dirpath in path.glob("normal_*"):
        if not dirpath.is_dir():
            continue

        # second level: iterations
        for iterpath in dirpath.glob("it_*"):
            if not iterpath.is_dir():
                continue

            key = experiment_key(iterpath)
            payload = build_experiment_payload(iterpath, dataset=dataset)
            payload["key"] = key
            experiments[key] = payload

    return experiments


# some simple tests
# TEST_PATH = Path("../../data/results/mvtec/supervised_merged/normal_0/it_0")
# build_experiment_payload(TEST_PATH, dataset="mvtec")
# del TEST_PATH

# TEST_PATH = Path("../../data/results/mvtec/supervised_merged/")
# all_dicts = get_all_experiments_data(TEST_PATH, dataset="mvtec")
# len(all_dicts)
# all_dicts[sorted(all_dicts)[0]]
# del TEST_PATH, all_dicts

# gather data

In [None]:
%ls data/results-report-rc2021/
%ls data/results-report-rc2021/cifar10/
%ls data/results-report-rc2021/mvtec/
%ls data/results-report-rc2021/fmnist/

In [None]:
import pandas as pd

In [None]:
# one entry per experiment: {experiment name: {experiment key: payload}}
# ssup = semi-supervised
# unsup = unsupervised
records = {
    name: get_all_experiments_data(path=Path(results_dir), dataset=dataset)
    for name, dataset, results_dir in (
        ("mvtec-ssup", "mvtec", "data/results-report-rc2021/mvtec/supervised_merged"),
        ("mvtec-unsup", "mvtec", "data/results-report-rc2021/mvtec/unsupervised_merged"),
        ("fmnist", "fmnist", "data/results-report-rc2021/fmnist/OE-CIFAR100/fcdd_20211220193450_fmnist_"),
        ("cifar10", "cifar10", "data/results-report-rc2021/cifar10/fcdd_20211221161549_cifar10_"),
    )
}

In [None]:
# df = DataFrame
# one DataFrame per experiment, indexed by (normal_class_label, it)
drop_columns = ["dir", "roc_json_path", "normal_class"]
dfs = {
    experiment: (
        # the records are keyed by experiment key, so transpose to get one row per run
        pd.DataFrame.from_records(data=rec).T
        .drop(columns=drop_columns)
        .reset_index()
        .drop(columns=["index", "key"])
        .set_index(["normal_class_label", "it"])
    )
    for experiment, rec in records.items()
}

# a bare expression inside a loop is not displayed by the notebook (even with
# `ast_node_interactivity = "all"`, which only covers top-level statements),
# so print the preview explicitly
for exp, exp_df in dfs.items():
    print(exp)
    print(exp_df.head(2))

In [None]:
# stack the per-experiment DataFrames; the experiment name becomes
# the outermost index level, called "experiment"
exp_keys = sorted(dfs)
frames_in_key_order = [dfs[name] for name in exp_keys]

df = pd.concat(objs=frames_in_key_order, axis=0, keys=exp_keys)
df.index = df.index.rename(names="experiment", level=0)

df.head()
df.index

## basic checks

- number of classes
- number of iterations


In [None]:
# summary of the index levels over the whole table
df.reset_index()[df.index.names].describe()

In [None]:
# same summary per experiment, for the index levels below "experiment"
df.reset_index().groupby("experiment")[df.index.names[1:]].describe()

In [None]:
# same summary per (experiment, class), for the remaining index levels
df.reset_index().groupby(["experiment", "normal_class_label"])[df.index.names[2:]].describe()

# mvtec

In [None]:
df.columns

In [None]:
# keep only the two mvtec experiments and drop the columns not used by the plots below
mvtec_df = (
    df.loc[["mvtec-ssup", "mvtec-unsup"]]
    .drop(columns=['training_timedelta', 'sample_roc.tpr', 'sample_roc.fpr', 'sample_roc.ths'])
)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Pixel-wise ROC-AUC of every mvtec run: one box + the individual points per
# class and per experiment (mvtec-ssup / mvtec-unsup), with the 15 classes
# split over 3 side-by-side subplots of 5 classes each.
f, axs = plt.subplots(1, 3, figsize=(15, 5), dpi=100)

classes = get_classes_labels_order("mvtec")

plots = [
    {"ax": axs[0], "classes": classes[0:5], },
    {"ax": axs[1], "classes": classes[5:10], },
    {"ax": axs[2], "classes": classes[10:15], },
]

for plot in plots:

    ax = plot["ax"]
    # dedicated names: reusing `classes`/`df` here would clobber the full class
    # list and the global DataFrame that the previous cells rely on
    plot_classes = plot["classes"]
    plot_df = mvtec_df.reset_index().set_index("normal_class_label").loc[plot_classes].reset_index()

    sns.boxplot(
        data=plot_df,
        ax=ax,
        x="pixel_roc.auc",
        y="normal_class_label",
        hue="experiment",
        # Otherwise, the boxes for each experiment will be plotted on top of each other.
        dodge=True,
        # the whiskers span the whole range (min to max)
        whis=[0, 100],
        width=.2,
        fliersize=20,
        linewidth=.5,
    )

    sns.stripplot(
        data=plot_df,
        ax=ax,
        x="pixel_roc.auc",
        y="normal_class_label",
        hue="experiment",
        # Otherwise, the points for each experiment will be plotted on top of each other.
        dodge=True,
        alpha=.5,
    )

# src: https://stackoverflow.com/a/63519038/9582881
axs[0].legend([], [], frameon=False)
axs[2].legend([], [], frameon=False)

# only keep the legend in the middle and with only dots
# (the first two handles come from the boxplot, the next two from the stripplot)
# src: https://stackoverflow.com/a/13589144/9582881
handles, labels = axs[1].get_legend_handles_labels()
axs[1].legend(handles[2:4], labels[2:4], frameon=True)

# rotate the names of the classes to be in vertical 
# src: https://www.delftstack.com/howto/matplotlib/how-to-rotate-x-axis-tick-label-text-in-matplotlib/#ax-set-xticklabels-xlabels-rotation-to-rotate-xticks-label-text
for ax in axs:
    ax.set_yticklabels([t.get_text() for t in ax.get_yticklabels()], rotation=90)

# don't repeat the axis title 
axs[1].axes.yaxis.set_label_text("")
axs[2].axes.yaxis.set_label_text("")

# set the limits of the 3 plots to be the same
xmin, xmax = mvtec_df["pixel_roc.auc"].min(), mvtec_df["pixel_roc.auc"].max()
for ax in axs:
    ax.set_xlim(left=xmin, right=xmax)

In [None]:
f