# CIFAR-10 training time.

Let's first check that I have all the runs. 

In [1]:
# setup

# make a cell print all the outputs instead of just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pathlib import Path

In [2]:
%pwd
%ls ../../data/results | grep cifar
%ls ../../data/results/cifar10/ -l
%ls ../../data/results/cifar10/fcdd_20211221161549_cifar10_/ -l

'/home/bertoldo/repos/fcdd/python/analyse'

[0m[01;34mcifar10[0m/
[01;34mfcdd_20211221161549_cifar10_[0m/
[01;34mfcdd_20211221161549_cifar10__AE[0m/
[01;34mfcdd_20211221161549_cifar10__HSC[0m/
total 0
lrwxrwxrwx 1 bertoldo bertoldo 31 déc.  23 19:43 [0m[01;36mfcdd_20211221161549_cifar10_[0m -> [01;34m../fcdd_20211221161549_cifar10_[0m[K/
lrwxrwxrwx 1 bertoldo bertoldo 34 janv. 17 14:08 [01;36mfcdd_20211221161549_cifar10__AE[0m -> [01;34m../fcdd_20211221161549_cifar10__AE[0m[K/
lrwxrwxrwx 1 bertoldo bertoldo 35 déc.  23 19:43 [01;36mfcdd_20211221161549_cifar10__HSC[0m -> [01;34m../fcdd_20211221161549_cifar10__HSC[0m[K/
total 88
drwxrwxr-x 2 bertoldo bertoldo  4096 déc.  23 18:06 [0m[01;34mall_classwise_roc_curves[0m/
drwxrwxr-x 7 bertoldo bertoldo  4096 déc.  21 19:07 [01;34mnormal_0[0m/
drwxrwxr-x 7 bertoldo bertoldo  4096 déc.  21 22:23 [01;34mnormal_1[0m/
drwxrwxr-x 7 bertoldo bertoldo  4096 déc.  22 01:16 [01;34mnormal_2[0m/
drwxrwxr-x 7 bertoldo bertoldo  4096 déc.  22 03:57 [01;34mnormal_3

## funcs

In [3]:
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import re
from datetime import timedelta, datetime
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import numpy as np
from numpy import ndarray
from typing import Dict, Union
from pathlib import Path
import json
import re
import copy
from typing import List


# this is to get the strings associated to the classes in the fcdd code
# copied from: fcdd/python/fcdd/datasets/__init__.py
# inside function `str_labels`
# commit: 9f268d8fd2fee33a5c5f38cdfb781da927bdb614
CLASS_LABELS = {
    'cifar10': ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'],
    'fmnist': [
        't-shirt/top', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'
    ],
    'mvtec': [
        'bottle', 'cable', 'capsule', 'carpet', 'grid', 'hazelnut', 'leather',
        'metal_nut', 'pill', 'screw', 'tile', 'toothbrush', 'transistor',
        'wood', 'zipper'
    ],
    # 'imagenet': deepcopy(ADImageNet.ad_classes),
    # this one forwards to: fcdd/python/fcdd/datasets/imagenet.py
    # in: ADImageNet.ad_classes
    # at the same commit as above
    'imagenet': ['acorn', 'airliner', 'ambulance', 'American alligator', 'banjo', 'barn', 'bikini', 'digital clock',
                  'dragonfly', 'dumbbell', 'forklift', 'goblet', 'grand piano', 'hotdog', 'hourglass', 'manhole cover',
                  'mosque', 'nail', 'parking meter', 'pillow', 'revolver', 'dial telephone', 'schooner',
                  'snowmobile', 'soccer ball', 'stingray', 'strawberry', 'tank', 'toaster', 'volcano'],
    'pascalvoc': ['horse'],
}


def get_classes_labels_order(dataset: str) -> List[str]:
    return copy.deepcopy(CLASS_LABELS[dataset])


def get_class_label(class_dirname: str, dataset: str) -> str:
    return CLASS_LABELS[dataset][int(class_dirname.lstrip("normal_"))]


def get_training_time(path: Path) -> Dict[str, float]:
    """
    The is printed in `log.txt` as 
        START: 21-12-2021 22:46:18 
        DURATION: 1:13:29.009893 
    so this function parses that file.
    
    :param path: points to the 'log.txt' file inside an experiment (dataset + class + iteration)
    
    # simple test
    TEST_PATH = Path("../../data/results/mvtec/supervised_merged/normal_0/it_0/log.txt")
    get_training_time(TEST_PATH)
    del TEST_PATH
    """
    DURATION_LINE_REGEX = r"^DURATION: (.+) $"
    DELTA_DATETIME_FORMAT = "%H:%M:%S.%f"
    
    # src: https://stackoverflow.com/a/1327389/9582881
    search = re.search(DURATION_LINE_REGEX, path.read_text(), re.MULTILINE)
    assert search is not None
    delta_str = search.group(1)
    assert delta_str != ""
    # src: https://stackoverflow.com/a/12352624/9582881
    t = datetime.strptime(delta_str, DELTA_DATETIME_FORMAT)
    delta = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
    assert delta > timedelta(0)
    return {"training_timedelta": delta}


def get_roc(path: Path) -> Dict[str, Union[float, ndarray]]:
    """
    :param path: points to the 'roc.json' file inside an experiment (dataset + class + iteration)
    """
    ROC_JSON_EXPECTED_KEYS = {"tpr", "fpr", "ths", "auc"}
    ROC_JSON_LISTS = {"tpr", "fpr", "ths"}
    
    with path.open("r") as json_file:
        payload = json.load(json_file)
    
    assert type(payload) == dict, f"{type(payload)=}"
    assert set(payload.keys()) == ROC_JSON_EXPECTED_KEYS, f"{set(payload.keys())=}"
    
    # transform the lists in ndarrays
    for list_key in ROC_JSON_LISTS:
        list_obj = payload[list_key]
        assert type(list_obj) == list, f"{type(list_obj)=}"
        payload[list_key] = np.array(list_obj)
        
    return payload


def get_pixelwise_roc(path: Path) -> Dict[str, float]:
    """
    The pixel-wise roc is often lost in mvtec because the images are big so the json
    with all the tpr/fpr/ths would be too big (there's a limit of 10MB somewhere in the code)
    but the pixel-wise ROC-AUC is printed in `log.txt` as like
    ##### GTMAP ROC TEST SCORE 0.9562439079836628 #####
    so this function parses that file.
    
    :param path: points to the 'log.txt' file inside an experiment (dataset + class + iteration)
    """
    # "gtmap" = "pixel-wise" here
    # i got the floating point pattern in stackoverflow
    # src: https://stackoverflow.com/a/12643073/9582881
    GTMAP_ROC_LINE_REGEX = r"##### GTMAP ROC TEST SCORE (([0-9]*[.])?[0-9]+) #####"
    
    # src: https://stackoverflow.com/a/1327389/9582881
    search = re.search(GTMAP_ROC_LINE_REGEX, path.read_text())
    assert search is not None
    val = float(search.group(1))
    assert 1 >= val >= 0
    return {"auc": val}


def experiment_key(experiment_dir: Path) -> str:
    """
    :param experiment_dir: path to the experiment directory
    """
    return f"{experiment_dir.parent.name}.{experiment_dir.name}"


def build_experiment_payload(experiment_dir: Path, dataset: str) -> Dict[str, Union[str, float, ndarray]]:
    """
    :param experiment_dir: path to the experiment directory
    """
    normal_class_str = experiment_dir.parent.name
    roc_json_path = experiment_dir / "roc.json"
    pixelwise_roc_pdf_path = experiment_dir / "gtmap_roc_curve.pdf"
    print_log_path = experiment_dir / "print.log"
    log_txt_path = experiment_dir / "log.txt"
    
    return {
        "normal_class": normal_class_str,
        "it": experiment_dir.name,
        "dir": experiment_dir,
        "roc_json_path": roc_json_path,
        "normal_class_label": get_class_label(normal_class_str, dataset),
        **get_training_time(log_txt_path),
        **{
            # this is a renaming so sample_roc/pixel_roc won't conflict
            f"sample_roc.{key}": val 
            for key, val in get_roc(roc_json_path).items()
        },
        **(
            {
                f"pixel_roc.{key}": val 
                for key, val in get_pixelwise_roc(print_log_path).items()
            }
            if pixelwise_roc_pdf_path.exists() else
            {}
        )
    }


def get_all_rocs_experiment(path: Path, dataset: str) -> Dict[str, Dict[str, Union[str, float, ndarray]]]:
    
    """
    :param path: a folder that looks like 'fcdd_20211220193242_fmnist_' 
                 a whole experiment on a dataset with all iterations and nominal classes inside, 
                 the structure should look like
                 
                path/
                path/normal_0
                path/normal_1
                ...
                path/normal_9/
                path/normal_9/it_0
                ...
                path/normal_9/it_4/roc.json
    """
    assert path.is_dir()
    assert dataset in CLASS_LABELS
    
    experiments = {
        (key := experiment_key(iterpath)): {
            **build_experiment_payload(iterpath, dataset=dataset),
            **{"key": key},
        }
        
        # first level: classes
        for dirpath in path.glob("normal_*")
        if dirpath.is_dir()
        
        # second level: iterations
        for iterpath in dirpath.glob("it_*")
        if iterpath.is_dir()
    }
    
    return experiments


# some simple tests
TEST_PATH = Path("../../data/results/mvtec/supervised_merged/normal_0/it_0")
build_experiment_payload(TEST_PATH, dataset="mvtec")
del TEST_PATH

TEST_PATH = Path("../../data/results/mvtec/supervised_merged/")
all_dicts = get_all_rocs_experiment(TEST_PATH, dataset="mvtec")
len(all_dicts)
all_dicts[sorted(all_dicts)[0]]
del TEST_PATH, all_dicts

{'normal_class': 'normal_0',
 'it': 'it_0',
 'dir': PosixPath('../../data/results/mvtec/supervised_merged/normal_0/it_0'),
 'roc_json_path': PosixPath('../../data/results/mvtec/supervised_merged/normal_0/it_0/roc.json'),
 'normal_class_label': 'bottle',
 'training_timedelta': datetime.timedelta(seconds=1898),
 'sample_roc.tpr': array([0.        , 0.01587302, 0.95238095, 0.95238095, 0.96825397,
        0.96825397, 0.98412698, 0.98412698, 1.        , 1.        ]),
 'sample_roc.fpr': array([0.  , 0.  , 0.  , 0.05, 0.05, 0.15, 0.15, 0.65, 0.65, 1.  ]),
 'sample_roc.ths': array([1.99086225e+00, 9.90862250e-01, 1.40450727e-02, 6.54330710e-03,
        5.00143599e-03, 3.16883228e-03, 2.53253849e-03, 1.24803372e-03,
        1.06214371e-03, 5.94788231e-04]),
 'sample_roc.auc': 0.9865079365079364,
 'pixel_roc.auc': 0.9645627618672489}

75

{'normal_class': 'normal_0',
 'it': 'it_0',
 'dir': PosixPath('../../data/results/mvtec/supervised_merged/normal_0/it_0'),
 'roc_json_path': PosixPath('../../data/results/mvtec/supervised_merged/normal_0/it_0/roc.json'),
 'normal_class_label': 'bottle',
 'training_timedelta': datetime.timedelta(seconds=1898),
 'sample_roc.tpr': array([0.        , 0.01587302, 0.95238095, 0.95238095, 0.96825397,
        0.96825397, 0.98412698, 0.98412698, 1.        , 1.        ]),
 'sample_roc.fpr': array([0.  , 0.  , 0.  , 0.05, 0.05, 0.15, 0.15, 0.65, 0.65, 1.  ]),
 'sample_roc.ths': array([1.99086225e+00, 9.90862250e-01, 1.40450727e-02, 6.54330710e-03,
        5.00143599e-03, 3.16883228e-03, 2.53253849e-03, 1.24803372e-03,
        1.06214371e-03, 5.94788231e-04]),
 'sample_roc.auc': 0.9865079365079364,
 'pixel_roc.auc': 0.9645627618672489,
 'key': 'normal_0.it_0'}

## gather data

In [4]:
import pandas as pd

### load dataframe

In [5]:
# variable
records = get_all_rocs_experiment(
    path=Path("../../data/results/cifar10/fcdd_20211221161549_cifar10_/"), 
    dataset="cifar10"
)

# fixed
df = pd.DataFrame.from_records(data=records).T
df = df.drop(columns=["dir", "roc_json_path"])

In [6]:
df.head()

Unnamed: 0,normal_class,it,normal_class_label,training_timedelta,sample_roc.tpr,sample_roc.fpr,sample_roc.ths,sample_roc.auc,key
normal_0.it_0,normal_0,it_0,airplane,0 days 00:28:50,"[0.0, 0.00011111111111111112, 0.11222222222222...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.001, 0.001, 0.002,...","[51.81818389892578, 50.81818389892578, 16.0511...",0.948551,normal_0.it_0
normal_0.it_1,normal_0,it_1,airplane,0 days 00:29:58,"[0.0, 0.00011111111111111112, 0.14266666666666...","[0.0, 0.0, 0.0, 0.001, 0.001, 0.002, 0.002, 0....","[60.85507583618164, 59.85507583618164, 14.4678...",0.945596,normal_0.it_1
normal_0.it_2,normal_0,it_2,airplane,0 days 00:34:50,"[0.0, 0.00011111111111111112, 0.20766666666666...","[0.0, 0.0, 0.0, 0.001, 0.001, 0.002, 0.002, 0....","[62.59052276611328, 61.59052276611328, 12.3422...",0.947252,normal_0.it_2
normal_0.it_3,normal_0,it_3,airplane,0 days 00:38:29,"[0.0, 0.00011111111111111112, 0.22711111111111...","[0.0, 0.0, 0.0, 0.001, 0.001, 0.002, 0.002, 0....","[53.17036819458008, 52.17036819458008, 11.7287...",0.949191,normal_0.it_3
normal_0.it_4,normal_0,it_4,airplane,0 days 00:38:51,"[0.0, 0.00011111111111111112, 0.14966666666666...","[0.0, 0.0, 0.0, 0.001, 0.001, 0.002, 0.002, 0....","[51.99217224121094, 50.99217224121094, 14.3329...",0.948097,normal_0.it_4


### basic checks

- number of classes
- number of iterations


In [7]:
df[["key", "normal_class", "normal_class_label", "it"]].describe()

Unnamed: 0,key,normal_class,normal_class_label,it
count,50,50,50,50
unique,50,10,10,5
top,normal_0.it_0,normal_0,airplane,it_0
freq,1,5,5,10


In [8]:
df.groupby("normal_class")[["it"]].describe().T

Unnamed: 0,normal_class,normal_0,normal_1,normal_2,normal_3,normal_4,normal_5,normal_6,normal_7,normal_8,normal_9
it,count,5,5,5,5,5,5,5,5,5,5
it,unique,5,5,5,5,5,5,5,5,5,5
it,top,it_0,it_0,it_0,it_0,it_0,it_0,it_0,it_0,it_0,it_0
it,freq,1,1,1,1,1,1,1,1,1,1


In [9]:
df.columns

Index(['normal_class', 'it', 'normal_class_label', 'training_timedelta',
       'sample_roc.tpr', 'sample_roc.fpr', 'sample_roc.ths', 'sample_roc.auc',
       'key'],
      dtype='object')

In [10]:
cifar10_df = df

## training time

In [11]:
cifar10_df.head(1)

Unnamed: 0,normal_class,it,normal_class_label,training_timedelta,sample_roc.tpr,sample_roc.fpr,sample_roc.ths,sample_roc.auc,key
normal_0.it_0,normal_0,it_0,airplane,0 days 00:28:50,"[0.0, 0.00011111111111111112, 0.11222222222222...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.001, 0.001, 0.002,...","[51.81818389892578, 50.81818389892578, 16.0511...",0.948551,normal_0.it_0


In [13]:
training_time_df = pd.concat(
    [
        df_[["normal_class_label", "it", "training_timedelta", "key"]]
        for df_ in [cifar10_df]
    ], 
    axis=0, 
    keys=("cifar10", )
).droplevel(1, axis=0)

training_time_df.head()

Unnamed: 0,normal_class_label,it,training_timedelta,key
cifar10,airplane,it_0,0 days 00:28:50,normal_0.it_0
cifar10,airplane,it_1,0 days 00:29:58,normal_0.it_1
cifar10,airplane,it_2,0 days 00:34:50,normal_0.it_2
cifar10,airplane,it_3,0 days 00:38:29,normal_0.it_3
cifar10,airplane,it_4,0 days 00:38:51,normal_0.it_4


In [14]:
train_df = training_time_df.loc["cifar10"]

import numpy as np
# timedelta needs to be converted to int64 so one can compute std
# src: https://stackoverflow.com/a/44616595/9582881
train_df["tmp"] = train_df["training_timedelta"].values.astype(np.int64)
class_means = train_df.groupby("normal_class_label").agg({"tmp": ["mean", "std"]}).droplevel(0, axis='columns')
class_means["mean"] = pd.to_timedelta(class_means["mean"]).round("1s")
class_means["std"] = pd.to_timedelta(class_means["std"]).round("1s")

class_means.sort_values(by="mean", ascending=False)
class_means.describe()

del train_df, class_means

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["tmp"] = train_df["training_timedelta"].values.astype(np.int64)


Unnamed: 0_level_0,mean,std
normal_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1
ship,0 days 00:40:27,0 days 00:01:50
automobile,0 days 00:38:58,0 days 00:00:57
horse,0 days 00:36:41,0 days 00:00:52
bird,0 days 00:34:20,0 days 00:01:16
airplane,0 days 00:34:12,0 days 00:04:40
truck,0 days 00:33:30,0 days 00:00:55
frog,0 days 00:32:51,0 days 00:03:51
cat,0 days 00:32:00,0 days 00:01:06
deer,0 days 00:29:25,0 days 00:00:31
dog,0 days 00:29:16,0 days 00:00:19


Unnamed: 0,mean,std
count,10,10
mean,0 days 00:34:10,0 days 00:01:37.700000
std,0 days 00:03:41.203777343,0 days 00:01:27.457989915
min,0 days 00:29:16,0 days 00:00:19
25%,0 days 00:32:12.750000,0 days 00:00:52.750000
50%,0 days 00:33:51,0 days 00:01:01.500000
75%,0 days 00:36:05.750000,0 days 00:01:41.500000
max,0 days 00:40:27,0 days 00:04:40
