In [1]:
from copy import deepcopy
from pathlib import Path

import pandas as pd

import lib

# Raise the row limit for rendered frames; presumably so the full per-dataset
# results table displayed at the end of the notebook is not truncated.
pd.set_option("display.max_rows", 1000)

In [2]:
# Dataset identifiers. They are passed to `lib.data.load_dataset_info` and are
# also used as path components of the experiment directories under `lib.EXP`.
GESTURE = "gesture"
CHURN = "churn"
CALIFORNIA = "california"
HOUSE = "house"
ADULT = "adult"
OTTO = "otto"
HIGGS_SMALL = "higgs-small"
FB_COMMENTS = "fb-comments"
SANTANDER = "santander"
COVTYPE = "covtype"
MICROSOFT = "microsoft"
# All datasets covered by this notebook.
DATASETS = [
    GESTURE,
    CHURN,
    CALIFORNIA,
    HOUSE,
    ADULT,
    OTTO,
    HIGGS_SMALL,
    FB_COMMENTS,
    SANTANDER,
    COVTYPE,
    MICROSOFT,
]

# Per-dataset descriptive columns: `aggregate` carries them through the groupby
# (taking the first value of each group) and `drop_details` removes them.
DETAILS = ["task_type", "n_objects", "n_features"]


def make_datasets_df():
    """Build a summary table with one row per dataset in ``DATASETS``.

    The rows come from ``lib.data.load_dataset_info``, are ordered by the
    ``size`` column (ascending) and re-indexed from zero; only the descriptive
    columns listed below are kept, in that order.
    """
    columns = [
        "name",
        "size",
        "n_features",
        "n_num_features",
        "n_cat_features",
        "task_type",
        "n_classes",
    ]
    infos = [lib.data.load_dataset_info(dataset) for dataset in DATASETS]
    return (
        pd.DataFrame.from_records(infos)
        .sort_values("size")
        .reset_index(drop=True)[columns]
    )


def collect_outputs(dir_, n_seeds):
    """List the finished run directories found directly inside ``dir_``.

    A child of ``dir_`` is returned when it is a directory, contains an entry
    named ``DONE``, and its name is an all-digit string whose integer value is
    below ``n_seeds``. An empty list is returned when ``dir_`` (after being
    resolved with ``lib.get_path``) does not exist.
    """
    root = lib.get_path(dir_)
    if not root.exists():
        return []

    def is_finished_seed(path):
        # The checks run in the same order as a chained `and`, so `int(...)`
        # is only reached for all-digit names.
        if not path.is_dir():
            return False
        if not (path / "DONE").exists():
            return False
        return path.name.isdigit() and int(path.name) < n_seeds

    return [path for path in root.iterdir() if is_finished_seed(path)]


def load_record(output, key, subkey):
    """Turn the report stored in ``output`` into a flat result record.

    Args:
        output: location of a finished run; resolved with ``lib.get_path``.
        key: label of the record. When ``None``, a label of the form
            ``"<program stem> | <name of output's parent directory>"`` is built.
        subkey: stored in the record unchanged.

    Returns:
        A dict with the dataset description (``dataset``, ``task_type``,
        ``n_objects``, ``n_features``), ``key``, ``subkey`` and one
        ``"<part>_score"`` entry for every ``lib.Part`` present in the
        report's metrics.
    """
    output = lib.get_path(output)
    report = lib.load_report(output)
    # A report produced by the "tune" program nests the report to analyse
    # under "best".
    if Path(report["program"]).stem == "tune":
        report = report["best"]

    # Reports of the "ensemble" program keep the program name and the data
    # path under different keys than the other reports.
    is_ensemble = Path(report["program"]).stem == "ensemble"
    if is_ensemble:
        program, data_path = report["single_model_program"], report["data"]
    else:
        program, data_path = report["program"], report["config"]["data"]["path"]

    info = lib.load_dataset_info(Path(data_path).name)
    if key is None:
        key = f"{Path(program).stem} | {output.relative_to(lib.PROJ).parent.name}"

    record = {
        "dataset": info["name"],
        "task_type": info["task_type"],
        "n_objects": info["size"],
        "n_features": info["n_num_features"] + info["n_cat_features"],
        "key": key,
        "subkey": subkey,
    }

    metrics = report["metrics"]
    for part in lib.Part:
        part_name = part.value
        if part_name not in metrics:
            continue
        score = metrics[part_name]["score"]
        # Scores of this one dataset are rescaled by a factor of 10000.
        if info["id"] == "house--default":
            score /= 10000
        record[f"{part_name}_score"] = score
    return record


def sort(df, by):
    """Order ``df`` by dataset size, dataset name and then the ``by`` column(s).

    ``n_objects`` and ``dataset`` are always sorted ascending. Each column in
    ``by`` (a single name or a list of names) is sorted descending when its
    name contains ``"score"`` and ascending otherwise. The returned frame has
    a fresh 0..n-1 index.
    """
    extra_columns = [by] if isinstance(by, str) else by
    sort_columns = ["n_objects", "dataset"] + extra_columns
    directions = [True, True]
    for column in extra_columns:
        directions.append("score" not in column)
    ordered = df.sort_values(sort_columns, ascending=directions)
    return ordered.reset_index(drop=True)


def make_df(records):
    """Create a results frame from ``records``, best validation score first.

    The ``subkey`` column is removed when none of its values is truthy. Rows
    are ordered with ``sort(..., "val_score")`` and re-indexed from zero.
    """
    frame = pd.DataFrame(records)
    has_subkeys = frame["subkey"].any()
    if not has_subkeys:
        frame = frame.drop(columns=["subkey"])
    ordered = sort(frame, "val_score")
    return ordered.reset_index(drop=True)


def format_scores(df, precision):
    """Return a copy of ``df`` with display-ready scores.

    For rows whose ``task_type`` equals ``lib.TaskType.REGRESSION.value``, the
    ``"<part>_best"`` and ``"<part>_score"`` values (for every ``lib.Part``)
    are multiplied by -1; presumably regression scores are stored negated so
    that "higher is better" holds for every task -- TODO confirm. Afterwards
    every float value of every row is rounded to ``precision`` digits.

    Args:
        df: frame with a ``task_type`` column.
        precision: number of digits passed to ``round``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """

    def f(record):
        # Work on a copy: pandas does not support functions that mutate the
        # object handed to them by `DataFrame.apply`.
        record = record.copy()
        if record["task_type"] == lib.TaskType.REGRESSION.value:
            for part in lib.Part:
                for suffix in "best", "score":
                    key = f"{part.value}_{suffix}"
                    if key in record:
                        record[key] *= -1
        # Snapshot the items first because the row is modified while iterating.
        for k, v in list(record.items()):
            if isinstance(v, float):
                record[k] = round(v, precision)
        return record

    return df.apply(f, axis=1)


def drop_details(df):
    """Return ``df`` without the descriptive columns listed in ``DETAILS``."""
    return df.drop(DETAILS, axis=1)


def drop_std(df):
    """Return ``df`` without the columns whose name ends with ``"_std"``."""
    std_columns = list(filter(lambda name: name.endswith("_std"), df.columns))
    return df.drop(columns=std_columns)


def build_df(records_info, precision=None, details=True):
    """Load all requested results into one sorted DataFrame.

    Args:
        records_info: iterable of 4-tuples ``(dir_, n_seeds, key, subkey)``.
            ``dir_`` and ``n_seeds`` are forwarded to ``collect_outputs``.
            ``key`` and ``subkey`` may each be ``None``, a string used as is,
            or a callable that receives the output directory and returns the
            label.
        precision: when not ``None``, scores are post-processed with
            ``format_scores(df, precision)``.
        details: when false, the ``DETAILS`` columns are dropped.

    Raises:
        RuntimeError: if no record could be collected.
    """

    def resolve(spec, output):
        # None and plain strings are used verbatim; anything else is called
        # with the output directory.
        if spec is None or isinstance(spec, str):
            return spec
        return spec(output)

    records = []
    for dir_, n_seeds, key_spec, subkey_spec in records_info:
        for output in collect_outputs(dir_, n_seeds):
            record = load_record(
                output, resolve(key_spec, output), resolve(subkey_spec, output)
            )
            if record is not None:
                records.append(record)
    if not records:
        raise RuntimeError("No records are available!")

    df = make_df(records)
    if precision is not None:
        df = format_scores(df, precision)
    return df if details else drop_details(df)


def aggregate(df):
    """Collapse per-run records into one row per ``(dataset, key)`` pair.

    For the test, val and train parts the mean and the standard deviation of
    ``"<part>_score"`` are computed (``"<part>_score"`` / ``"<part>_std"``);
    ``count`` is the number of non-missing test scores. The ``DETAILS``
    columns present in ``df`` are carried over by taking the first value of
    each group. Missing values in the result (e.g. the standard deviation of
    a single run) are replaced with 0.0.
    """
    aggrs = {}
    for part in ("test", "val", "train"):
        score_column = f"{part}_score"
        aggrs[score_column] = (score_column, "mean")
        aggrs[f"{part}_std"] = (score_column, "std")
    aggrs["count"] = ("test_score", "count")
    aggrs.update({x: (x, "first") for x in DETAILS if x in df.columns})

    grouped = df.groupby(["dataset", "key"]).agg(**aggrs)
    grouped["count"] = grouped["count"].astype(int)
    return grouped.reset_index().fillna(0.0)

In [4]:
# Use these flags to turn on/off the results for single models and ensembles.
# Results for ensembles are marked with '(e)'.
single_models = False
ensembles = True

# Upper bounds (exclusive) on the numeric run-directory names that
# `collect_outputs` accepts for each kind of result.
n_single_model_runs = 15
n_ensemble_runs = 3

algorithms = [
    # (directory name, formatted name, experiment "names")
    # example: ('mlp', 'MLP', [0, 1, 'two', 'hello-world']),
    ("xgboost_", "XGBoost", [0]),
    ("catboost_", "CatBoost", [0]),
    ("mlp", "MLP", [0]),
    ("mlp-lr", "MLP-LR", [0]),
    ("mlp-q-lr", "MLP-Q-LR", [0]),
    ("mlp-t-lr", "MLP-T-LR", [0]),
    ("mlp-plr", "MLP-PLR", [0]),
    ("resnet", "ResNet", [0]),
    ("resnet-lr", "ResNet-LR", [0]),
    ("resnet-q-lr", "ResNet-Q-LR", [0]),
    ("resnet-t-lr", "ResNet-T-LR", [0]),
    ("resnet-plr", "ResNet-PLR", [0]),
    ("transformer-l", "Transformer-l", [0]),
    ("transformer-lr", "Transformer-LR", [0]),
    ("transformer-q-lr", "Transformer-Q-LR", [0]),
    ("transformer-t-lr", "Transformer-T-LR", [0]),
    ("transformer-plr", "Transformer-PLR", [0]),
]

results_info = []
# Iterate over a subset of DATASETS here to restrict the report.
for dataset in DATASETS:
    for algorithm_info in algorithms:
        # A bare string is shorthand for (dir, same name, [0]).
        if isinstance(algorithm_info, str):
            algorithm_info = (algorithm_info, algorithm_info, [0])
        alg_dir, name, experiments = algorithm_info
        if name is None:
            name = alg_dir

        for experiment in experiments:
            # Experiment 0 is the default one and gets no "[...]" prefix.
            prefix = name if experiment == 0 else f"[{experiment}] {name}"
            if single_models:
                results_info.append(
                    (
                        lib.EXP / alg_dir / dataset / f"{experiment}_evaluation",
                        n_single_model_runs,
                        prefix,
                        lambda x: x.name,
                    )
                )
            if ensembles:
                results_info.append(
                    (
                        lib.EXP / alg_dir / dataset / f"{experiment}_ensemble_5",
                        n_ensemble_runs,
                        prefix + " (e)",
                        lambda x: x.name,
                    )
                )

df = build_df(results_info)
df = aggregate(df)
df = sort(df, "test_score")
df = format_scores(df, 4)
df = df.set_index(["dataset", "key"])

df_ranks = deepcopy(df).reset_index()
# `format_scores` flipped the sign of regression scores; flip it back so that
# "higher is better" holds for every dataset when ranking.
df_ranks.loc[df_ranks["task_type"] == "regression", "test_score"] *= -1
# Keyword arguments are required by `DataFrame.pivot` in pandas >= 2.0.
df_ranks = df_ranks.pivot(index="key", columns="dataset", values="test_score")
# Shorten the dataset names to their first word for a compact header.
df_ranks.columns = df_ranks.columns.map(lambda x: x.split()[0].split("-")[0])
df_ranks = df_ranks.rank(axis=0, ascending=False)
# Compute both statistics from the per-dataset ranks *before* adding either
# of them as a column, so that "std" is not polluted by the "avg" column.
rank_avg = df_ranks.mean(axis=1)
rank_std = df_ranks.std(axis=1)
df_ranks.insert(0, "avg", rank_avg)
df_ranks.insert(1, "std", rank_std)
df_ranks = df_ranks.sort_values("avg")

# Uncomment to hide the task_type / n_objects / n_features columns:
# df = drop_details(df)

display(make_datasets_df())
print(
    '\n!!!!!!!!!!!!!!!!!!\n'
    'WARNING: '
    'the ranks below are different from those from the paper, '
    'because here only the mean metrics are compared '
    'without taking standard deviations into account '
    '(in other words, the ranks here are only a rough estimate)'
    '\n!!!!!!!!!!!!!!!!!!\n'
)
display(df_ranks)
display(df)

Unnamed: 0,name,size,n_features,n_num_features,n_cat_features,task_type,n_classes
0,Gesture Phase,9873,32,32,0,multiclass,5.0
1,Churn Modelling,10000,11,10,1,binclass,
2,California Housing,20640,8,8,0,regression,
3,House 16H,22784,16,16,0,regression,
4,Adult,48842,14,6,8,binclass,
5,Otto Group Products,61878,93,93,0,multiclass,9.0
6,Higgs Small,98049,28,28,0,binclass,
7,Facebook Comments Volume,197080,51,50,1,regression,
8,Santander Customer Transactions,200000,200,200,0,binclass,
9,Covertype,581012,54,54,0,multiclass,7.0


Unnamed: 0_level_0,Unnamed: 1_level_0,test_score,test_std,val_score,val_std,train_score,train_std,count,task_type,n_objects,n_features
dataset,key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gesture Phase,MLP-PLR (e),0.6999,0.0026,0.7285,0.0023,0.995,0.0013,3,multiclass,9873,32
Gesture Phase,CatBoost (e),0.692,0.0023,0.7082,0.0006,1.0,0.0,3,multiclass,9873,32
Gesture Phase,ResNet-PLR (e),0.691,0.0078,0.7023,0.0042,0.9803,0.0129,3,multiclass,9873,32
Gesture Phase,Transformer-Q-LR (e),0.6903,0.0023,0.7091,0.0087,0.9996,0.0002,3,multiclass,9873,32
Gesture Phase,ResNet (e),0.69,0.0072,0.7034,0.0039,0.9679,0.0085,3,multiclass,9873,32
Gesture Phase,Transformer-PLR (e),0.6864,0.0076,0.7137,0.0113,0.9809,0.007,3,multiclass,9873,32
Gesture Phase,Transformer-T-LR (e),0.6864,0.0051,0.7129,0.0032,0.998,0.0018,3,multiclass,9873,32
Gesture Phase,XGBoost (e),0.6829,0.0016,0.7053,0.0037,1.0,0.0,3,multiclass,9873,32
Gesture Phase,ResNet-T-LR (e),0.6825,0.0075,0.7072,0.0076,0.9999,0.0001,3,multiclass,9873,32
Gesture Phase,MLP-Q-LR (e),0.682,0.0048,0.7011,0.0057,1.0,0.0,3,multiclass,9873,32
