# Data Cleaning Notebook

In [1]:
import itertools
from pathlib import Path
import pandas as pd

# Locations are derived from the kernel's current working directory: the
# project root is the directory one level above it, and the data files live
# in its "data" sub-directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIRECTORY = PROJECT_ROOT.joinpath("data")

In [2]:
# Load the results table from the project's data directory.
# NOTE(review): the recorded output of this cell shows the tail of a warning
# raised during the load; its message is cut off — confirm what it was
# (possibly mixed column dtypes) before relying on the inferred dtypes.
amlb_df = pd.read_csv(DATA_DIRECTORY.joinpath("results_shorter_amlb_early.csv"))

  exec(code_obj, self.user_global_ns, self.user_ns)


## Cleaning Duplicates

In [3]:
# Order rows by ascending "result" with missing results placed first, so that
# within any group the last row carrying a result is the one with the largest
# value — custom_dedup keeps exactly that row.
amlb_df = amlb_df.sort_values(by=["result"], ascending=True, na_position="first")

def custom_dedup(group):
    """Reduce one group of duplicate rows to at most a single row.

    Args:
        group: DataFrame holding the rows of one group; must have a
            'result' column.

    Returns:
        A DataFrame slice with the last row (in the group's current order)
        whose 'result' is not missing. If every 'result' is missing, the
        group's first row is returned instead (an empty slice for an empty
        group).
    """
    rows_with_result = group.dropna(subset=['result'])
    if rows_with_result.empty:
        # No usable result in this group: keep one placeholder row.
        return group.iloc[0:1]
    return rows_with_result.iloc[-1:]

# Collapse every (task, framework, fold, constraint) group to a single row.
# The groups are iterated explicitly instead of going through
# DataFrameGroupBy.apply: since pandas 2.2, apply warns when the function is
# handed the grouping columns and announces that they will be excluded in a
# future version, which combined with dropping the index would remove the key
# columns from the result. Iterating yields each complete sub-frame (key
# columns included) in sorted key order, so rows and columns match what
# apply(...).reset_index(drop=True) produced.
dedup_keys = ["task", "framework", "fold", "constraint"]
deduped_groups = [custom_dedup(group) for _, group in amlb_df.groupby(dedup_keys)]
amlb_df = (
    pd.concat(deduped_groups, ignore_index=True)
    if deduped_groups
    # pd.concat rejects an empty list; with no groups keep an empty frame.
    else amlb_df.iloc[0:0].reset_index(drop=True)
)


In [4]:
# Sanity check: after deduplication no (framework, task, fold, constraint)
# combination may occur in more than one row.
assert not amlb_df.duplicated(subset=["framework", "task", "fold", "constraint"]).any()

## Show Missing Results

Some (framework, fold, task, constraint) combinations do not have any entries:

In [5]:
from IPython.display import display

# Number of rows for every (type, constraint, framework) combination. The row
# limit is lifted only inside the context manager so the full table renders.
rows_per_combination = amlb_df.groupby(by=["type", "constraint", "framework"]).size()
with pd.option_context("display.max_rows", None):
    display(rows_per_combination)

type        constraint              framework                      
binary      Mixed Time Constraints  AutoGluon_FI_FT_IL_early_10min     410
                                    AutoGluon_FI_FT_IL_early_30min     410
                                    AutoGluon_FI_FT_IL_early_5min      410
                                    AutoGluon_FI_FT_IL_early_60min     410
                                    AutoGluon_HQIL_early_10min         410
                                    AutoGluon_HQIL_early_30min         410
                                    AutoGluon_HQIL_early_5min          410
                                    AutoGluon_HQIL_early_60min         410
                                    AutoGluon_HQ_early_10min           410
                                    AutoGluon_HQ_early_30min           410
                                    AutoGluon_HQ_early_5min            410
                                    AutoGluon_HQ_early_60min           410
                                

In [6]:
# Count rows with a missing result per framework: flag every row whose
# "result" is NaN, then add the flags up within each framework.
result_is_missing = amlb_df["result"].isna()
nan_counts = result_is_missing.groupby(amlb_df["framework"]).sum()

print(nan_counts)

framework
AutoGluon_FI_FT_IL_early_10min       0
AutoGluon_FI_FT_IL_early_30min       0
AutoGluon_FI_FT_IL_early_5min        0
AutoGluon_FI_FT_IL_early_60min       0
AutoGluon_HQIL_early_10min          19
AutoGluon_HQIL_early_30min          10
AutoGluon_HQIL_early_5min           41
AutoGluon_HQIL_early_60min           0
AutoGluon_HQ_early_10min             0
AutoGluon_HQ_early_30min             0
AutoGluon_HQ_early_5min             10
AutoGluon_HQ_early_60min             1
AutoGluon_benchmark_early_10min      0
AutoGluon_benchmark_early_30min      0
AutoGluon_benchmark_early_5min       9
AutoGluon_benchmark_early_60min      0
FEDOT_early_10min                  313
FEDOT_early_30min                  335
FEDOT_early_5min                   300
FEDOT_early_60min                  377
H2OAutoML_early_10min                0
H2OAutoML_early_30min                0
H2OAutoML_early_5min                 1
H2OAutoML_early_60min                5
TPOT_early_10min                   181
TPOT_early_30mi

In [7]:
# Write the cleaned table into the data directory; the row index is omitted.
amlb_df.to_csv(DATA_DIRECTORY.joinpath("amlb_all_early.csv"), index=False)