# Data Cleaning Notebook

In [1]:
import itertools
from pathlib import Path

import pandas as pd
from IPython.display import display

# The project root is taken to be the parent of the current working directory
# (NOTE(review): this assumes the notebook is run from a folder directly under
# the project root — confirm). Input and output CSVs live in its "data" folder.
PROJECT_ROOT = Path.cwd().parent
DATA_DIRECTORY = PROJECT_ROOT.joinpath("data")

In [2]:
# Load the raw benchmark results.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so no column ends up holding a mix of types
# (presumably the cause of the warning this cell printed — TODO confirm; an
# explicit `dtype=` mapping would be stricter still).
amlb_df = pd.read_csv(DATA_DIRECTORY / "results_shorter_amlb.csv", low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Cleaning Duplicates

In [3]:
# Order rows by ascending `result`, with missing results placed ahead of all
# others, so the last row of any group of rows is its highest available result.
amlb_df = amlb_df.sort_values("result", ascending=True, na_position="first")

def custom_dedup(group):
    """Reduce one group of duplicate rows to a single row.

    The choice depends on the row order of ``group``:

    * if at least one row has a non-missing ``result``, the last such row
      is kept;
    * otherwise (every ``result`` is missing) the first row is kept.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to collapse; must contain a ``result`` column.

    Returns
    -------
    pandas.DataFrame
        A slice of ``group`` with at most one row (empty if ``group`` is empty).
    """
    rows_with_result = group.dropna(subset=['result'])
    if rows_with_result.empty:
        # No usable result at all: fall back to the first row.
        return group.iloc[0:1]
    return rows_with_result.iloc[-1:]

# Keep one row per (task, framework, fold, constraint) combination.
# The groups are iterated explicitly instead of going through
# `groupby(...).apply(...)`: iteration always yields the full sub-frame, key
# columns included, whereas `apply` operating on the grouping columns is
# deprecated as of pandas 2.2. Groups come out sorted by key, and the result
# gets a fresh 0..n-1 index, matching the previous `reset_index(drop=True)`.
dedup_keys = ["task", "framework", "fold", "constraint"]
deduplicated_groups = [custom_dedup(group) for _, group in amlb_df.groupby(dedup_keys)]
if deduplicated_groups:
    amlb_df = pd.concat(deduplicated_groups, ignore_index=True)
else:
    # No groups (e.g. empty input): pd.concat rejects an empty list, so keep
    # an empty frame with the same columns instead.
    amlb_df = amlb_df.iloc[0:0].reset_index(drop=True)


In [4]:
# Sanity check: after deduplication no (framework, task, fold, constraint)
# combination may occur more than once.
assert not amlb_df.duplicated(subset=["framework", "task", "fold", "constraint"]).any()

## Show Missing Results

In [5]:
# Number of rows for every (constraint, framework) pair. The row limit is
# lifted only inside this block so that no pair is elided from the listing.
# (`display` is imported in the notebook's import cell.)
with pd.option_context("display.max_rows", None):
    display(amlb_df.groupby(by=["constraint", "framework"]).size())

constraint              framework                      
Mixed Time Constraints  AutoGluon_HQIL_10min               1040
                        AutoGluon_HQIL_30min               1040
                        AutoGluon_HQIL_5min                1040
                        AutoGluon_HQIL_60min               1040
                        AutoGluon_HQ_10min                 1040
                        AutoGluon_HQ_30min                 1040
                        AutoGluon_HQ_5min                  1040
                        AutoGluon_HQ_60min                 1040
                        AutoGluon_benchmark_10min          1040
                        AutoGluon_benchmark_30min          1040
                        AutoGluon_benchmark_5min           1040
                        AutoGluon_benchmark_60min          1040
                        FEDOT_10min                        1040
                        FEDOT_30min                        1040
                        FEDOT_5min              

In [6]:
# Count the missing (NaN) results recorded for each framework.
nan_counts = amlb_df['result'].isna().groupby(amlb_df['framework']).sum()
print(nan_counts)

framework
AutoGluon_HQIL_10min                20
AutoGluon_HQIL_30min                10
AutoGluon_HQIL_5min                 44
AutoGluon_HQIL_60min                 4
AutoGluon_HQ_10min                   0
AutoGluon_HQ_30min                   0
AutoGluon_HQ_5min                   10
AutoGluon_HQ_60min                   0
AutoGluon_benchmark_10min            0
AutoGluon_benchmark_30min            0
AutoGluon_benchmark_5min            10
AutoGluon_benchmark_60min            0
FEDOT_10min                        178
FEDOT_30min                        192
FEDOT_5min                         188
FEDOT_60min                        389
GAMA_10min                          33
GAMA_30min                          21
GAMA_5min                           85
GAMA_60min                          42
H2OAutoML_10min                     10
H2OAutoML_30min                      8
H2OAutoML_5min                      20
H2OAutoML_60min                     22
NaiveAutoML_10min                  304
NaiveAutoML_30m

In [8]:
# Write the cleaned frame to the data directory; the row index is not saved.
amlb_df.to_csv(path_or_buf=DATA_DIRECTORY / "amlb_all.csv", index=False)