In [1]:
!pip install autogluon openml

Collecting autogluon
  Downloading autogluon-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting autogluon.core==1.3.1 (from autogluon.core[all]==1.3.1->autogluon)
  Downloading autogluon.core-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.3.1 (from autogluon)
  Downloading autogluon.features-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.3.1 (from autogluon.tabular[all]==1.3.1->autogluon)
  Downloading autogluon.tabular-1.3.1-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.3.1 (from autogluon)
  Downloading autogluon.multimodal-1.3.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.timeseries==1.3.1 (from autogluon.timeseries[all]==1.3.1->autogluon)
  Downloading autogluon.timeseries-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.7.0,>=1.4.0 (from autogluon.core==1.3.1->autogluon.core[all]==1.3.1->autogluon)
  D

In [2]:
import time
import pandas as pd
import numpy as np
import openml
import os
import math # Importa a biblioteca de matemática para o cálculo do teto

from autogluon.tabular import TabularPredictor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# Configurações
SEED = 42

def carregar_base_openml(openml_id):
    """Carrega um dataset do OpenML e prepara os tipos de dados."""
    dataset = openml.datasets.get_dataset(openml_id, download_data=True, download_qualities=True,
                                          download_features_meta_data=True)
    X, y, _, attribute_names = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)

    if isinstance(y, pd.Series):
        y = y.values.ravel()

    categorical_cols_names = [name for i, name in enumerate(attribute_names) if
                              dataset.features[i].data_type == 'nominal']
    for col_name in categorical_cols_names:
        if col_name in X.columns:
            X[col_name] = X[col_name].astype('category')

    return X, y


In [3]:
def run_autogluon_experiment(openml_id):
    """
    Executa o fluxo completo de avaliação para o AutoGluon em um dataset.
    """
    print(f"--- Processando Dataset ID: {openml_id} com AutoGluon ---")
    
    AUTOML_TIME_BUDGET = 3600

    # 1. Carregamento e pré-processamento dos dados
    X_df, y_orig = carregar_base_openml(openml_id)

    X_train_df, X_test_df, y_train_orig, y_test_orig = train_test_split(
        X_df, y_orig, test_size=0.3, random_state=SEED, stratify=(y_orig if len(np.unique(y_orig)) > 1 else None)
    )

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train_orig)
    y_test_enc = le.transform(y_test_orig)
    
    # Lógica de seleção de métrica dinâmica
    n_classes = len(le.classes_)
    if n_classes > 2:
        metric_to_use = 'roc_auc_ovo'
        print(f"Problema multiclasse detectado ({n_classes} classes). Usando a métrica: {metric_to_use}")
    else:
        metric_to_use = 'roc_auc'
        print(f"Problema binário detectado. Usando a métrica: {metric_to_use}")

    # Imputação de dados faltantes
    num_cols = X_train_df.select_dtypes(include=np.number).columns
    if len(num_cols) > 0 and X_train_df[num_cols].isnull().sum().sum() > 0:
        imp = SimpleImputer(strategy='median')
        X_train_df.loc[:, num_cols] = imp.fit_transform(X_train_df[num_cols])
        X_test_df.loc[:, num_cols] = imp.transform(X_test_df[num_cols])

    cat_cols = X_train_df.select_dtypes(include=['category', 'object']).columns
    if len(cat_cols) > 0:
        for col in cat_cols:
            X_train_df.loc[:, col] = X_train_df[col].astype(str).fillna("__MISSING__")
            X_test_df.loc[:, col] = X_test_df[col].astype(str).fillna("__MISSING__")

    # 2. Treinamento com AutoGluon
    t0_total = time.time()

    train_df_ag = X_train_df.copy()
    target_col = 'target'
    train_df_ag[target_col] = y_train_enc
    
    save_path = f'autogluon_models/ds_{openml_id}'

    predictor = TabularPredictor(
        label=target_col,
        path=save_path,
        eval_metric=metric_to_use
    ).fit(
        train_data=train_df_ag,
        time_limit=AUTOML_TIME_BUDGET,
        presets='best_quality'
    )

    leaderboard = predictor.leaderboard(silent=True)
    best_model_name = leaderboard.iloc[0]['model']
    best_model_score_val = leaderboard.iloc[0]['score_val']

    # 3. Predição e Métricas
    y_proba = predictor.predict_proba(X_test_df, as_multiclass=True).values
    y_pred = predictor.predict(X_test_df).values
    
    total_time = time.time() - t0_total

    if n_classes == 2:
        auc_ovo = roc_auc_score(y_test_enc, y_proba[:, 1])
    else:
        auc_ovo = roc_auc_score(y_test_enc, y_proba, multi_class='ovo')

    acc = accuracy_score(y_test_enc, y_pred)
    ce = log_loss(y_test_enc, y_proba, labels=le.transform(le.classes_))

    print(f"Resultados para {openml_id}: AUC={auc_ovo:.4f}, ACC={acc:.4f}, Time={total_time:.2f}s")
    
    return {
        'openml_id': openml_id, 'modelo': 'autogluon', 'total_time_sec': total_time,
        'mean_auc_ovo': auc_ovo, 'mean_accuracy': acc, 'mean_cross_entropy': ce,
        'best_model_info': best_model_name,
        'best_score_cv': best_model_score_val
    }

In [4]:
if __name__ == "__main__":
    # --- CONFIGURAÇÃO DA EXECUÇÃO DISTRIBUÍDA ---
    # Altere esta variável em cada máquina de 1 a 10
    MACHINE_ID = 2
    TOTAL_MACHINES = 10
    # --- FIM DA CONFIGURAÇÃO ---

    # Lista completa dos 30 datasets do CC18
    cc18_ids_full = [
        11, 15, 18, 23, 29, 31, 37, 50, 54, 188, 307, 458, 469, 1049,
        1050, 1063, 1068, 1462, 1464, 1468, 1480, 1494, 1501, 1510,
        6332, 23381, 40966, 40975, 40982, 40994
    ]

    # Lógica para dividir a lista de IDs em lotes
    chunk_size = math.ceil(len(cc18_ids_full) / TOTAL_MACHINES)
    start_index = (MACHINE_ID - 1) * chunk_size
    end_index = start_index + chunk_size
    ids_to_process = cc18_ids_full[start_index:end_index]

    print(f"--- MÁQUINA {MACHINE_ID}/{TOTAL_MACHINES} ---")
    print(f"Processando os seguintes datasets: {ids_to_process}")
    
    todos_resultados = []

    for oid in ids_to_process:
        try:
            res = run_autogluon_experiment(oid)
            todos_resultados.append(res)
        except Exception as e:
            print(f"!!!!!! ERRO AO PROCESSAR O DATASET {oid}: {e} !!!!!!")
            import traceback
            traceback.print_exc()
            continue

    # Salva o arquivo com um nome único para esta máquina
    if todos_resultados:
        df_results = pd.DataFrame(todos_resultados)
        output_filename = f"resultados_autogluon_maquina_{MACHINE_ID}.csv"
        df_results.to_csv(output_filename, index=False)
        print(f"\nArquivo '{output_filename}' salvo com sucesso!")
    else:
        print("Nenhuma tarefa foi concluída com sucesso nesta máquina.")


--- MÁQUINA 2/10 ---
Processando os seguintes datasets: [23, 29, 31]
--- Processando Dataset ID: 23 com AutoGluon ---


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          8
Memory Avail:       29.95 GB / 31.36 GB (95.5%)
Disk Space Avail:   94.43 GB / 97.87 GB (96.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

Problema multiclasse detectado (3 classes). Usando a métrica: roc_auc_ovo


	Running DyStack sub-fit in a ray process to avoid memory leakage. Enabling ray logging (enable_ray_logging=True). Specify `ds_args={'enable_ray_logging': False}` if you experience logging issues.
2025-06-14 10:18:12,485	INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
		Context path: "/home/jupyter/autogluon_models/ds_23/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=4507)[0m Running DyStack sub-fit ...
[36m(_dystack pid=4507)[0m Beginning AutoGluon training ... Time limit = 896s
[36m(_dystack pid=4507)[0m AutoGluon will save models to "/home/jupyter/autogluon_models/ds_23/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=4507)[0m Train Data Rows:    916
[36m(_dystack pid=4507)[0m Train Data Columns: 9
[36m(_dystack pid=4507)[0m Label Column:       target
[36m(_dystack pid=4507)[0m Problem Type:       multiclass
[36m(_dystack pid=4507)[0m Preprocessing data ...
[36m(_dystack pid=4507)[0m Train Data Class Count: 3
[36m(_d

[36m(_ray_fit pid=9333)[0m [1000]	valid_set's multi_logloss: 0.919021	valid_set's roc_auc_ovo: 0.727407


[36m(_dystack pid=4507)[0m 	0.7119	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	7.58s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.1s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: NeuralNetTorch_r22_BAG_L1 ... Training model for up to 479.01s of the 777.94s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.00%)
[36m(_dystack pid=4507)[0m 	0.7324	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	14.11s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: XGBoost_r33_BAG_L1 ... Training model for up to 462.85s of the 761.78s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.14%)
[36m(_dystack pid=4507

[36m(_ray_fit pid=17345)[0m [1000]	valid_set's multi_logloss: 0.853296	valid_set's roc_auc_ovo: 0.803741[32m [repeated 4x across cluster][0m


[36m(_dystack pid=4507)[0m 	0.6965	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	12.96s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.18s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: RandomForest_r39_BAG_L1 ... Training model for up to 237.70s of the 536.63s of remaining time.
[36m(_dystack pid=4507)[0m 	0.7065	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	1.03s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: CatBoost_r167_BAG_L1 ... Training model for up to 236.54s of the 535.47s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=1.33%)
[36m(_dystack pid=4507)[0m 	0.7342	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	7.91s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.03s	 = Validation run

[36m(_ray_fit pid=22201)[0m [1000]	valid_set's multi_logloss: 0.964148	valid_set's roc_auc_ovo: 0.708774[32m [repeated 5x across cluster][0m


[36m(_dystack pid=4507)[0m 	0.703	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	6.14s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.08s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: NeuralNetTorch_r143_BAG_L1 ... Training model for up to 97.48s of the 396.41s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.00%)
[36m(_dystack pid=4507)[0m 	0.7245	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	12.17s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.11s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: CatBoost_r128_BAG_L1 ... Training model for up to 83.13s of the 382.07s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=1.40%)
[36m(_dystack pid=450

[36m(_ray_fit pid=26468)[0m [1000]	valid_set's multi_logloss: 1.3347	valid_set's roc_auc_ovo: 0.748309[32m [repeated 2x across cluster][0m


[36m(_dystack pid=4507)[0m 	0.7363	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	14.89s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.19s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: LightGBM_BAG_L2 ... Training model for up to 268.17s of the 267.99s of remaining time.
[36m(_ray_fit pid=26150)[0m Metric roc_auc_ovo is not supported by this model - using log_loss instead[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=26148)[0m No improvement since epoch 6: early stopping[32m [repeated 7x across cluster][0m
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.19%)
[36m(_dystack pid=4507)[0m 	0.747	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	7.77s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: RandomForestGini_BAG_L2 ... T

[36m(_ray_fit pid=28378)[0m [1000]	valid_set's multi_logloss: 1.88353	valid_set's roc_auc_ovo: 0.75555[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=28378)[0m [3000]	valid_set's multi_logloss: 1.97444	valid_set's roc_auc_ovo: 0.76022[32m [repeated 5x across cluster][0m
[36m(_ray_fit pid=28378)[0m [5000]	valid_set's multi_logloss: 1.9956	valid_set's roc_auc_ovo: 0.762186[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=28378)[0m [7000]	valid_set's multi_logloss: 2.00641	valid_set's roc_auc_ovo: 0.765924[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=28378)[0m [9000]	valid_set's multi_logloss: 2.00592	valid_set's roc_auc_ovo: 0.767923[32m [repeated 2x across cluster][0m


[36m(_dystack pid=4507)[0m 	0.7375	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	47.66s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.42s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: CatBoost_r177_BAG_L2 ... Training model for up to 146.81s of the 146.63s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=2.05%)
[36m(_dystack pid=4507)[0m 	0.7586	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	12.73s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.04s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: NeuralNetTorch_r79_BAG_L2 ... Training model for up to 131.98s of the 131.79s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.02%)
[36m(_dystack pid=

[36m(_ray_fit pid=29657)[0m [1000]	valid_set's multi_logloss: 1.27134	valid_set's roc_auc_ovo: 0.765048[32m [repeated 2x across cluster][0m


[36m(_dystack pid=4507)[0m 	0.7446	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	16.85s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.29s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: NeuralNetFastAI_r191_BAG_L2 ... Training model for up to 92.69s of the 92.51s of remaining time.
[36m(_dystack pid=4507)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.03%)
[36m(_ray_fit pid=29979)[0m Metric roc_auc_ovo is not supported by this model - using log_loss instead
[36m(_ray_fit pid=29985)[0m No improvement since epoch 0: early stopping
[36m(_dystack pid=4507)[0m 	0.7229	 = Validation score   (roc_auc_ovo)
[36m(_dystack pid=4507)[0m 	5.8s	 = Training   runtime
[36m(_dystack pid=4507)[0m 	0.13s	 = Validation runtime
[36m(_dystack pid=4507)[0m Fitting model: CatBoost_r9_BAG_L2 ... Training model for up to 84.78s of the 84.60s of remaining time.
[36m(_

Resultados para 23: AUC=0.7636, ACC=0.5928, Time=1965.12s
--- Processando Dataset ID: 29 com AutoGluon ---


 'b' 'b' 'b' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'a' 'a' 'b' 'b' 'b'
 'b' 'a' 'b' 'b' 'b' 'a' 'b' 'a' np.str_('nan') 'b' 'a' 'b' 'a' 'b' 'b'
 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'b' 'b' 'a' 'b' 'b'
 'b' 'b' 'b' 'b' 'a' 'b' 'a' 'b' 'b' 'b' 'a' 'b' 'a' 'a' 'b' 'a' 'b' 'b'
 'b' 'b' 'a' 'b' 'b' np.str_('nan') 'a' 'b' 'b' 'b' 'b' 'a' 'b' 'a' 'b'
 'b' 'b' 'b' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'b' 'a' 'b' 'b' 'a' 'b' 'b'
 'b' 'b' 'b' 'a' 'a' 'b' 'b' 'a' 'a' 'b' 'b' 'a' 'b' 'a' 'b' 'b' 'a' 'b'
 'a' 'b' 'a' 'b' 'a' 'b' 'a' 'b' 'b' 'a' 'b' 'a' 'b' 'b' 'b' 'b' 'a' 'a'
 'a' 'b' 'b' 'b' 'b' 'b' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'b' 'b'
 'b' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'a' 'b' 'a' 'a' 'b' 'b' 'b' 'b' 'b'
 'a' 'b' 'b' 'b' 'b' 'b' 'b' 'a' 'a' 'b' 'a' 'a' 'b' 'a' 'b' 'b' 'b' 'a'
 'b' 'b' 'b' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'b' 'b' 'b' 'a' 'b' 'b' 'a'
 'a' 'b' 'b' 'b' 'b' 'b' 'b' 'b' 'a' 'a' 'b' 'b' 'b' 'a' 'a' 'a' 'b' 'b'
 'b' 'b' 'b' 'b' np.str_('nan') 'a' 'b' 'b' 'b' 'b' '

Problema binário detectado. Usando a métrica: roc_auc


Leaderboard on holdout data (DyStack):
                           model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    NeuralNetFastAI_r143_BAG_L2       0.929167   0.960403     roc_auc        0.962916       0.748095  35.740372                 0.136327                0.138204           8.974780            2       True        133
1       RandomForest_r195_BAG_L2       0.917361   0.948260     roc_auc        0.898040       0.698713  27.781085                 0.071451                0.088822           1.015493            2       True        118
2         NeuralNetFastAI_BAG_L1       0.915278   0.935809     roc_auc        0.418569       0.101185   4.389529                 0.418569                0.101185           4.389529            1       True         10
3        RandomForest_r39_BAG_L2       0.911111   0.945312     roc_auc        0.898855       0.68

Resultados para 29: AUC=0.9119, ACC=0.8309, Time=2550.68s
--- Processando Dataset ID: 31 com AutoGluon ---


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          8
Memory Avail:       28.23 GB / 31.36 GB (90.0%)
Disk Space Avail:   92.71 GB / 97.87 GB (94.7%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

Problema binário detectado. Usando a métrica: roc_auc


Leaderboard on holdout data (DyStack):
                           model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0        RandomForest_r34_BAG_L1       0.826087   0.735091     roc_auc        0.060781       0.100779   0.812605                 0.060781                0.100779           0.812605            1       True         60
1    NeuralNetFastAI_r134_BAG_L1       0.826087   0.740218     roc_auc        0.222790       0.209570  13.436608                 0.222790                0.209570          13.436608            1       True         59
2     NeuralNetFastAI_r37_BAG_L1       0.816601   0.762272     roc_auc        0.131676       0.153512   5.434566                 0.131676                0.153512           5.434566            1       True         53
3       RandomForest_r127_BAG_L1       0.816601   0.764288     roc_auc        0.084191       0.10

Resultados para 31: AUC=0.7867, ACC=0.7467, Time=2725.29s

Arquivo 'resultados_autogluon_maquina_2.csv' salvo com sucesso!




In [5]:
    # --- CONFIGURAÇÃO DA EXECUÇÃO DISTRIBUÍDA ---
    # Altere esta variável em cada máquina de 1 a 10
    MACHINE_ID = 5
    TOTAL_MACHINES = 10
    # --- FIM DA CONFIGURAÇÃO ---

    # Lista completa dos 30 datasets do CC18
    cc18_ids_full = [
        11, 15, 18, 23, 29, 31, 37, 50, 54, 188, 307, 458, 469, 1049,
        1050, 1063, 1068, 1462, 1464, 1468, 1480, 1494, 1501, 1510,
        6332, 23381, 40966, 40975, 40982, 40994
    ]

    # Lógica para dividir a lista de IDs em lotes
    chunk_size = math.ceil(len(cc18_ids_full) / TOTAL_MACHINES)
    start_index = (MACHINE_ID - 1) * chunk_size
    end_index = start_index + chunk_size
    ids_to_process = cc18_ids_full[start_index:end_index]

    print(f"--- MÁQUINA {MACHINE_ID}/{TOTAL_MACHINES} ---")
    print(f"Processando os seguintes datasets: {ids_to_process}")
    
    todos_resultados = []

    for oid in ids_to_process:
        try:
            res = run_autogluon_experiment(oid)
            todos_resultados.append(res)
        except Exception as e:
            print(f"!!!!!! ERRO AO PROCESSAR O DATASET {oid}: {e} !!!!!!")
            import traceback
            traceback.print_exc()
            continue

    # Salva o arquivo com um nome único para esta máquina
    if todos_resultados:
        df_results = pd.DataFrame(todos_resultados)
        output_filename = f"resultados_autogluon_maquina_{MACHINE_ID}.csv"
        df_results.to_csv(output_filename, index=False)
        print(f"\nArquivo '{output_filename}' salvo com sucesso!")
    else:
        print("Nenhuma tarefa foi concluída com sucesso nesta máquina.")
        

--- MÁQUINA 5/10 ---
Processando os seguintes datasets: [469, 1049, 1050]
--- Processando Dataset ID: 469 com AutoGluon ---


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          8
Memory Avail:       28.13 GB / 31.36 GB (89.7%)
Disk Space Avail:   91.82 GB / 97.87 GB (93.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

Problema multiclasse detectado (6 classes). Usando a métrica: roc_auc_ovo


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val  eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0            XGBoost_r89_BAG_L1       0.620715   0.552877  roc_auc_ovo        0.247559       0.059158   12.437399                 0.247559                0.059158          12.437399            1       True         27
1                XGBoost_BAG_L1       0.610814   0.554193  roc_auc_ovo        0.220351       0.099872   21.909125                 0.220351                0.099872          21.909125            1       True          9
2           XGBoost_r194_BAG_L1       0.604964   0.549711  roc_auc_ovo        0.201877       0.030567   11.264147                 0.201877                0.030567          11.264147            1       True         33
3            XGBoost_r33_BAG_L1       0.585701   0.559855  roc_auc_ovo        0.269004       

Resultados para 469: AUC=0.5725, ACC=0.2125, Time=2331.46s
--- Processando Dataset ID: 1049 com AutoGluon ---


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          8
Memory Avail:       28.14 GB / 31.36 GB (89.7%)
Disk Space Avail:   91.01 GB / 97.87 GB (93.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

Problema binário detectado. Usando a métrica: roc_auc


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       RandomForest_r39_BAG_L1       0.975000   0.932234     roc_auc        0.080111       0.085726   1.095972                 0.080111                0.085726           1.095972            1       True         45
1      RandomForest_r195_BAG_L1       0.968929   0.933350     roc_auc        0.070724       0.106039   1.203472                 0.070724                0.106039           1.203472            1       True         26
2     NeuralNetTorch_r30_BAG_L2       0.968571   0.945912     roc_auc        0.896178       0.909086  67.120955                 0.240676                0.268437          12.908304            2       True         88
3      RandomForest_r127_BAG_L1       0.967857   0.925106     roc_auc        0.070207       0.123105 

Resultados para 1049: AUC=0.9392, ACC=0.9132, Time=3480.64s
--- Processando Dataset ID: 1050 com AutoGluon ---


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          8
Memory Avail:       27.94 GB / 31.36 GB (89.1%)
Disk Space Avail:   89.96 GB / 97.87 GB (91.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of rem

Problema binário detectado. Usando a métrica: roc_auc


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         ExtraTrees_r49_BAG_L1       0.917803   0.860854     roc_auc        0.083131       0.137666   2.387998                 0.083131                0.137666           2.387998            1       True         56
1         ExtraTreesGini_BAG_L1       0.917803   0.860854     roc_auc        0.256066       0.117246   0.824960                 0.256066                0.117246           0.824960            1       True          8
2       RandomForestEntr_BAG_L2       0.905303   0.876692     roc_auc        1.387892       0.823688  46.574102                 0.082138                0.084992           0.731940            2       True         67
3         ExtraTreesEntr_BAG_L1       0.904167   0.854472     roc_auc        0.088880       0.094864 

Resultados para 1050: AUC=0.8355, ACC=0.9062, Time=3417.91s

Arquivo 'resultados_autogluon_maquina_5.csv' salvo com sucesso!


