In [1]:
import time
import pandas as pd
import numpy as np
import openml
import warnings
import math
import logging

# Imports do scikit-learn e auto-sklearn
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

  from pkg_resources import parse_version  # type: ignore


In [2]:
# Configurações
SEED = 42
warnings.filterwarnings("ignore", category=FutureWarning)

def carregar_base_openml(openml_id):
    """Carrega um dataset do OpenML e prepara os tipos de dados."""
    dataset = openml.datasets.get_dataset(openml_id, download_data=True, download_qualities=True, download_features_meta_data=True)
    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
    if isinstance(y, pd.Series):
        y = y.values.ravel()
    return X, y

In [3]:
def run_askl_experiment(openml_id):
    """
    Executa o fluxo completo de avaliação para o auto-sklearn em um dataset.
    """
    print(f"--- Processando Dataset ID: {openml_id} com auto-sklearn ---")
    
    # Orçamento de tempo em segundos. Use 3600 para os resultados finais.
    AUTOML_TIME_BUDGET = 3600  # 10 minutos para um teste rápido

    # 1. Carregamento e pré-processamento
    X_df, y_orig = carregar_base_openml(openml_id)
    
    X_train_df, X_test_df, y_train_orig, y_test_orig = train_test_split(
        X_df, y_orig, test_size=0.3, random_state=SEED, stratify=(y_orig if len(np.unique(y_orig)) > 1 else None)
    )

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train_orig)
    y_test_enc = le.transform(y_test_orig)

    # Imputação de NaNs numéricos
    num_cols = X_train_df.select_dtypes(include=np.number).columns
    if len(num_cols) > 0 and X_train_df[num_cols].isnull().sum().sum() > 0:
        imp = SimpleImputer(strategy='median')
        X_train_df.loc[:, num_cols] = imp.fit_transform(X_train_df[num_cols])
        X_test_df.loc[:, num_cols] = imp.transform(X_test_df[num_cols])

    # Imputação de NaNs categóricos (convertendo para string)
    cat_cols = X_train_df.select_dtypes(include=['category', 'object']).columns
    if len(cat_cols) > 0:
        for col in cat_cols:
            X_train_df.loc[:, col] = X_train_df[col].astype(str).fillna("__MISSING__")
            X_test_df.loc[:, col] = X_test_df[col].astype(str).fillna("__MISSING__")
            
    # 2. Treinamento com auto-sklearn
    t0_total = time.time()

    automl = AutoSklearn2Classifier(
        time_left_for_this_task=AUTOML_TIME_BUDGET,
        n_jobs=-1,                
        seed=SEED,
        memory_limit=None
    )
    
    # O auto-sklearn lida bem com DataFrames pré-processados
    automl.fit(X_train_df, y_train_enc)
    
    # 3. Predição e Métricas
    y_proba = automl.predict_proba(X_test_df)
    y_pred = automl.predict(X_test_df)
    total_time = time.time() - t0_total

    n_classes = len(le.classes_)
    if n_classes == 2:
        auc_ovo = roc_auc_score(y_test_enc, y_proba[:, 1])
    else:
        auc_ovo = roc_auc_score(y_test_enc, y_proba, multi_class='ovo')

    acc = accuracy_score(y_test_enc, y_pred)
    ce = log_loss(y_test_enc, y_proba, labels=range(n_classes))
    
    # Extrai o melhor score de validação cruzada dos resultados internos
    best_score_cv = np.max(automl.cv_results_['mean_test_score'])


    print(f"Resultados para {openml_id}: AUC={auc_ovo:.4f}, ACC={acc:.4f}, Time={total_time:.2f}s")
    
    return {
        'openml_id': openml_id, 'modelo': 'autosklearn', 'total_time_sec': total_time,
        'mean_auc_ovo': auc_ovo, 'mean_accuracy': acc, 'mean_cross_entropy': ce,
        'best_model_info': 'Ensemble', # O melhor modelo é um ensemble complexo
        'best_score_cv': best_score_cv
    }

In [4]:
logging.getLogger("autosklearn").setLevel(logging.INFO)
MACHINE_ID = 3
TOTAL_MACHINES = 4

cc18_ids = [
    11, 15, 18, 23, 29, 31, 37, 50, 54, 188,
    307, 458, 469, 1049, 1050, 1063, 1068, 1462, 1464, 1468,
    1480, 1494, 1501, 1510, 6332, 23381, 40966, 40975, 40982, 40994
]

chunk_size = math.ceil(len(cc18_ids) / TOTAL_MACHINES)
start_index = (MACHINE_ID - 1) * chunk_size
end_index = start_index + chunk_size
ids_to_process = cc18_ids[start_index:end_index]

print(f"--- MÁQUINA {MACHINE_ID}/{TOTAL_MACHINES} ---")
print(f"Processando os seguintes datasets: {ids_to_process}")
    
todos_resultados_askl = []

for oid in ids_to_process:
    try:
        res = run_askl_experiment(oid)
        todos_resultados_askl.append(res)
    except Exception as e:
        print(f"!!!!!! ERRO AO PROCESSAR O DATASET {oid}: {e} !!!!!!")
        import traceback
        traceback.print_exc()
        continue

if todos_resultados_askl:
    df_askl_results = pd.DataFrame(todos_resultados_askl)
    print("\n\n--- RESULTADOS FINAIS DO AUTO-SKLEARN ---")
    print(df_askl_results)
    df_askl_results.to_csv("resultados_autosklearn.csv", index=False)
    print("\nArquivo 'resultados_autosklearn.csv' salvo com sucesso!")
else:
    print("\nNenhuma tarefa foi concluída com sucesso.")

--- MÁQUINA 3/4 ---
Processando os seguintes datasets: [1068, 1462, 1464, 1468, 1480, 1494, 1501, 1510]
--- Processando Dataset ID: 1068 com auto-sklearn ---


  from pkg_resources import parse_version  # type: ignore




Process pynisher function call:
Traceback (most recent call last):
  File "/opt/conda/envs/askl_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/envs/askl_env/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/pynisher/limit_function_call.py", line 133, in subprocess_func
    return_value = ((func(*args, **kwargs), 0))
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 461, in main
    requires_update = self.requires_loss_update(runs)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 666, in requires_loss_update
    for run in sorted(runs, key=lambda run: run.recorded_mtimes["ensemble"]):
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 666, i

[ERROR] [2025-06-15 08:12:28,371:Client-EnsembleBuilder] Error getting loss `accuracy` for Run(id=(42, 880, 6.25), losses={}):[Errno 2] No such file or directory: '/var/tmp/auto-sklearn_tmp_8cf3fa23-49bc-11f0-bc81-42010a9e0008/.auto-sklearn/runs/42_880_6.25/predictions_ensemble_42_880_6.25.npy'Traceback (most recent call last):
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 1076, in loss
    predictions = run.predictions(kind, precision=self.precision)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/run.py", line 130, in predictions
    with path.open("rb") as f:
  File "/opt/conda/envs/askl_env/lib/python3.9/pathlib.py", line 1180, in open
    return io.open(self, mode, buffering, encoding, errors, newline,
  File "/opt/conda/envs/askl_env/lib/python3.9/pathlib.py", line 1038, in _opener
    return self._accessor.open(self, flags, mode)
FileNotFoundError: [Errno 2] No such file or 

Process pynisher function call:
Traceback (most recent call last):
  File "/opt/conda/envs/askl_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/envs/askl_env/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/pynisher/limit_function_call.py", line 133, in subprocess_func
    return_value = ((func(*args, **kwargs), 0))
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 461, in main
    requires_update = self.requires_loss_update(runs)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 666, in requires_loss_update
    for run in sorted(runs, key=lambda run: run.recorded_mtimes["ensemble"]):
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/ensemble_building/builder.py", line 666, i

Resultados para 1068: AUC=0.8761, ACC=0.9429, Time=3613.14s
--- Processando Dataset ID: 1462 com auto-sklearn ---
Resultados para 1462: AUC=1.0000, ACC=1.0000, Time=3604.08s
--- Processando Dataset ID: 1464 com auto-sklearn ---
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
	Models besides current dummy model: 0
	Dummy models: 1
Resultados para 1464: AUC=0.7703, ACC=0.8089, Time=3625.83s
--- Processando Dataset ID: 1468 com auto-sklearn ---
Resultados para 1468: AUC=0.9956, ACC=0.9506, Time=3616.11s
--- Processando Dataset ID: 1480 com auto-sklearn ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_df.loc[:, col] = X_train_df[col].astype(str).fillna("__MISSING__")
  X_train_df.loc[:, col] = X_train_df[col].astype(str).fillna("__MISSING__")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df.loc[:, col] = X_test_df[col].astype(str).fillna("__MISSING__")
  X_test_df.loc[:, col] = X_test_df[col].astype(str).fillna("__MISSING__")


[ERROR] [2025-06-15 11:45:30,843:Client-AutoML(42):37ce3b55-49de-11f0-bc81-42010a9e0008] list index out of range
Traceback (most recent call last):
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/automl.py", line 899, in fit
    ) = _proc_smac.run_smbo()
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/smbo.py", line 552, in run_smbo
    smac.optimize()
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/smac/facade/smac_ac_facade.py", line 720, in optimize
    incumbent = self.solver.run()
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/smac/optimizer/smbo.py", line 211, in run
    self.start()
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/smac/optimizer/smbo.py", line 174, in start
    self.initial_design_configs = self.initial_design.select_configurations()
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/smac/initial_design/initial_design.py", line 105, in select_configurations
    inc

Traceback (most recent call last):
  File "/var/tmp/ipykernel_244865/2714729571.py", line 23, in <module>
    res = run_askl_experiment(oid)
  File "/var/tmp/ipykernel_244865/1891648276.py", line 46, in run_askl_experiment
    automl.fit(X_train_df, y_train_enc)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/experimental/askl2.py", line 540, in fit
    return super().fit(
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/estimators.py", line 1448, in fit
    super().fit(
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/estimators.py", line 540, in fit
    self.automl_.fit(load_models=self.load_models, **kwargs)
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/automl.py", line 2304, in fit
    return super().fit(
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosklearn/automl.py", line 962, in fit
    raise e
  File "/opt/conda/envs/askl_env/lib/python3.9/site-packages/autosk

Resultados para 1494: AUC=0.9373, ACC=0.8833, Time=3629.39s
--- Processando Dataset ID: 1501 com auto-sklearn ---
Resultados para 1501: AUC=0.9965, ACC=0.9289, Time=3636.58s
--- Processando Dataset ID: 1510 com auto-sklearn ---
Resultados para 1510: AUC=0.9988, ACC=0.9766, Time=3612.59s


--- RESULTADOS FINAIS DO AUTO-SKLEARN ---
   openml_id       modelo  total_time_sec  mean_auc_ovo  mean_accuracy  \
0       1068  autosklearn     3613.142015      0.876087       0.942943   
1       1462  autosklearn     3604.084644      1.000000       1.000000   
2       1464  autosklearn     3625.830725      0.770251       0.808889   
3       1468  autosklearn     3616.107053      0.995595       0.950617   
4       1494  autosklearn     3629.394815      0.937294       0.883281   
5       1501  autosklearn     3636.582160      0.996484       0.928870   
6       1510  autosklearn     3612.588383      0.998832       0.976608   

   mean_cross_entropy best_model_info  best_score_cv  
0            0.17508