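# Pré-processamento dos 30 menores datasets do OpenML-CC18
Este notebook seleciona os 30 menores datasets (por número de instâncias) do benchmark OpenML-CC18, faz um split estratificado 70/30 e aplica o pré-processamento (imputação, codificação ordinal das variáveis categóricas e padronização das variáveis numéricas) ajustado apenas no conjunto de treino, salvando os resultados em arquivos .npz e .joblib.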

In [1]:
import openml
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Caso possua uma API key:
# openml.config.apikey = 'SUA_CHAVE_AQUI'


In [2]:
# Fetch the OpenML-CC18 benchmark suite and the IDs of the datasets it contains
benchmark_suite = openml.study.get_suite(suite_id='OpenML-CC18')
dataset_ids = benchmark_suite.data

# Pull the metadata table for those datasets as a DataFrame
dlist = openml.datasets.list_datasets(
    data_id=dataset_ids,
    output_format='dataframe',
)
# Order by instance count (ascending) and keep the first 30 rows,
# i.e. the 30 smallest datasets of the suite
dlist_sorted = dlist.sort_values('NumberOfInstances')
smallest_30 = dlist_sorted.iloc[:30]
smallest_30

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
23381,23381,dresses-sales,2,64,active,ARFF,290.0,24.0,210.0,2.0,13.0,500.0,401.0,835.0,1.0,12.0
1063,1063,kc2,1,2,active,ARFF,415.0,2.0,107.0,2.0,22.0,522.0,0.0,0.0,21.0,1.0
6332,6332,cylinder-bands,2,2,active,ARFF,312.0,71.0,228.0,2.0,40.0,540.0,263.0,999.0,18.0,22.0
40994,40994,climate-model-simulation-crashes,4,4265,active,ARFF,494.0,2.0,46.0,2.0,21.0,540.0,0.0,0.0,20.0,1.0
1510,1510,wdbc,1,64,active,ARFF,357.0,2.0,212.0,2.0,31.0,569.0,0.0,0.0,30.0,1.0
1480,1480,ilpd,1,64,active,ARFF,416.0,2.0,167.0,2.0,11.0,583.0,0.0,0.0,9.0,2.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0
29,29,credit-approval,1,1,active,ARFF,383.0,14.0,307.0,2.0,16.0,690.0,37.0,67.0,6.0,10.0
15,15,breast-w,1,1,active,ARFF,458.0,2.0,241.0,2.0,10.0,699.0,16.0,16.0,9.0,1.0
188,188,eucalyptus,1,1,active,ARFF,214.0,27.0,105.0,5.0,20.0,736.0,95.0,448.0,14.0,6.0


In [3]:
def build_preprocessing_pipeline(X: pd.DataFrame):
    """Build an unfitted preprocessing transformer based on the column dtypes of ``X``.

    Columns are routed by dtype:

    * numeric columns (any numeric dtype) -> mean imputation followed by
      standardization with ``StandardScaler``;
    * ``object`` / ``category`` / ``bool`` columns -> most-frequent imputation
      followed by ordinal encoding, where categories not seen during ``fit``
      are encoded as ``-1``.

    Columns matching neither group are dropped by the transformer.

    Parameters
    ----------
    X : pd.DataFrame
        Frame whose dtypes decide which columns go to which sub-pipeline.
        Only the dtypes and column names are read here; nothing is fitted.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer; call ``fit`` / ``transform`` on it.
    """
    # 'number' selects every numeric dtype (uint8, int32, float32, nullable
    # Int64, ...), not only int64/float64. With the narrower list, such columns
    # matched neither group and were silently discarded by remainder='drop'.
    # Boolean columns are not matched by 'number', so the two groups stay disjoint.
    continuous_cols = X.select_dtypes(include='number').columns.to_list()
    categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.to_list()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Unknown categories at transform time map to -1 instead of raising
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_pipeline, continuous_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ], remainder='drop', verbose_feature_names_out=False)
    return preprocessor

In [4]:
# Create the output folder and name the split settings used below
OUTPUT_DIR = 'data'
TEST_SIZE = 0.30
SEED = 42
os.makedirs(OUTPUT_DIR, exist_ok=True)

for idx, row in smallest_30.iterrows():
    # The ID column is looked up under either 'did' or 'data_id'
    dataset_id = int(row['did'] if 'did' in row else row['data_id'])
    name = row['name'].replace(' ', '_')
    print(f"Processando dataset {dataset_id} - {name}")
    # Download the dataset and separate features from the default target
    dataset = openml.datasets.get_dataset(dataset_id)
    X_df, y_series, categorical_mask, attr_names = dataset.get_data(dataset_format='dataframe', target=dataset.default_target_attribute)
    y = y_series.to_numpy()
    # Object-dtype labels would be pickled inside the .npz, and np.load refuses
    # to read pickled arrays unless allow_pickle=True is passed. Storing them as
    # a fixed-width string array keeps the file loadable with the defaults.
    # NOTE(review): assumes object-dtype targets hold string labels - confirm.
    if y.dtype == object:
        y = y.astype(str)

    # Stratified 70/30 split
    X_train_df, X_test_df, y_train, y_test = train_test_split(
        X_df, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
    )

    # Preprocessing: fit on the training portion only, then transform both parts
    preprocessor = build_preprocessing_pipeline(X_train_df)
    preprocessor.fit(X_train_df)
    X_train = preprocessor.transform(X_train_df)
    X_test = preprocessor.transform(X_test_df)

    # Per-dataset output folder
    folder = os.path.join(OUTPUT_DIR, f"task_{dataset_id}")
    os.makedirs(folder, exist_ok=True)
    # Save the transformed arrays and labels as a compressed .npz
    npz_path = os.path.join(folder, f"dataset_{dataset_id}.npz")
    np.savez_compressed(npz_path, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    # Save the fitted preprocessor
    joblib_path = os.path.join(folder, f"preprocessor_{dataset_id}.joblib")
    joblib.dump(preprocessor, joblib_path)
    print(f" Salvo: {npz_path}")
    print(f" Salvo: {joblib_path}")

Processando dataset 23381 - dresses-sales
 Salvo: data\task_23381\dataset_23381.npz
 Salvo: data\task_23381\preprocessor_23381.joblib
Processando dataset 1063 - kc2
 Salvo: data\task_1063\dataset_1063.npz
 Salvo: data\task_1063\preprocessor_1063.joblib
Processando dataset 6332 - cylinder-bands
 Salvo: data\task_6332\dataset_6332.npz
 Salvo: data\task_6332\preprocessor_6332.joblib
Processando dataset 40994 - climate-model-simulation-crashes
 Salvo: data\task_40994\dataset_40994.npz
 Salvo: data\task_40994\preprocessor_40994.joblib
Processando dataset 1510 - wdbc
 Salvo: data\task_1510\dataset_1510.npz
 Salvo: data\task_1510\preprocessor_1510.joblib
Processando dataset 1480 - ilpd
 Salvo: data\task_1480\dataset_1480.npz
 Salvo: data\task_1480\preprocessor_1480.joblib
Processando dataset 11 - balance-scale
 Salvo: data\task_11\dataset_11.npz
 Salvo: data\task_11\preprocessor_11.joblib
Processando dataset 29 - credit-approval
 Salvo: data\task_29\dataset_29.npz
 Salvo: data\task_29\preproc