# Pré-processamento dos 30 Menores Datasets do OpenML-CC18
Este notebook seleciona os 30 menores datasets do benchmark OpenML-CC18, faz pré-processamento (imputação, encoding, normalização) e split 70/30, salvando os resultados em arquivos .npz e .joblib.

In [2]:
import openml
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Caso possua uma API key:
# openml.config.apikey = 'SUA_CHAVE_AQUI'


In [14]:
# Fetch the OpenML-CC18 benchmark suite and the ids of the datasets it contains.
benchmark_suite = openml.study.get_suite(suite_id='OpenML-CC18')
dataset_ids = benchmark_suite.data

# Metadata table (one row per dataset) for every dataset in the suite.
dlist = openml.datasets.list_datasets(data_id=dataset_ids, output_format='dataframe')

# Order by status, then by number of instances.
# Both sorts use a stable algorithm: with pandas' default (quicksort) the
# second sort does not preserve the relative order of rows that share the same
# NumberOfInstances, so the status ordering was silently discarded and the
# order of tied datasets (and therefore which ones make the 30-row cut) could
# vary between runs/environments.
dlist_sorted_1 = dlist.sort_values(by='status', kind='stable')
dlist_sorted = dlist_sorted_1.sort_values(by='NumberOfInstances', kind='stable')

# Keep the 30 datasets with the fewest instances.
smallest_30 = dlist_sorted.head(30)
# Bare expression so the notebook renders the selection as the cell output.
smallest_30

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
23381,23381,dresses-sales,2,64,active,ARFF,290.0,24.0,210.0,2.0,13.0,500.0,401.0,835.0,1.0,12.0
1063,1063,kc2,1,2,active,ARFF,415.0,2.0,107.0,2.0,22.0,522.0,0.0,0.0,21.0,1.0
40994,40994,climate-model-simulation-crashes,4,4265,active,ARFF,494.0,2.0,46.0,2.0,21.0,540.0,0.0,0.0,20.0,1.0
6332,6332,cylinder-bands,2,2,active,ARFF,312.0,71.0,228.0,2.0,40.0,540.0,263.0,999.0,18.0,22.0
1510,1510,wdbc,1,64,active,ARFF,357.0,2.0,212.0,2.0,31.0,569.0,0.0,0.0,30.0,1.0
1480,1480,ilpd,1,64,active,ARFF,416.0,2.0,167.0,2.0,11.0,583.0,0.0,0.0,9.0,2.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0
29,29,credit-approval,1,1,active,ARFF,383.0,14.0,307.0,2.0,16.0,690.0,37.0,67.0,6.0,10.0
15,15,breast-w,1,1,active,ARFF,458.0,2.0,241.0,2.0,10.0,699.0,16.0,16.0,9.0,1.0
188,188,eucalyptus,1,1,active,ARFF,214.0,27.0,105.0,5.0,20.0,736.0,95.0,448.0,14.0,6.0


In [18]:
# Directory that receives one raw CSV per selected dataset.
OUTPUT_DIR = "raw_cc18_30"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Download each selected dataset and dump features + target into a single CSV.
# A failure on one dataset is reported and skipped so the loop keeps going.
for _, row in smallest_30.iterrows():
    data_id = int(row["did"])
    name = row["name"].replace(" ", "_")
    csv_filename = f"{data_id}_{name}.csv"
    csv_path = os.path.join(OUTPUT_DIR, csv_filename)

    try:
        # a) Download the full dataset from OpenML, split into features and
        #    the dataset's default target attribute.
        dataset = openml.datasets.get_dataset(data_id)
        X_df, y_series, _, _ = dataset.get_data(
            dataset_format="dataframe",
            target=dataset.default_target_attribute,
        )

        # b) Build a new frame holding the features plus a "target" column.
        df = X_df.assign(target=y_series)

        # c) Write it out as CSV (without the index) and report what was saved.
        df.to_csv(csv_path, index=False)
        n_rows, n_features = X_df.shape
        print(f"→ Salvou: {csv_filename}  ({n_rows} instâncias, {n_features} features + 1 alvo)")

    except Exception as exc:
        # Best-effort: log the failure and move on to the next dataset.
        print(f"[PULO] Falha ao baixar data_id={data_id} ({name}): {exc}")

→ Salvou: 23381_dresses-sales.csv  (500 instâncias, 12 features + 1 alvo)
→ Salvou: 1063_kc2.csv  (522 instâncias, 21 features + 1 alvo)
→ Salvou: 40994_climate-model-simulation-crashes.csv  (540 instâncias, 18 features + 1 alvo)
→ Salvou: 6332_cylinder-bands.csv  (540 instâncias, 37 features + 1 alvo)
→ Salvou: 1510_wdbc.csv  (569 instâncias, 30 features + 1 alvo)
→ Salvou: 1480_ilpd.csv  (583 instâncias, 10 features + 1 alvo)
→ Salvou: 11_balance-scale.csv  (625 instâncias, 4 features + 1 alvo)
→ Salvou: 29_credit-approval.csv  (690 instâncias, 15 features + 1 alvo)
→ Salvou: 15_breast-w.csv  (699 instâncias, 9 features + 1 alvo)
→ Salvou: 188_eucalyptus.csv  (736 instâncias, 19 features + 1 alvo)
→ Salvou: 1464_blood-transfusion-service-center.csv  (748 instâncias, 4 features + 1 alvo)
→ Salvou: 37_diabetes.csv  (768 instâncias, 8 features + 1 alvo)
→ Salvou: 469_analcatdata_dmft.csv  (797 instâncias, 4 features + 1 alvo)
→ Salvou: 458_analcatdata_authorship.csv  (841 instâncias, 70