In [2]:
import openml
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [14]:
warnings.filterwarnings("ignore")

# List all datasets and their properties
datasets_openml = openml.datasets.list_datasets(output_format="dataframe")
print(datasets_openml.shape)
# Get small datasets with more than 10 features
selec_datasets = datasets_openml.query('NumberOfInstances>1000 and NumberOfInstances<30000 and NumberOfFeatures>10 and NumberOfFeatures<500')
print(selec_datasets.shape)

res_datasets = []
for name_ds in tqdm(selec_datasets.name):
    try:
        # Get dataset by name
        dataset = openml.datasets.get_dataset(name_ds)
        # Get the data itself as a dataframe (or otherwise)
        X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
        if y is not None:
            # print(name_ds, y.dtypes, len(y), len(np.unique(y)))
            res_datasets.append(dict(id_ds=dataset.dataset_id,
                                    name_ds=name_ds, 
                                    nrows=X.shape[0],
                                    len_y=len(y),
                                    type_y=y.dtypes,
                                    num_classes=len(np.unique(y)),
                                    target_name=dataset.default_target_attribute))
    except:
        continue
res_datasets = pd.DataFrame(res_datasets)
res_datasets.to_csv('res_datasets.csv', index=False)

print(res_datasets.shape)
 
res_selec = res_datasets.query('type_y!="object" and type_y!="Sparse[float64, 0]" and len_y>2000 and type_y!="int64" and nrows<40000')
print(res_selec.shape)

res_selec = res_selec.drop_duplicates()
print(res_selec.shape)

# Remove datasets with similar name
selec_names = []
pos_selec = []
for nrow, name_ds in enumerate(res_selec.name_ds.values):
    three = name_ds[:3]
    if three not in selec_names:
        selec_names.append(three)
        pos_selec.append(nrow)
res_selec = res_selec.iloc[pos_selec]
print(res_selec.shape) 
print(res_selec.head())  

(911, 7)
(300, 7)
(158, 7)
(117, 7)
   id_ds      name_ds  nrows  len_y    type_y  num_classes target_name
0      3     kr-vs-kp   3196   3196  category            2       class
1      6       letter  20000  20000  category           26       class
7     24     mushroom   8124   8124  category            2       class
8     28    optdigits   5620   5620  category           10       class
9     30  page-blocks   5473   5473  category            5       class


In [15]:
warnings.filterwarnings("ignore")
# Inputs
# array(['bool', 'category', 'float64', 'int64', 'object', 'uint8'],

# Outputs
# array(['bool', 'category', 'float64', 'uint8'], dtype='<U8')

scaler = StandardScaler()
label_enc = LabelEncoder()
res_basedata = []
for nrow, row in res_selec.iterrows():
    # try:
    # Get dataset by name
    dataset = openml.datasets.get_dataset(row['name_ds'])
    # Get the data itself as a dataframe (or otherwise)
    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
    assert y.isna().sum()==0
    
    # Transform Inputs
    # ----------------
    X_prim = []
    for namecol in X.columns:
        # Convert numeric to float64
        if X[namecol].dtypes==np.uint8 or X[namecol].dtypes==np.int64 or X[namecol].dtypes==np.float64 or X[namecol].dtypes==bool:
            X[namecol] = X[namecol].astype(np.float64)
            X[namecol] = X[namecol].fillna(X[namecol].mean())
            X[namecol] = scaler.fit_transform(X[namecol].values.reshape(-1,1)).flatten()
            X_prim.append(X[namecol])
        if str(X[namecol].dtypes)=='category' or str(X[namecol].dtypes)=='object':
            X_prim.append(pd.get_dummies(X[namecol], prefix=namecol).astype(np.float32))
    X_prim = pd.concat(X_prim, axis=1)
    
    # Transform Target
    # --------------
    if row['type_y'] == 'float64':
        y.fillna(y.mean(), inplace=True)
        y_values = y.values.flatten()
        y_values = scaler.fit_transform(y_values.reshape(-1,1)).flatten()
        type_prob = 'regression'
    if row['type_y'] == 'uint8':
        y = y.astype(np.int32)
        y_values = y.values.flatten()
        type_prob = 'binary' if row['num_classes']==2 else 'multiclass'
        
    if row['type_y'] == 'bool':
        y = y.astype(np.int32)
        y_values = y.values.flatten()
        type_prob = 'binary' if row['num_classes']==2 else 'multiclass'
    if row['type_y'] == 'category':
        y_values = label_enc.fit_transform(y)
        type_prob = 'binary' if row['num_classes']==2 else 'multiclass'
    
    name_file = row['name_ds'].replace('-','_').split('_')[0]
    print(name_file)
    
    # Include more 
    row['name_file'] = name_file + '.csv'
    row['type_prob'] = type_prob
    row['NFs'] = X_prim.shape[1]
    X_prim['target_end'] = y_values
    res_basedata.append(row)
    
    if type_prob == 'regression':
        print(row['name_ds'], X.shape, X_prim.shape, type_prob, X_prim['target_end'].mean(), X_prim['target_end'].std())
    if type_prob == 'binary' or type_prob == 'multiclass':
        print(row['name_ds'], X.shape, X_prim.shape, type_prob, X_prim['target_end'].nunique())
    X_prim.to_csv('./datasets/' + row['name_file'], index=False)

res_basedata = pd.DataFrame(res_basedata)
res_basedata.to_csv('res_basedata.csv', index=False)

kr
kr-vs-kp (3196, 36) (3196, 75) binary 2
letter
letter (20000, 16) (20000, 17) multiclass 26
mushroom
mushroom (8124, 22) (8124, 126) binary 2
optdigits
optdigits (5620, 64) (5620, 65) multiclass 10
page
page-blocks (5473, 10) (5473, 11) multiclass 5
pendigits
pendigits (10992, 16) (10992, 17) multiclass 10
segment
segment (2310, 19) (2310, 20) multiclass 7
sick
sick (3772, 29) (3772, 53) binary 2
spambase
spambase (4601, 57) (4601, 58) binary 2
splice
splice (3190, 60) (3190, 288) multiclass 3
hypothyroid
hypothyroid (3772, 29) (3772, 53) multiclass 4
waveform
waveform-5000 (5000, 40) (5000, 41) multiclass 3
satimage
satimage (6430, 36) (6430, 37) multiclass 6
cpu
cpu_act (8192, 21) (8192, 22) multiclass 56
pol
pol (15000, 48) (15000, 49) multiclass 11
elevators
elevators (16599, 18) (16599, 19) regression -9.237612047534769e-16 1.0000301236576215
wine
wine_quality (6497, 11) (6497, 12) multiclass 7
Ailerons
Ailerons (13750, 40) (13750, 41) regression -1.0748573748225515e-16 1.00003

In [5]:
res_basedata

[id_ds                 3
 name_ds        kr-vs-kp
 nrows              3196
 len_y              3196
 type_y         category
 num_classes           2
 target_name       class
 name_file        kr.csv
 type_prob        binary
 NFs                  74
 Name: 0, dtype: object,
 id_ds                   6
 name_ds            letter
 nrows               20000
 len_y               20000
 type_y           category
 num_classes            26
 target_name         class
 name_file      letter.csv
 type_prob      multiclass
 NFs                    16
 Name: 1, dtype: object,
 id_ds                    24
 name_ds            mushroom
 nrows                  8124
 len_y                  8124
 type_y             category
 num_classes               2
 target_name           class
 name_file      mushroom.csv
 type_prob            binary
 NFs                     125
 Name: 7, dtype: object,
 id_ds                     28
 name_ds            optdigits
 nrows                   5620
 len_y                   

In [5]:
import os

FOLDER_DATASETS = "datasets/"
df = pd.read_csv("res_basedata.csv") # Este es el de Pisón del artículo de HYBparsimony. Me voy a quedar solo con los binarios.
df = df.loc[df['type_prob'] == "binary"]
print("Number of selected datasets =", df.shape[0])
df.to_csv("datasets.csv",index=False)

# Borramos todos los datasets que no sean de clasificación binaria de la carpeta datasets
name_datasets = list(df["name_file"])

list_files = os.listdir(FOLDER_DATASETS)
print(list_files)
print(name_datasets)
for file in list_files:
    if file not in name_datasets:
        os.remove(FOLDER_DATASETS + file)
        print("Removed", file)

Number of selected datasets = 53
['ada.csv', 'Ailerons.csv', 'allbp.csv', 'ames.csv', 'ASP.csv', 'autoUniv.csv', 'avocado.csv', 'BachChoralHarmony.csv', 'bank32nh.csv', 'Bioresponse.csv', 'cardiotocography.csv', 'churn.csv', 'cjs.csv', 'clean2.csv', 'coil2000.csv', 'compas.csv', 'Contaminant.csv', 'CPMP.csv', 'cpu.csv', 'credit.csv', 'CreditCardSubset.csv', 'dataset.csv', 'default.csv', 'dis.csv', 'dna.csv', 'ECG5000.csv', 'eeg.csv', 'elevators.csv', 'employee.csv', 'eye.csv', 'FICO.csv', 'fifa.csv', 'first.csv', 'fps.csv', 'gas.csv', 'GesturePhaseSegmentationProcessed.csv', 'grid.csv', 'health.csv', 'heloc.csv', 'house.csv', 'hypothyroid.csv', 'IEEE80211aa.csv', 'Indian.csv', 'Insurance.csv', 'internet.csv', 'Intersectional.csv', 'ipums.csv', 'JapaneseVowels.csv', 'jasmine.csv', 'jm1.csv', 'Job.csv', 'jungle.csv', 'Kaggle.csv', 'kc1.csv', 'kdd.csv', 'kings.csv', 'kr.csv', 'law.csv', 'led24.csv', 'letter.csv', 'Long.csv', 'madeline.csv', 'MagicTelescope.csv', 'mc1.csv', 'Mercedes.csv',