In [2]:
import os

import openml

# Credentials must not live in the notebook: the API key is taken from the
# OPENML_APIKEY environment variable. When the variable is unset, openml's own
# configuration is left untouched.
# NOTE(review): a key was previously committed in this cell; it should be
# treated as leaked and revoked on the OpenML account.
_openml_api_key = os.environ.get('OPENML_APIKEY')
if _openml_api_key:
    openml.config.apikey = _openml_api_key

from tqdm import tqdm

import multiprocessing as mp

In [3]:
# The 72 OpenML task ids whose datasets are downloaded and analysed below.
task_ids = [
    31, 10101, 3913, 3, 3917, 9957, 9946, 3918,
    3903, 37, 9971, 9952, 3902, 49, 43, 9978,
    10093, 219, 9976, 14965, 6, 9977, 53, 11,
    15, 16, 14, 32, 3549, 12, 9981, 18,
    28, 2074, 29, 45, 125922, 9960, 9964, 22,
    2079, 14969, 3560, 14952, 125920, 23, 3904, 3022,
    9985, 9910, 14970, 3021, 3481, 7592, 3573, 146824,
    146820, 146822, 146195, 146800, 146817, 146819, 146821, 167119,
    14954, 167141, 167140, 167120, 167125, 146825, 167124, 167121,
]


In [5]:
def get_dataset_by_task_id(id):
    """Download the dataset behind one OpenML task and bundle it in a dict.

    Parameters
    ----------
    id
        Task id handed to ``openml.tasks.get_task``.

    Returns
    -------
    dict
        Keys ``'id'``, ``'X'``, ``'y'``, ``'categorical_indicator'`` and
        ``'attribute_names'``; the last four hold the values returned by
        ``get_data()``, in that order.
    """
    dataset = openml.tasks.get_task(id).get_dataset()
    # NOTE(review): get_data() is called without a target, so 'y' is presumably
    # None and the label column stays inside 'X' -- confirm against the openml
    # version in use.
    features, target, is_categorical, names = dataset.get_data()
    return dict(
        id=id,
        X=features,
        y=target,
        categorical_indicator=is_categorical,
        attribute_names=names,
    )

# Fetch every task's dataset in parallel, one worker per CPU core; imap keeps
# the results in the same order as task_ids. The context manager shuts the
# worker processes down once all results have been collected, instead of
# leaving them alive for the rest of the kernel session.
with mp.Pool(mp.cpu_count()) as pool:
    datasets = list(tqdm(pool.imap(get_dataset_by_task_id, task_ids), total=len(task_ids)))

100%|██████████| 72/72 [03:20<00:00,  2.79s/it]


## Class imbalance

In [7]:
# Maps task id -> share of rows belonging to the rarer class, for every
# dataset whose label column has exactly two distinct values.
class_imbalance_dict = {}

for ds in datasets:
    # NOTE(review): the last column of X is taken as the label -- presumably
    # the target is stored last in every dataset; verify against the task's
    # declared target.
    target = ds['X'].iloc[:, -1]

    classes = target.unique()
    if len(classes) != 2:
        continue  # only binary problems are considered here

    n_rows = len(target)
    # Count each class with a boolean mask instead of filtering the whole
    # frame; the dataset id is used directly so the builtin `id` is not
    # rebound in the notebook namespace.
    class_imbalance_dict[ds['id']] = min(
        int((target == label).sum()) / n_rows for label in classes
    )

In [13]:
# Header naming the two fields of each pair displayed below.
print('(id, class_with_least_percetage_of_representation)')
# Most imbalanced datasets first: ascending by the rarer class's share.
sorted(class_imbalance_dict.items(), key=lambda id_and_share: id_and_share[1])

(id, class_with_least_percetage_of_representation)


[(146820, 0.05393676379417235),
 (3021, 0.0612407211028632),
 (9978, 0.06314127861089187),
 (3918, 0.06943192064923355),
 (146819, 0.08518518518518518),
 (3903, 0.10236724248240563),
 (14965, 0.11698480458295547),
 (3902, 0.12208504801097393),
 (167125, 0.13998170173833485),
 (167141, 0.1414),
 (3917, 0.15457562825983878),
 (3904, 0.1934772622875517),
 (3913, 0.2049808429118774),
 (10101, 0.23796791443850268),
 (7592, 0.23928176569346055),
 (9977, 0.2856230958943856),
 (9971, 0.2864493996569468),
 (9952, 0.2934863064396743),
 (31, 0.3),
 (9957, 0.33744075829383885),
 (15, 0.3447782546494993),
 (49, 0.3465553235908142),
 (37, 0.3489583333333333),
 (9946, 0.37258347978910367),
 (43, 0.39404477287546186),
 (125920, 0.42),
 (14954, 0.4222222222222222),
 (219, 0.4245453742937853),
 (14952, 0.44305744007236547),
 (10093, 0.4446064139941691),
 (29, 0.4449275362318841),
 (9910, 0.4577446014396161),
 (3, 0.47778473091364204),
 (167120, 0.4948297342192691),
 (9976, 0.5)]

## Number of labels

In [10]:
# Maps task id -> number of distinct values in the label column (missing
# values, if any, count as one value, as with len(series.unique())).
# NOTE(review): the last column of X is taken as the label -- presumably the
# target is stored last in every dataset; verify against the task's declared
# target.
# A comprehension keeps the builtin `id` from being rebound in the notebook
# namespace.
n_labels_dict = {
    ds['id']: ds['X'].iloc[:, -1].nunique(dropna=False)
    for ds in datasets
}

In [12]:
# Header naming the two fields of each pair displayed below.
print('(id, n_labels)')
# Datasets with the most labels first; ties keep their insertion order.
sorted(n_labels_dict.items(), key=lambda id_and_count: id_and_count[1], reverse=True)

(id, n_labels)


[(167121, 46),
 (6, 26),
 (3481, 26),
 (125922, 11),
 (3022, 11),
 (16, 10),
 (14, 10),
 (32, 10),
 (12, 10),
 (18, 10),
 (28, 10),
 (9964, 10),
 (22, 10),
 (3573, 10),
 (146824, 10),
 (146825, 10),
 (167124, 10),
 (9981, 9),
 (146800, 8),
 (146822, 7),
 (146817, 7),
 (2074, 6),
 (3560, 6),
 (9985, 6),
 (14970, 6),
 (2079, 5),
 (14969, 5),
 (53, 4),
 (3549, 4),
 (9960, 4),
 (146821, 4),
 (11, 3),
 (45, 3),
 (23, 3),
 (146195, 3),
 (167119, 3),
 (167140, 3),
 (31, 2),
 (10101, 2),
 (3913, 2),
 (3, 2),
 (3917, 2),
 (9957, 2),
 (9946, 2),
 (3918, 2),
 (3903, 2),
 (37, 2),
 (9971, 2),
 (9952, 2),
 (3902, 2),
 (49, 2),
 (43, 2),
 (9978, 2),
 (10093, 2),
 (219, 2),
 (9976, 2),
 (14965, 2),
 (9977, 2),
 (15, 2),
 (29, 2),
 (14952, 2),
 (125920, 2),
 (3904, 2),
 (9910, 2),
 (3021, 2),
 (7592, 2),
 (146820, 2),
 (146819, 2),
 (14954, 2),
 (167141, 2),
 (167120, 2),
 (167125, 2)]