In [2]:
import openml
openml.config.apikey = 'eb60f811ca0d9f4f846d59be082919f9'

from tqdm import tqdm

import multiprocessing as mp

In [3]:
task_ids = [31, 10101, 3913, 3, 3917, 9957, 9946, 3918, 3903, 37, 9971, 9952, 3902, 49, 43, 9978, 10093, 219, 9976, 14965, 6, 9977, 53, 11, 15, 16, 14, 32, 3549, 12, 9981, 18, 28, 2074, 29, 45, 125922, 9960, 9964, 22, 2079,
            14969, 3560, 14952, 125920, 23, 3904, 3022, 9985, 9910, 14970, 3021, 3481, 7592, 3573, 146824, 146820, 146822, 146195, 146800, 146817, 146819, 146821, 167119, 14954, 167141, 167140, 167120, 167125, 146825, 167124, 167121]


In [4]:
def get_dataset_by_task_id(id):
    task = openml.tasks.get_task(id)
    X, y, categorical_indicator, attribute_names = task.get_dataset().get_data()
    return {
        'id': id, 'X': X, 'y': y,
        'categorical_indicator': categorical_indicator,
        'attribute_names': attribute_names
    }

pool = mp.Pool(mp.cpu_count())
datasets = list(tqdm(pool.imap(get_dataset_by_task_id, task_ids), total=len(task_ids)))

100%|██████████| 72/72 [00:04<00:00, 15.94it/s]


## Class imbalance

In [5]:
class_imbalance_list = []

for ds in datasets:
    df = ds['X']
    last_col = df.columns[-1]
    
    uniques = df[last_col].unique()
    if len(uniques) == 2:
        perc = min(len(df[df[last_col] == u]) / len(df) for u in uniques)

        id = ds['id']
        class_imbalance_list.append({
            'id': id,
            'perc': perc,
            'n_rows': len(df),
            'n_cols': df.shape[1]
        })

In [6]:
# print('(id, class_with_least_percentage_of_representation)')
# sorted(class_imbalance_dict.items(), key=lambda x: x[1])
class_imbalance_list = sorted(class_imbalance_list, key=lambda x: x['perc'])
for l in class_imbalance_list:
    print(l)

{'id': 146820, 'perc': 0.05393676379417235, 'n_rows': 4839, 'n_cols': 6}
{'id': 3021, 'perc': 0.0612407211028632, 'n_rows': 3772, 'n_cols': 30}
{'id': 9978, 'perc': 0.06314127861089187, 'n_rows': 2534, 'n_cols': 73}
{'id': 3918, 'perc': 0.06943192064923355, 'n_rows': 1109, 'n_cols': 22}
{'id': 146819, 'perc': 0.08518518518518518, 'n_rows': 540, 'n_cols': 19}
{'id': 3903, 'perc': 0.10236724248240563, 'n_rows': 1563, 'n_cols': 38}
{'id': 14965, 'perc': 0.11698480458295547, 'n_rows': 45211, 'n_cols': 17}
{'id': 3902, 'perc': 0.12208504801097393, 'n_rows': 1458, 'n_cols': 38}
{'id': 167125, 'perc': 0.13998170173833485, 'n_rows': 3279, 'n_cols': 1559}
{'id': 167141, 'perc': 0.1414, 'n_rows': 5000, 'n_cols': 21}
{'id': 3917, 'perc': 0.15457562825983878, 'n_rows': 2109, 'n_cols': 22}
{'id': 3904, 'perc': 0.1934772622875517, 'n_rows': 10885, 'n_cols': 22}
{'id': 3913, 'perc': 0.2049808429118774, 'n_rows': 522, 'n_cols': 22}
{'id': 10101, 'perc': 0.23796791443850268, 'n_rows': 748, 'n_cols': 5}

In [7]:
from decisiontree import DecisionTree

from copy import deepcopy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import openml
with open('apikey.txt', 'r') as f:
    openml.config.apikey = f.read()

In [8]:
def get_result(dataset):
    try:
        id = dataset['id']
        task = openml.tasks.get_task(id)
        data, _, _, _ = task.get_dataset().get_data()
        data = data.dropna()

        X = data.values[:, :-1]
        y = data.values[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

        max_depth = int(np.sqrt(X.shape[1]))

        dt = DecisionTree(max_depth=max_depth, desimbalancer=False)
        dt.fit(X_train, y_train)
        res_false = dt.evaluate(X_test, y_test)

        dt = DecisionTree(max_depth=max_depth, desimbalancer=True)
        dt.fit(X_train, y_train)
        res_true = dt.evaluate(X_test, y_test)

        line = deepcopy(dataset)
        line['result_false'] = res_false
        line['result_true'] = res_true
        
        return line
    except Exception:
        return None
    
pool = mp.Pool(mp.cpu_count())
results = list(tqdm(pool.imap(get_result, class_imbalance_list[:20]), total=20))
    

100%|██████████| 20/20 [01:34<00:00,  4.71s/it]


In [9]:
results = [r for r in results if r is not None]
results
len(results)

19

In [10]:
results = results[:10]

In [11]:
original_results_df = pd.DataFrame(results)
original_results_df

Unnamed: 0,id,perc,n_rows,n_cols,result_false,result_true
0,146820,0.053937,4839,6,0.945523,0.536005
1,9978,0.063141,2534,73,0.936679,0.734767
2,3918,0.069432,1109,22,0.918033,0.598361
3,146819,0.085185,540,19,0.899441,0.625698
4,3903,0.102367,1563,38,0.889535,0.647287
5,14965,0.116985,45211,17,0.884249,0.702882
6,3902,0.122085,1458,38,0.865145,0.661826
7,167125,0.139982,3279,1559,0.954755,0.946445
8,167141,0.1414,5000,21,0.858788,0.68303
9,3917,0.154576,2109,22,0.844828,0.622126


In [13]:
import plotly.express as px

new_results = []
for r in sorted(results, key=lambda x: x['imbalance'], reverse=False):
    new_line1 = deepcopy(r)
    new_line2 = deepcopy(r)

    new_line1['desimbalancer'] = False
    new_line1['result'] = new_line1['result_false']
    del new_line1['result_false']
    del new_line1['result_true']

    new_line2['desimbalancer'] = True
    new_line2['result'] = new_line2['result_true']
    del new_line2['result_false']
    del new_line2['result_true']

    new_results.append(new_line1)
    new_results.append(new_line2)

results_df = pd.DataFrame(new_results)
results_df['id'] = results_df['id'].astype(str)

fig = px.bar(results_df, x='id', y='result', color='desimbalancer', barmode='group',
    #    title='Quantidade de memória utilizada por tabuleiro',
    #    labels={
    #         'game_id': 'Identificador do tabuleiro',
    #         'memory_used': 'Quantidade de blocos de memória utilizada',
    #         'alg': 'Algoritmo'
    #    }
    custom_data=['result', 'imbalance', 'n_rows', 'n_cols']
)

fig.update_traces(
    hovertemplate="<br>".join([
        "Evaluation: %{customdata[0]:.4f}",
        "Imbalance: %{customdata[1]:.4f}",
        "N Rows: %{customdata[2]}",
        "N Cols: %{customdata[3]}",
    ])
)

KeyError: 'imbalance'

## Amount of labels

In [10]:
n_labels_dict = dict()

for ds in datasets:
    df = ds['X']
    id = ds['id']
    last_col = df.columns[-1]
    n_labels_dict[id] = len(df[last_col].unique())

In [12]:
print('(id, n_labels)')
sorted(n_labels_dict.items(), key=lambda x: x[1], reverse=True)

(id, n_labels)


[(167121, 46),
 (6, 26),
 (3481, 26),
 (125922, 11),
 (3022, 11),
 (16, 10),
 (14, 10),
 (32, 10),
 (12, 10),
 (18, 10),
 (28, 10),
 (9964, 10),
 (22, 10),
 (3573, 10),
 (146824, 10),
 (146825, 10),
 (167124, 10),
 (9981, 9),
 (146800, 8),
 (146822, 7),
 (146817, 7),
 (2074, 6),
 (3560, 6),
 (9985, 6),
 (14970, 6),
 (2079, 5),
 (14969, 5),
 (53, 4),
 (3549, 4),
 (9960, 4),
 (146821, 4),
 (11, 3),
 (45, 3),
 (23, 3),
 (146195, 3),
 (167119, 3),
 (167140, 3),
 (31, 2),
 (10101, 2),
 (3913, 2),
 (3, 2),
 (3917, 2),
 (9957, 2),
 (9946, 2),
 (3918, 2),
 (3903, 2),
 (37, 2),
 (9971, 2),
 (9952, 2),
 (3902, 2),
 (49, 2),
 (43, 2),
 (9978, 2),
 (10093, 2),
 (219, 2),
 (9976, 2),
 (14965, 2),
 (9977, 2),
 (15, 2),
 (29, 2),
 (14952, 2),
 (125920, 2),
 (3904, 2),
 (9910, 2),
 (3021, 2),
 (7592, 2),
 (146820, 2),
 (146819, 2),
 (14954, 2),
 (167141, 2),
 (167120, 2),
 (167125, 2)]