## Listando os datasets mais desbalanceados

In [1]:
import openml
# Load the OpenML API key from a local file so it is not hard-coded in the notebook.
with open('apikey.txt', 'r') as f:
    # Strip surrounding whitespace: text files usually end with a newline, which
    # would otherwise become part of the key.
    openml.config.apikey = f.read().strip()

from tqdm import tqdm
import multiprocessing as mp

In [2]:
# Task ids collected by hand from the website given in the assignment description.
task_ids = [31, 10101, 3913, 3, 3917, 9957, 9946, 3918, 3903, 37, 9971, 9952, 3902, 49, 43, 9978, 10093, 219, 9976, 14965, 6, 9977, 53, 11, 15, 16, 14, 32, 3549, 12, 9981, 18, 28, 2074, 29, 45, 125922, 9960, 9964, 22, 2079,
            14969, 3560, 14952, 125920, 23, 3904, 3022, 9985, 9910, 14970, 3021, 3481, 7592, 3573, 146824, 146820, 146822, 146195, 146800, 146817, 146819, 146821, 167119, 14954, 167141, 167140, 167120, 167125, 146825, 167124, 167121]

In [3]:
def get_dataset_by_task_id(id):
    """Fetch the dataset behind an OpenML task and bundle it into a dict.

    Parameters
    ----------
    id
        Task id passed to ``openml.tasks.get_task``.

    Returns
    -------
    dict
        Keys ``'id'`` (the argument, unchanged), ``'X'``, ``'y'``,
        ``'categorical_indicator'`` and ``'attribute_names'``; the last four are
        the four values returned by the dataset's ``get_data()``, in that order.
    """
    dataset = openml.tasks.get_task(id).get_dataset()
    # get_data() is called with no arguments; its 4-tuple is unpacked as is.
    features, target, is_categorical, names = dataset.get_data()
    return dict(
        id=id,
        X=features,
        y=target,
        categorical_indicator=is_categorical,
        attribute_names=names,
    )

# Download every task's dataset in parallel, one worker per CPU core.
# The `with` block shuts the pool down once all results have been collected,
# so worker processes are not left running for the rest of the kernel session.
with mp.Pool(mp.cpu_count()) as pool:
    # imap preserves input order, so datasets[i] corresponds to task_ids[i].
    datasets = list(tqdm(pool.imap(get_dataset_by_task_id, task_ids), total=len(task_ids)))

100%|██████████| 72/72 [00:04<00:00, 17.31it/s]


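Abaixo definimos o valor de desbalanceamento `imbalance` como a proporção da classe majoritária, isto é, a maior fração de exemplos que pertencem a uma mesma label do dataset. A label é lida da última coluna do dataset, e apenas datasets binários (exatamente duas labels distintas) são considerados.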

In [4]:
# One summary dict per binary dataset: task id, imbalance, number of rows and columns.
datasets_info = []

for ds in datasets:
    df = ds['X']
    n_rows = len(df)
    # The last column of the frame is treated as the label column.
    labels = df[df.columns[-1]]

    uniques = labels.unique()
    # Only binary problems (exactly two distinct label values) are summarised.
    if len(uniques) != 2:
        continue

    # Imbalance = share of rows held by the most frequent label. Counting the
    # boolean mask avoids building a filtered copy of the frame per label.
    imbalance = max(int((labels == u).sum()) / n_rows for u in uniques)

    # ds['id'] is used directly so the builtin `id` is not rebound at notebook scope.
    datasets_info.append({
        'id': ds['id'],
        'imbalance': imbalance,
        'n_rows': n_rows,
        'n_cols': df.shape[1]
    })

In [5]:
# Rank the summaries from the most to the least imbalanced, then print one per line.
class_imbalance_list = sorted(
    datasets_info,
    key=lambda info: info['imbalance'],
    reverse=True,
)
for info in class_imbalance_list:
    print(info)

{'id': 146820, 'imbalance': 0.9460632362058277, 'n_rows': 4839, 'n_cols': 6}
{'id': 3021, 'imbalance': 0.9387592788971368, 'n_rows': 3772, 'n_cols': 30}
{'id': 9978, 'imbalance': 0.9368587213891081, 'n_rows': 2534, 'n_cols': 73}
{'id': 3918, 'imbalance': 0.9305680793507665, 'n_rows': 1109, 'n_cols': 22}
{'id': 146819, 'imbalance': 0.9148148148148149, 'n_rows': 540, 'n_cols': 19}
{'id': 3903, 'imbalance': 0.8976327575175944, 'n_rows': 1563, 'n_cols': 38}
{'id': 14965, 'imbalance': 0.8830151954170445, 'n_rows': 45211, 'n_cols': 17}
{'id': 3902, 'imbalance': 0.877914951989026, 'n_rows': 1458, 'n_cols': 38}
{'id': 167125, 'imbalance': 0.8600182982616651, 'n_rows': 3279, 'n_cols': 1559}
{'id': 167141, 'imbalance': 0.8586, 'n_rows': 5000, 'n_cols': 21}
{'id': 3917, 'imbalance': 0.8454243717401612, 'n_rows': 2109, 'n_cols': 22}
{'id': 3904, 'imbalance': 0.8065227377124483, 'n_rows': 10885, 'n_cols': 22}
{'id': 3913, 'imbalance': 0.7950191570881227, 'n_rows': 522, 'n_cols': 22}
{'id': 10101, '

## Testando os datasets na nossa implementação do modelo

In [6]:
from decisiontree import DecisionTree

from copy import deepcopy
import itertools
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

Selecionamos os 20 datasets mais desbalanceados, apesar de usarmos apenas os 10 mais posteriormente, pois alguns deles se mostraram problemáticos e então foram descartados.

In [7]:
def get_result(dataset):
    """Train and score the decision tree on one dataset, with and without the balancing weight.

    The dataset of the OpenML task ``dataset['id']`` is fetched, rows containing
    missing values are dropped, the last column is used as the target, and a
    stratified 67/33 train/test split (``random_state=42``) is made. A
    ``DecisionTree`` is then fitted twice, with ``desimbalancer=False`` and with
    ``desimbalancer=True``, and evaluated on the test split.

    Parameters
    ----------
    dataset : dict
        Summary entry; only its ``'id'`` key is read. The dict is not modified.

    Returns
    -------
    list of dict or None
        Two deep copies of ``dataset`` (``desimbalancer=False`` first, then
        ``True``), each extended with ``'result'`` (the value returned by
        ``DecisionTree.evaluate``) and ``'desimbalancer'``.
        ``None`` if anything fails for this dataset; in that case a
        ``RuntimeWarning`` describing the failure is emitted instead of the
        error being dropped silently.
    """
    import warnings  # local import: keeps this cell self-contained

    try:
        task_id = dataset['id']
        task = openml.tasks.get_task(task_id)
        dataframe, _, _, _ = task.get_dataset().get_data()
        dataframe = dataframe.dropna()

        values = dataframe.to_numpy()
        X = values[:, :-1]
        y = values[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42, stratify=y)

        # Maximum depth = floor(sqrt(number of feature columns)), used as a rule of thumb.
        max_depth = int(np.sqrt(X.shape[1]))

        lines = []
        # Same split and depth for both runs; only the balancing flag changes.
        for use_balancer in (False, True):
            dt = DecisionTree(max_depth=max_depth, desimbalancer=use_balancer)
            dt.fit(X_train, y_train)

            line = deepcopy(dataset)
            line['result'] = dt.evaluate(X_test, y_test)
            line['desimbalancer'] = use_balancer
            lines.append(line)

        return lines
    except Exception as exc:
        # Best effort: a problematic dataset is skipped (the caller filters out
        # None), but the reason is reported rather than swallowed.
        warnings.warn(f'get_result: skipping dataset {dataset!r}: {exc!r}', RuntimeWarning)
        return None
    
# Evaluate the (up to) 20 most imbalanced datasets in parallel, one worker per CPU core.
top_datasets = class_imbalance_list[:20]
# The `with` block shuts the pool down once every result has been collected.
with mp.Pool(mp.cpu_count()) as pool:
    # total follows the real number of datasets, so the progress bar stays
    # correct when fewer than 20 are available.
    results = list(tqdm(pool.imap(get_result, top_datasets), total=len(top_datasets)))

100%|██████████| 20/20 [01:37<00:00,  4.90s/it]


In [8]:
# Keep only the valid results (get_result returns None on failure) and flatten
# the list of per-dataset lists into a single list of result dicts.
# Entries that are already dicts are passed through unchanged, so re-running
# this cell does not turn the result dicts into their keys.
results = [
    line
    for entry in results
    if entry is not None
    for line in ([entry] if isinstance(entry, dict) else entry)
]

In [9]:
# Preview the first rows of the flattened results as a table.
pd.DataFrame(results).head()

Unnamed: 0,id,imbalance,n_rows,n_cols,result,desimbalancer
0,146820,0.946063,4839,6,0.946149,False
1,146820,0.946063,4839,6,0.534126,True
2,9978,0.936859,2534,73,0.933094,False
3,9978,0.936859,2534,73,0.708483,True
4,3918,0.930568,1109,22,0.931694,False


Adequação dos resultados para um formato mais conveniente para os plots.

In [10]:
# Plot-ready frame: ids as strings (so they are plotted as categories rather than
# numbers), rows ordered from most to least imbalanced, first 20 rows kept.
plot_df = (
    pd.DataFrame(results)
    .assign(id=lambda frame: frame['id'].astype(str))
    .sort_values('imbalance', ascending=False)
    .iloc[:20]
)

In [11]:
# Hover text shown for each bar; the indices follow the order of `custom_data` below.
hover_lines = [
    "Accuracy: %{customdata[0]:.4f}",
    "Imbalance: %{customdata[1]:.4f}",
    "N Rows: %{customdata[2]}",
    "N Cols: %{customdata[3]}",
]

# Grouped bars: one group per task id, one bar per value of `desimbalancer`.
fig = px.bar(
    plot_df,
    x='id',
    y='result',
    color='desimbalancer',
    barmode='group',
    title='Accuracy do modelo para os 10 datasets mais desbalanceados',
    labels={
        'id': 'ID da task',
        'result': 'Accuracy',
        'desimbalancer': 'Peso de balanceamento<br>(nossa heurística)',
    },
    custom_data=['result', 'imbalance', 'n_rows', 'n_cols'],
)

# update_traces returns the figure, so as the last expression it is also displayed.
fig.update_traces(hovertemplate="<br>".join(hover_lines))

True Positive Rate dos diferentes algoritmos de balanceamento

In [12]:
# Load the dataset of task 146819 and drop rows with missing values.
task = openml.tasks.get_task(146819)
dataframe, *_ = task.get_dataset().get_data()
dataframe = dataframe.dropna()

# Every column but the last is a feature; the last column is the target.
values = dataframe.values
X, y = values[:, :-1], values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=42, stratify=y,
)

# Maximum depth = floor(sqrt(number of feature columns)).
max_depth = int(np.sqrt(X.shape[1]))

# Show the distinct label values.
set(y)

{'0', '1'}

In [13]:
# Split the test set by true label: the *_1 arrays hold the rows labelled '0',
# the *_2 arrays the rows labelled '1'.
labelled_rows = list(zip(X_test, y_test))

# Any other label value is unexpected for this task.
if any(not (label == '0' or label == '1') for _, label in labelled_rows):
    raise Exception('''This shouldn't happen''')

X_test_1 = np.array([row for row, label in labelled_rows if label == '0'])
X_test_2 = np.array([row for row, label in labelled_rows if label == '1'])
y_test_1 = np.array([label for _, label in labelled_rows if label == '0'])
y_test_2 = np.array([label for _, label in labelled_rows if label == '1'])

In [14]:
# Score each balancing function separately on the test rows of each class.
lines = []
for alg in ['exp', 'linear', 'inv_exp']:
    dt = DecisionTree(max_depth=max_depth, desimbalancer=True, desimbalancer_func=alg)
    dt.fit(X_train, y_train)

    # X_test_1 / y_test_1 hold the rows whose true label is '0', and
    # X_test_2 / y_test_2 those whose true label is '1'. Each score is tagged
    # with the actual class value so the plot legend matches the data.
    for label, X_part, y_part in (('0', X_test_1, y_test_1), ('1', X_test_2, y_test_2)):
        lines.append({'alg': alg, 'result': dt.evaluate(X_part, y_part), 'label': label})

In [15]:
# Collect the per-algorithm, per-label scores into a frame for plotting.
plot_df = pd.DataFrame(lines)

In [18]:
# Grouped bars: one group per balancing algorithm, one bar per label.
fig = px.bar(
    plot_df,
    x='alg',
    y='result',
    color='label',
    barmode='group',
    title='Accuracy do modelo para os diferentes algoritmos de balanceamento na task 146819',
    labels={'alg': 'Algoritmo', 'result': 'Accuracy', 'label': 'Label'},
)

fig.show()