## Listando os datasets mais desbalanceados

In [5]:
import openml
# Load the OpenML API key from a local file. Surrounding whitespace is
# stripped so that a trailing newline in the file does not end up in the key.
with open('apikey.txt', 'r') as f:
    openml.config.apikey = f.read().strip()

from tqdm import tqdm
import multiprocessing as mp

In [6]:
# The task ids were collected manually from the website given in the
# assignment description.
task_ids = [
    31, 10101, 3913, 3, 3917, 9957, 9946, 3918,
    3903, 37, 9971, 9952, 3902, 49, 43, 9978,
    10093, 219, 9976, 14965, 6, 9977, 53, 11,
    15, 16, 14, 32, 3549, 12, 9981, 18,
    28, 2074, 29, 45, 125922, 9960, 9964, 22,
    2079, 14969, 3560, 14952, 125920, 23, 3904, 3022,
    9985, 9910, 14970, 3021, 3481, 7592, 3573, 146824,
    146820, 146822, 146195, 146800, 146817, 146819, 146821, 167119,
    14954, 167141, 167140, 167120, 167125, 146825, 167124, 167121,
]

In [7]:
def get_dataset_by_task_id(id):
    """Fetch the dataset attached to an OpenML task and bundle it in a dict.

    Parameters
    ----------
    id : int
        OpenML task id.

    Returns
    -------
    dict
        Keys 'id', 'X', 'y', 'categorical_indicator' and 'attribute_names',
        holding the task id and the four values returned by ``get_data()``.
    """
    dataset = openml.tasks.get_task(id).get_dataset()
    # NOTE(review): get_data() is called without a target, so 'y' is presumably
    # None and 'X' still contains the label column -- confirm against openml.
    X, y, categorical_indicator, attribute_names = dataset.get_data()
    keys = ('id', 'X', 'y', 'categorical_indicator', 'attribute_names')
    return dict(zip(keys, (id, X, y, categorical_indicator, attribute_names)))

In [8]:
# Download every task's dataset, in task_ids order, with a progress bar.
datasets = list(map(get_dataset_by_task_id, tqdm(task_ids)))

100%|██████████| 72/72 [00:03<00:00, 20.06it/s]


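Abaixo definimos o valor de desbalanceamento `imbalance` como a proporção da classe majoritária, isto é, a maior fração de exemplos que pertencem a uma mesma label. Apenas datasets com exatamente duas labels são considerados.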

In [9]:
# One summary record per binary dataset: task id, imbalance and dimensions.
datasets_info = []

for ds in datasets:
    df = ds['X']
    # NOTE(review): the label is taken to be the last column of the frame --
    # confirm this matches each dataset's default target attribute.
    labels = df[df.columns[-1]]

    uniques = labels.unique()
    if len(uniques) != 2:
        # Only two-label datasets are considered.
        continue

    # Fraction of rows that belong to the most frequent label.
    imbalance = max((labels == u).sum() / len(df) for u in uniques)

    # The task id is read straight from the record instead of being bound to a
    # module-level name `id`, which would shadow the builtin in later cells.
    datasets_info.append({
        'id': ds['id'],
        'Imbalance': "{:.2f}".format(imbalance*100)+"%",
        'N Rows': len(df),
        'N Cols': df.shape[1]
    })

In [10]:
# Ordenando do mais desbalanceado para o menos
class_imbalance_list = sorted(datasets_info, key=lambda x: x['Imbalance'], reverse=True)
for l in class_imbalance_list:
    print(l)

{'id': 146820, 'Imbalance': '94.61%', 'N Rows': 4839, 'N Cols': 6}
{'id': 3021, 'Imbalance': '93.88%', 'N Rows': 3772, 'N Cols': 30}
{'id': 9978, 'Imbalance': '93.69%', 'N Rows': 2534, 'N Cols': 73}
{'id': 3918, 'Imbalance': '93.06%', 'N Rows': 1109, 'N Cols': 22}
{'id': 146819, 'Imbalance': '91.48%', 'N Rows': 540, 'N Cols': 19}
{'id': 3903, 'Imbalance': '89.76%', 'N Rows': 1563, 'N Cols': 38}
{'id': 14965, 'Imbalance': '88.30%', 'N Rows': 45211, 'N Cols': 17}
{'id': 3902, 'Imbalance': '87.79%', 'N Rows': 1458, 'N Cols': 38}
{'id': 167125, 'Imbalance': '86.00%', 'N Rows': 3279, 'N Cols': 1559}
{'id': 167141, 'Imbalance': '85.86%', 'N Rows': 5000, 'N Cols': 21}
{'id': 3917, 'Imbalance': '84.54%', 'N Rows': 2109, 'N Cols': 22}
{'id': 3904, 'Imbalance': '80.65%', 'N Rows': 10885, 'N Cols': 22}
{'id': 3913, 'Imbalance': '79.50%', 'N Rows': 522, 'N Cols': 22}
{'id': 10101, 'Imbalance': '76.20%', 'N Rows': 748, 'N Cols': 5}
{'id': 7592, 'Imbalance': '76.07%', 'N Rows': 48842, 'N Cols': 15}


## Testando os datasets na nossa implementação do modelo

In [12]:
import itertools
import numpy as np
import pandas as pd
import plotly.express as px
from copy import deepcopy
from sklearn.model_selection import train_test_split

from decisiontree import DecisionTree

Selecionamos os 20 datasets mais desbalanceados, embora posteriormente usemos apenas os 10 mais desbalanceados, pois alguns deles se mostraram problemáticos e foram descartados.

In [13]:
def get_result(dataset):
    """Train and score the decision tree on one dataset, without and with the desimbalancer.

    Parameters
    ----------
    dataset : dict
        Summary record containing at least an 'id' key with the OpenML task
        id. It is deep-copied into every returned row.

    Returns
    -------
    list of dict or None
        Two rows, desimbalancer disabled first and enabled second. Each row is
        a copy of ``dataset`` extended with 'Accuracy', 'Class 1 True Rate',
        'Class 2 True Rate' (formatted percentages) and 'Desimbalancer?'.
        ``None`` is returned when any step raises; a ``RuntimeWarning`` naming
        the task and the error is emitted so the failure is not silent.
    """
    import warnings

    def _as_percent(fraction):
        # Same formatting as the dataset summary records, e.g. 0.9461 -> '94.61%'.
        return "{:.2f}".format(fraction*100)+"%"

    task_id = None
    try:
        task_id = dataset['id']
        task = openml.tasks.get_task(task_id)
        dataframe, _, _, _ = task.get_dataset().get_data()
        dataframe = dataframe.dropna()

        # The last column is used as the label, all others as features.
        X = dataframe.values[:, :-1]
        y = dataframe.values[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42, stratify=y)

        # This maximum depth is used because it is a common rule of thumb.
        max_depth = int(np.sqrt(X.shape[1]))

        lines = []
        # Same split and depth for both runs; only the desimbalancer flag changes.
        for use_desimbalancer in (False, True):
            dt = DecisionTree(max_depth=max_depth, desimbalancer=use_desimbalancer)
            dt.fit(X_train, y_train)
            res = dt.evaluate(X_test, y_test)
            c1, c2 = dt.trp(X_test, y_test)

            line = deepcopy(dataset)
            line['Accuracy'] = _as_percent(res)
            line['Class 1 True Rate'] = _as_percent(c1)
            line['Class 2 True Rate'] = _as_percent(c2)
            line['Desimbalancer?'] = use_desimbalancer
            lines.append(line)

        return lines
    except Exception as exc:
        # Best effort: a problematic dataset is skipped rather than aborting
        # the whole run, but the reason is reported.
        warnings.warn(
            "Skipping task {}: {!r}".format(task_id, exc),
            RuntimeWarning,
        )
        return None

In [14]:
# Evaluate the (up to) 20 most imbalanced datasets, with a progress bar.
results = list(map(get_result, tqdm(class_imbalance_list[:20])))

100%|██████████| 20/20 [03:05<00:00,  9.30s/it]


In [15]:
# Filtragem de somente os resultados válidos
results = [r for r in results if r is not None]

# Transformando a lista de listas em uma só
results = list(itertools.chain.from_iterable(results))

In [16]:
# Preview the first five result rows.
pd.DataFrame(data=results).head(n=5)

Unnamed: 0,id,Imbalance,N Rows,N Cols,Accuracy,Class 1 True Rate,Class 2 True Rate,Desimbalancer?
0,146820,94.61%,4839,6,94.61%,0.00%,100.00%,False
1,146820,94.61%,4839,6,53.41%,52.15%,75.58%,True
2,9978,93.69%,2534,73,93.31%,5.66%,99.23%,False
3,9978,93.69%,2534,73,70.85%,66.04%,71.17%,True
4,3918,93.06%,1109,22,93.17%,0.00%,100.00%,False


Conversão dos resultados para um formato mais conveniente para os plots

In [17]:
# Build the plotting frame: task ids as strings, percentage strings parsed
# back to floats, ordered from most to least imbalanced, first 20 rows kept.
plot_df = (
    pd.DataFrame(results)
    .assign(
        id=lambda frame: frame['id'].astype(str),
        Accuracy=lambda frame: frame['Accuracy'].str.replace('%', '', regex=False).astype(float),
        Imbalance=lambda frame: frame['Imbalance'].str.replace('%', '', regex=False).astype(float),
    )
    .sort_values('Imbalance', ascending=False)
    .iloc[:20]
)

In [18]:
# Grouped bars: one pair (desimbalancer off / on) per task id.
fig = px.bar(
    plot_df, x='id', y='Accuracy', color='Desimbalancer?', barmode='group',
    title='Accuracy do modelo para os 10 datasets mais desbalanceados',
    # `labels` maps column name -> displayed label, so the key must be the
    # 'Accuracy' column for the axis title to pick up the unit.
    labels={
        'id': 'ID da task',
        'Accuracy': 'Accuracy (%)',
        'Desimbalancer?': 'Peso de balanceamento<br>(nossa heurística)'
    },
    custom_data=['Accuracy', 'Imbalance', 'N Rows', 'N Cols']
)

# Custom hover text built from the columns passed through custom_data.
fig.update_traces(
    hovertemplate="<br>".join([
        "Accuracy (%): %{customdata[0]:.2f}",
        "Imbalance: %{customdata[1]:.2f}",
        "N Rows: %{customdata[2]}",
        "N Cols: %{customdata[3]}",
    ])
)


True Positive Rate dos diferentes algoritmos de balanceamento

In [19]:
# Single-task experiment on task 146819: load the data, drop rows with
# missing values and make a stratified train/test split.
task = openml.tasks.get_task(146819)
dataframe, *_ = task.get_dataset().get_data()
dataframe = dataframe.dropna()

# The last column is used as the label, all others as features.
X, y = dataframe.values[:, :-1], dataframe.values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.33,
    random_state=42,
    stratify=y,
)

max_depth = int(np.sqrt(X.shape[1]))

# Show the distinct labels present in this dataset.
set(y)

{'0', '1'}

In [20]:
# Split the test set by true label: the *_1 arrays hold the rows labelled '0'
# and the *_2 arrays the rows labelled '1'.
_by_label = {'0': ([], []), '1': ([], [])}
for x_i, y_i in zip(X_test, y_test):
    if y_i not in _by_label:
        raise Exception('''This shouldn't happen''')
    xs, ys = _by_label[y_i]
    xs.append(x_i)
    ys.append(y_i)

X_test_1, y_test_1 = (np.array(part) for part in _by_label['0'])
X_test_2, y_test_2 = (np.array(part) for part in _by_label['1'])

In [38]:
# Accumulates one record per (algorithm, class) pair for the plot below.
lines = list()

In [39]:
# Fit one tree per balancing function and record its score on each
# single-class slice of the test set (as a percentage).
for alg in ('exp', 'linear', 'inv_exp', 'linear_compensated'):
    dt = DecisionTree(max_depth=max_depth, desimbalancer=True, desimbalancer_func=alg)
    dt.fit(X_train, y_train)

    lines.extend([
        {'alg': alg, 'result': dt.evaluate(X_part, y_part) * 100, 'label': label}
        for label, X_part, y_part in (
            ('1', X_test_1, y_test_1),
            ('2', X_test_2, y_test_2),
        )
    ])

In [40]:
# Baseline: the same tree without the desimbalancer, scored on the same
# single-class slices of the test set.
dt = DecisionTree(max_depth=max_depth, desimbalancer=False)
dt.fit(X_train, y_train)

lines.extend([
    {'alg': 'no desimbalancer', 'result': dt.evaluate(X_part, y_part) * 100, 'label': label}
    for label, X_part, y_part in (
        ('1', X_test_1, y_test_1),
        ('2', X_test_2, y_test_2),
    )
])

In [45]:
# Turn the collected per-algorithm records into a frame for plotting.
plot_df = pd.DataFrame(data=lines)

In [43]:
# Grouped bars: one pair of per-class scores for each balancing algorithm.
axis_labels = {
    'alg': 'Algoritmo',
    'result': 'Accuracy (%)',
    'label': 'Label',
}
fig = px.bar(
    plot_df,
    x='alg',
    y='result',
    color='label',
    barmode='group',
    title='Accuracy do modelo para os diferentes algoritmos de balanceamento na task 146819',
    labels=axis_labels,
)

fig.show()
