In [1]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from tqdm.notebook import tqdm
from algo import AdaFair, AdaBoost, SMOTEBoost, SMOTEBoostProtected, AdaFairCorrect

from utils import get_dataset
from metrics import calculate_metrics

In [None]:
# Benchmark datasets and the boosting variants compared on each of them.
dataset_list = ['adult', 'bank', 'compass', 'kdd']
algorithms = [AdaBoost, AdaFair, SMOTEBoost]

# Both result tables are keyed by (dataset name, algorithm class name).
train_metrics = {}
test_metrics = {}

for dataset in tqdm(dataset_list):
    print(f'Dataset {dataset}')

    # get_dataset returns features, labels and the protected-group indicator
    # for the train and the test part, in that order.
    (
        X_train, y_train, is_protected_train,
        X_test, y_test, is_protected_test,
    ) = get_dataset(dataset, test_size=0.5)

    # Quick sanity summary of the split before any model is trained.
    print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')
    print(f'Positive ratio (train): {y_train.mean():.3f}, Positive ratio (test): {y_test.mean():.3f}')
    print(f'Protected ratio (train): {is_protected_train.mean():.3f}, Protected ratio (test): {is_protected_test.mean():.3f}')

    # Nested progress bar over the algorithms, shown on the second row.
    for model_class in tqdm(algorithms, position=1, leave=False):
        # AdaFair on compass is the only combination built with explicit
        # hyperparameters; every other one uses the constructor defaults.
        needs_override = dataset == 'compass' and model_class.__name__ == 'AdaFair'
        model = model_class(n_estimators=50, u_weight=1) if needs_override else model_class()
        model.fit(X_train, y_train, is_protected_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        result_key = (dataset, model_class.__name__)
        train_metrics[result_key] = calculate_metrics(y_train, y_train_pred, is_protected_train)
        test_metrics[result_key] = calculate_metrics(y_test, y_test_pred, is_protected_test)

    print()

  0%|          | 0/4 [00:00<?, ?it/s]

Dataset adult
Train size: (24421, 108), Test size: (24421, 108)
Positive ratio (train): 0.239, Positive ratio (test): 0.239
Protected ratio (train): 0.332, Protected ratio (test): 0.332


  0%|          | 0/3 [00:00<?, ?it/s]


Dataset bank
Train size: (22605, 51), Test size: (22606, 51)
Positive ratio (train): 0.117, Positive ratio (test): 0.117
Protected ratio (train): 0.602, Protected ratio (test): 0.602


  0%|          | 0/3 [00:00<?, ?it/s]


Dataset compass
Train size: (3086, 11), Test size: (3086, 11)
Positive ratio (train): 0.455, Positive ratio (test): 0.455
Protected ratio (train): 0.191, Protected ratio (test): 0.190


  0%|          | 0/3 [00:00<?, ?it/s]


Dataset kdd
Train size: (149642, 407), Test size: (149643, 407)
Positive ratio (train): 0.062, Positive ratio (test): 0.062
Protected ratio (train): 0.520, Protected ratio (test): 0.520


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the test-set metrics collected by the evaluation loop, keyed by (dataset, algorithm class name).
test_metrics

In [None]:
def plot_metrics(ax, metrics, dataset, algos=None):
    """Draw a grouped bar chart of all metrics of one dataset, one bar per algorithm.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    metrics : mapping
        Maps ``(dataset, algorithm class name)`` to a mapping
        ``metric name -> value``.
    dataset : str
        Dataset whose metrics are plotted.
    algos : sequence of classes, optional
        Algorithms to include, in bar order. Only their ``__name__`` is used.
        Defaults to the notebook-level ``algorithms`` list.

    Notes
    -----
    The metric names (x tick labels) are taken from the first algorithm's
    entry; every other algorithm must provide the same metric names.
    """
    if algos is None:
        algos = algorithms
    algo_names = [algo.__name__ for algo in algos]

    # Materialise the names once so the same ordered list drives both the
    # bar heights and the tick labels.
    metric_names = list(metrics[dataset, algo_names[0]])
    x = np.arange(len(metric_names))

    # Bars are 0.2 wide; shrink them only when there are so many algorithms
    # that one metric group would otherwise run into the next one.
    width = min(0.2, 0.8 / len(algo_names))

    # grid() without arguments *toggles* visibility, which would switch the
    # grid off on a second call or under a style that already enables it.
    ax.yaxis.grid(True)

    for i, algo_name in enumerate(algo_names):
        measurement = [metrics[dataset, algo_name][m] for m in metric_names]
        ax.bar(x + width * i, measurement, width, label=algo_name)

    ax.set_ylim([0, 1])
    ax.legend(loc='lower right')
    # Centre each tick label under its group of bars.
    ax.set_xticks(x + width * (len(algo_names) - 1) / 2, metric_names, rotation=30,
                  rotation_mode="anchor", horizontalalignment='right', verticalalignment='top')

# One figure per split: a single row of panels, one panel per dataset.
for metrics, name in ((test_metrics, 'test'), (train_metrics, 'train')):
    n_panels = len(dataset_list)
    _, axes = plt.subplots(1, n_panels, figsize=(15, 3))
    # With a single panel plt.subplots returns a bare Axes rather than an
    # array, so wrap it to keep the pairing below uniform.
    axes = [axes] if n_panels == 1 else axes
    print(name, 'metrics:')
    for ax, dataset in zip(axes, dataset_list):
        plot_metrics(ax, metrics, dataset)
        ax.set_title(dataset)
    plt.show()