# Get the graph for model performance and fairness
## Imports

In [1]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from lib.utils import *

# Display names of the three face attributes. The position in this list is the
# attribute index used to address the stats arrays (0: Race, 1: Gender, 2: Age);
# the names are used as subplot titles and report headers.
faceattrmodel_attributes = ['Race', 'Gender', 'Age']

def resolve_categori_stat(attr, stat, length):
    """Compute per-epoch accuracy statistics for one attribute.

    ``stat`` is indexed as ``stat[epoch, attr, k]`` where the last axis holds
    four counts: group 1 correct, group 1 wrong, group 2 correct,
    group 2 wrong.

    Args:
        attr: index of the attribute on the second axis of ``stat``.
        stat: array of counts, shape (N, attributes, 4).
        length: number of leading epochs to evaluate.

    Returns:
        A tuple of four lists, each with ``length`` entries: group 1 accuracy,
        group 2 accuracy, total accuracy, and the absolute accuracy difference
        between the two groups.
    """
    def epoch_metrics(epoch):
        # unpack the four counts of this epoch in their storage order
        g1_hit, g1_miss, g2_hit, g2_miss = (stat[epoch, attr, k] for k in range(4))
        first_acc = g1_hit / (g1_hit + g1_miss)
        second_acc = g2_hit / (g2_hit + g2_miss)
        overall_acc = (g1_hit + g2_hit) / (np.sum(stat[epoch, attr, :]))
        return first_acc, second_acc, overall_acc, abs(first_acc - second_acc)

    rows = [epoch_metrics(epoch) for epoch in range(length)]
    # turn the per-epoch rows into one list per statistic
    return tuple([row[col] for row in rows] for col in range(4))

def resolve_categori_performance(attr, stat, length):
    """Score every epoch of one attribute relative to the first epoch.

    The first entry of the statistics is treated as the raw (baseline)
    performance. Each epoch's score is its gain in total accuracy plus its
    reduction in accuracy difference, both measured against that baseline,
    so a larger score is better.

    Args:
        attr: index of the attribute on the second axis of ``stat``.
        stat: array of counts, shape (N, attributes, 4).
        length: number of leading epochs to evaluate.

    Returns:
        A list with ``length`` scores, one per epoch.
    """
    stats = resolve_categori_stat(attr, stat, length)
    overall_acc, acc_gap = stats[2], stats[3]
    # baseline values come from the first epoch
    base_acc = overall_acc[0]
    base_gap = acc_gap[0]
    # equal-weight combination of the accuracy gain and the fairness gain
    return [
        sum((acc - base_acc, base_gap - gap))
        for acc, gap in zip(overall_acc, acc_gap)
    ]

In [2]:
def show_faceattrmodel_stat(val_stat, train_stat=np.array([]), length=None, marker=".", markersize=4, save_name='default', root_folder='./eval/celeba'):
    """Plot accuracy and fairness curves per attribute and save them as a PNG.

    A 2x3 grid is drawn: one column per attribute, accuracy on the top row and
    the accuracy difference between the two groups on the bottom row. When
    ``train_stat`` is non-empty, training and validation curves are compared;
    otherwise the validation curves of group 1, group 2 and the total are
    shown.

    Args:
        val_stat: validation counts, shape (N, attributes, 4).
        train_stat: optional training counts with the same layout.
        length: number of leading epochs to plot; all epochs of ``val_stat``
            when falsy.
        marker: marker passed to every plotted line.
        markersize: marker size passed to every plotted line.
        save_name: file name (without extension) of the saved figure.
        root_folder: output folder, created if it does not exist.
    """
    # resolve the output file path
    out_dir = Path(root_folder)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{save_name}.png"

    # one x position per epoch
    if length:
        x_axis = np.linspace(0, length-1, length)
    else:
        x_axis = np.linspace(0, val_stat.shape[0]-1, val_stat.shape[0])
    n_epochs = x_axis.shape[0]
    line_style = {'marker': marker, 'markersize': markersize}

    fig, axs = plt.subplots(2, 3, figsize=(14, 8))
    for attr in range(3):  # one column per attribute
        val_g1, val_g2, val_total, val_gap = resolve_categori_stat(attr, val_stat, n_epochs)
        acc_ax = axs[0][attr]
        gap_ax = axs[1][attr]

        acc_ax.set_title(faceattrmodel_attributes[attr])
        acc_ax.set_xlabel('Epochs')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.set_ylim([0.0, 1.0])
        gap_ax.set_xlabel('Epochs')
        gap_ax.set_ylabel('Fairness, (lower the better)')
        gap_ax.set_ylim([0.0, 1.0])

        if not len(train_stat):
            # validation only: show both groups next to the total accuracy
            g1_line, = acc_ax.plot(x_axis, val_g1, **line_style)
            g2_line, = acc_ax.plot(x_axis, val_g2, **line_style)
            total_line, = acc_ax.plot(x_axis, val_total, **line_style)
            acc_ax.legend((g1_line, g2_line, total_line), ('Group 1', 'Group 2', 'Total'), loc='lower right')
            gap_line, = gap_ax.plot(x_axis, val_gap, **line_style)
            gap_ax.legend((gap_line,), ('Acc. differences',), loc='upper right')
            continue

        # training statistics available: compare training against validation
        _, _, train_total, train_gap = resolve_categori_stat(attr, train_stat, n_epochs)
        train_acc_line, = acc_ax.plot(x_axis, train_total, **line_style)
        val_acc_line, = acc_ax.plot(x_axis, val_total, **line_style)
        acc_ax.legend((train_acc_line, val_acc_line), ('Training Acc.', 'Validation Acc.',), loc='lower right')
        train_gap_line, = gap_ax.plot(x_axis, train_gap, **line_style)
        val_gap_line, = gap_ax.plot(x_axis, val_gap, **line_style)
        gap_ax.legend((train_gap_line, val_gap_line), ('Training Acc. differences', 'Validation Acc. differences',), loc='upper right')

    fig.tight_layout()
    fig.savefig(out_file,)
    plt.close(fig)

def print_faceattrmodel_stat_by_epoch(epoch, val_stat, train_stat=np.array([])):
    """Print the accuracy statistics of a single epoch for every attribute.

    For each of the three attributes the group 1, group 2 and total accuracy
    and the accuracy difference at ``epoch`` are printed for the validation
    statistics, preceded by the same report for the training statistics when
    ``train_stat`` is non-empty.

    Args:
        epoch: index of the epoch to report.
        val_stat: validation counts, shape (N, attributes, 4).
        train_stat: optional training counts with the same layout.
    """
    def report(heading, columns):
        # columns: (group 1 acc, group 2 acc, total acc, acc difference) lists
        g1_col, g2_col, total_col, gap_col = columns
        print(heading)
        print(f'    Group 1 Acc.: {g1_col[epoch]:.4f}')
        print(f'    Group 2 Acc.: {g2_col[epoch]:.4f}')
        print(f'    Total   Acc.: {total_col[epoch]:.4f}')
        print(f'        Acc. differences: {gap_col[epoch]:.4f}')

    for attr in range(3):  # for each attribute
        print(f'==== {faceattrmodel_attributes[attr]} ====')
        val_columns = resolve_categori_stat(attr, val_stat, val_stat.shape[0])
        if len(train_stat):
            train_columns = resolve_categori_stat(attr, train_stat, train_stat.shape[0])
            report('Training:', train_columns)
        report('Validation:', val_columns)
        print('')


Plot the accuracy-versus-fairness graph and find the best epoch

In [3]:
# there's only one fairness metric (the accuracy difference between the two groups)

# remember to sum up all the attributes in performance
def show_faceattrmodel_acc2fairness(val_stat, length=None, sens_type="race", marker=".", markersize=4, save_name='default', root_folder='./eval/fairface'):
    """Plot accuracy against fairness per attribute and report the best epoch.

    Every epoch is scored with ``resolve_categori_performance`` (accuracy gain
    plus reduction of the accuracy difference, both relative to the first
    epoch). The scores of the selected sensitive attribute(s) are summed, the
    best-scoring epoch after the first one is printed together with its
    statistics, and a 1x3 figure (one panel per attribute) of total accuracy
    versus ``1 - accuracy difference`` is saved with the best epoch
    highlighted.

    Args:
        val_stat: validation counts, shape (N, attributes, 4); the first epoch
            is treated as the raw (baseline) performance.
        length: number of leading epochs to use; all epochs of ``val_stat``
            when falsy.
        sens_type: ``'race'``, ``'gender'``, ``'age'`` or ``'all'`` (sum of the
            three attribute scores).
        marker: marker passed to the plotted curve.
        markersize: marker size passed to the plotted curve.
        save_name: file name (without extension) of the saved figure.
        root_folder: output folder, created if it does not exist.

    Raises:
        ValueError: if ``sens_type`` is not supported, or if fewer than two
            epochs are available (there is nothing to compare to the baseline).
    """
    # Validate the arguments first, with a real exception: an ``assert`` is
    # stripped under ``python -O`` and would let an unknown attribute through
    # with an all-zero score. Doing it up front also avoids creating the
    # output folder for a call that cannot succeed.
    attr2idx = {'race': 0, 'gender': 1, 'age': 2}
    if sens_type == 'all':
        scored_attrs = [0, 1, 2]  # Race, Gender, Age
    elif sens_type in ('race', 'gender', 'age'):
        scored_attrs = [attr2idx[sens_type]]
    else:
        raise ValueError(f'the sensitive attribute {sens_type} not supported')

    # parse the stat, face attribute model stats are in shape (N, attributes, 4);
    # only the number of points on this axis is used below
    x_axis = np.linspace(0, length-1, length) if length else np.linspace(0, val_stat.shape[0]-1, val_stat.shape[0])
    n_epochs = x_axis.shape[0]
    if n_epochs < 2:
        raise ValueError(
            f'at least 2 epochs are required (baseline + one candidate), got {n_epochs}'
        )

    # resolve the output file path
    folder = Path(root_folder)
    folder.mkdir(parents=True, exist_ok=True)
    path = folder / f"{save_name}.png"

    # sum the per-epoch scores of every selected attribute
    pred_performance = [0] * n_epochs
    for attr in scored_attrs:
        attr_scores = resolve_categori_performance(attr, val_stat, n_epochs)
        pred_performance = [sum(pair) for pair in zip(pred_performance, attr_scores)]

    # exclude the raw performance (index 0) when searching for the best epoch
    candidate_scores = pred_performance[1:]
    best_score = max(candidate_scores)
    best_predpq_epoch = candidate_scores.index(best_score) + 1
    print(f'Best epoch for prediction quality: {best_predpq_epoch:04d} with score {pred_performance[best_predpq_epoch]:.4f}')
    if best_score < 0:
        print('[Worse than raw performance]')
    print_faceattrmodel_stat_by_epoch(best_predpq_epoch, val_stat)

    fig, axs = plt.subplots(1, 3, figsize=(14, 8))
    for attr in range(3):  # for each attribute 3: Race, Gender, Age
        _, _, val_total_acc_list, val_acc_diff_list = resolve_categori_stat(attr, val_stat, n_epochs)
        ax = axs[attr]
        ax.set_title(faceattrmodel_attributes[attr])
        ax.set_xlabel('Fairness')
        ax.set_ylabel('Accuracy')
        ax.set_box_aspect(1)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.0])
        # prediction quality: x is 1 - accuracy difference, so right is fairer
        fairness = [1.0 - diff for diff in val_acc_diff_list]
        predp, = ax.plot(fairness, val_total_acc_list, marker=marker, markersize=markersize)
        b_predp = ax.scatter([1.0-val_acc_diff_list[best_predpq_epoch]], [val_total_acc_list[best_predpq_epoch]], color='#FF2301')
        ax.legend((predp, b_predp,), ('difference in prediction', 'best epoch',), loc='lower left')
    fig.tight_layout()
    fig.savefig(path,)
    plt.close(fig)

# Root directory of the saved noise statistics; the load_stats calls in the
# cells below (currently commented out) read from sub-folders of this path.
advatk_ckpt_root = Path('/tmp2/npfe/noise_stats')

In [4]:
# FairFace Direct ()
# val_stat = load_stats(f'FairFaceDirect_lr_1e2_val', root_folder=advatk_ckpt_root/'FairFaceDirect_lr_1e2')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceDirect_lr_1e2_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFaceDirect_lr_1e4_val', root_folder=advatk_ckpt_root/'FairFaceDirect_lr_1e4')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceDirect_lr_1e4_af', root_folder='./eval_resnet50/fairface_noise')

Best epoch for prediction quality: 0110 with score 0.0042
==== Race ====
Validation:
    Group 1 Acc.: 0.7359
    Group 2 Acc.: 0.6089
    Total   Acc.: 0.6331
        Acc. differences: 0.1270

==== Gender ====
Validation:
    Group 1 Acc.: 0.9029
    Group 2 Acc.: 0.9104
    Total   Acc.: 0.9090
        Acc. differences: 0.0075

==== Age ====
Validation:
    Group 1 Acc.: 0.5287
    Group 2 Acc.: 0.5560
    Total   Acc.: 0.5508
        Acc. differences: 0.0273

Best epoch for prediction quality: 0110 with score 0.0042
==== Race ====
Validation:
    Group 1 Acc.: 0.7359
    Group 2 Acc.: 0.6089
    Total   Acc.: 0.6331
        Acc. differences: 0.1270

==== Gender ====
Validation:
    Group 1 Acc.: 0.9029
    Group 2 Acc.: 0.9104
    Total   Acc.: 0.9090
        Acc. differences: 0.0075

==== Age ====
Validation:
    Group 1 Acc.: 0.5287
    Group 2 Acc.: 0.5560
    Total   Acc.: 0.5508
        Acc. differences: 0.0273



In [5]:
# FairFace CEMasking in second pass
# val_stat = load_stats(f'FairFaceCEmasking_lr_1e_1_val', root_folder=advatk_ckpt_root/'FairFaceCEmasking_lr_1e_1')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceCEmasking_lr_1e_1_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFaceCEmasking_lr_1e_2_val', root_folder=advatk_ckpt_root/'FairFaceCEmasking_lr_1e_2')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceCEmasking_lr_1e_2_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFaceCEmasking_lr_1e_3_val', root_folder=advatk_ckpt_root/'FairFaceCEmasking_lr_1e_3')
# show_faceattrmodel_stat(val_stat, save_name='FairFaceCEmasking_lr_1e_3', root_folder='./eval_resnet50/fairface_noise')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceCEmasking_lr_1e_3_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFaceCEmasking_lr_1e_4_val', root_folder=advatk_ckpt_root/'FairFaceCEmasking_lr_1e_4')
# show_faceattrmodel_stat(val_stat, save_name='FairFaceCEmasking_lr_1e_4', root_folder='./eval_resnet50/fairface_noise')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFaceCEmasking_lr_1e_4_af', root_folder='./eval_resnet50/fairface_noise')

In [6]:
# FairFace POptim
# val_stat = load_stats(f'FairFacePOptim_lr_1e_2_val', root_folder=advatk_ckpt_root/'FairFacePOptim_lr_1e_2')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFacePOptim_lr_1e_2_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFacePOptim_lr_1e_3_val', root_folder=advatk_ckpt_root/'FairFacePOptim_lr_1e_3')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFacePOptim_lr_1e_3_af', root_folder='./eval_resnet50/fairface_noise')

# val_stat = load_stats(f'FairFacePOptim_lr_1e_4_val', root_folder=advatk_ckpt_root/'FairFacePOptim_lr_1e_4')
# show_faceattrmodel_acc2fairness(val_stat, save_name='FairFacePOptim_lr_1e_4_af', root_folder='./eval_resnet50/fairface_noise')