In [None]:
import sys
sys.path.append('..')

In [None]:
from copy import deepcopy
from typing import Iterable

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch
from plotly.subplots import make_subplots
from tqdm import tqdm

from src.model import LR, NN
from src.data import FairnessDataset, Dataset, SyntheticDataset, GermanDataset, IncomeDataset
from src.effort import Optimal_Effort
from src.utils import model_performance, pdump, pareto_frontier
from src.methods import EIModel, fair_batch_proxy_bce, covariance_proxy, kde_proxy

In [None]:
def default_layout(width = 720, height = 540, color='#5d5d5d'):
    """Return the shared keyword arguments passed to ``fig.update_layout``.

    Args:
        width: Value stored under the ``width`` key (returned as given).
        height: Value stored under the ``height`` key (returned as given).
        color: Font colour applied to the figure text.

    Returns:
        A freshly built dict with ``width``, ``height``, ``font``, ``title``
        and ``legend`` entries, meant to be unpacked with ``**``.
    """
    return dict(
        width = width,
        height = height,
        font=dict(
            family='Iosevka',
            color=color
            ),
        # x=0.5 centres the title horizontally.
        title=dict(
            x=0.5,
            font=dict(size=17),
            ),
        legend=dict(
            font=dict(size=10),
            ),
        )

In [None]:
def append_res(d, seed, method, delta, lamb, train_alpha, alpha, pred_loss, fair_loss, accuracy, ei_disparity, rei_disparity, ei_model):
    """Append one experiment record to the column-oriented results dict ``d``.

    ``d`` maps column names to lists; each call appends exactly one value to
    every column written here. Two columns are derived rather than passed in:

    * ``loss``  = ``(1-lamb)*pred_loss + lamb*fair_loss``
    * ``error`` = ``1-accuracy``

    The append is all-or-nothing: the derived values are computed and every
    target column is looked up before any list is modified, so a failure
    cannot leave the columns with unequal lengths.

    Raises:
        KeyError: if ``d`` lacks one of the expected columns (nothing is
            appended in that case).
    """
    record = {
        'seed': seed,
        'method': method,
        'delta': delta,
        'lambda': lamb,
        'train_alpha': train_alpha,
        'alpha': alpha,
        # Combined objective: the two losses weighted by lambda.
        'loss': (1-lamb)*pred_loss + lamb*fair_loss,
        'pred_loss': pred_loss,
        'fair_loss': fair_loss,
        'accuracy': accuracy,
        'error': 1-accuracy,
        'ei_disparity': ei_disparity,
        'rei_disparity': rei_disparity,
        'ei_model': ei_model,
    }
    # Resolve every column first; a missing key raises here, before any mutation.
    targets = [(d[key], value) for key, value in record.items()]
    for column, value in targets:
        column.append(value)

In [None]:
def _fit_ei_model(base_model, train_dataset, params: dict, seed, lamb, alpha, stage: int, abstol):
    """Wrap a re-initialised copy of ``base_model`` in an ``EIModel`` and train it.

    ``stage`` picks the entry of ``params['learning_rate']`` and
    ``params['n_epochs']`` to use (0 for the alpha=0 model, 1 for the model
    trained with the given ``alpha``).
    """
    fitted = EIModel(
        model = deepcopy(base_model).xavier_init(seed=seed),
        proxy = params['proxy'],
        effort = params['effort'],
        tau = params['tau']
        )
    fitted.train(
        train_dataset,
        lamb=lamb,
        alpha=alpha,
        lr=params['learning_rate'][stage],
        n_epochs=params['n_epochs'][stage],
        batch_size=params['batch_size'],
        abstol=abstol,
        pga_n_epochs=params['pga_n_epochs']
        )
    return fitted


def _evaluate(ei_model, test_dataset, tau, alpha, **predict_kwargs):
    """Predict on ``test_dataset`` at ``alpha`` and score the predictions.

    Returns ``(pred_loss, fair_loss, accuracy, disparity)``. The group labels
    passed to ``model_performance`` are restricted to the samples whose
    prediction from this same call is below ``tau``.
    """
    Y_hat, Y_hat_max, pred_loss, fair_loss = ei_model.predict(test_dataset, alpha=alpha, **predict_kwargs)
    accuracy, disparity = model_performance(
        test_dataset.Y.detach().numpy(),
        test_dataset.Z[(Y_hat<tau)].detach().numpy(),
        Y_hat,
        Y_hat_max,
        tau,
        )
    return pred_loss, fair_loss, accuracy, disparity


def run_tradeoff(dataset: Dataset, params: dict, seeds: Iterable[int], test_results: dict):
    """Train and evaluate EI and REI models over a grid and record the results.

    For every seed and every ``lamb`` in ``params['lambda']`` one model is
    trained with ``alpha=0.`` ('EI'). Then, for every ``alpha`` in
    ``params['alpha']``, that model is evaluated at ``alpha`` and a second
    model is trained with that ``alpha`` ('REI') and evaluated at both
    ``alpha=0.`` and ``alpha``. One row per (method, alpha) is appended to
    ``test_results`` through ``append_res``.

    Args:
        dataset: Provides ``delta``, ``imp_feats`` and ``tensor(fold=...)``,
            which returns the train/validation/test tensors for a fold.
        params: Hyperparameters; keys read here are ``tau``, ``lambda``,
            ``alpha``, ``proxy``, ``effort``, ``learning_rate``, ``n_epochs``,
            ``batch_size``, ``abstol``, ``pga_abstol`` and ``pga_n_epochs``.
        seeds: Iterable of fold indices; each is also used as the weight
            initialisation seed.
        test_results: Column-oriented dict of lists, mutated in place.

    Returns:
        None.
    """
    tau = params['tau']
    delta = dataset.delta
    pga_n_epochs = params['pga_n_epochs']

    for seed in seeds:
        # The validation split is returned by dataset.tensor but not used here.
        train_tensors, _, test_tensors = dataset.tensor(fold=seed)
        train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
        test_dataset = FairnessDataset(*test_tensors, dataset.imp_feats)

        # Alternative base model: LR(num_features=train_dataset.X.shape[1])
        model = NN(num_features=train_dataset.X.shape[1], n_layers=[50, 100, 200])

        for lamb in params['lambda']:
            # np.round accepts numpy scalars and plain Python floats alike.
            lamb_key = np.round(lamb, 5)

            ei_model = _fit_ei_model(model, train_dataset, params, seed, lamb, alpha=0., stage=0, abstol=params['abstol'])
            # EI disparity of the alpha=0 model; reused unchanged for every alpha below.
            _, _, _, ei_disparity = _evaluate(ei_model, test_dataset, tau, alpha=0.)

            for alpha in params['alpha']:
                pred_loss, fair_loss, accuracy, rei_disparity = _evaluate(
                    ei_model, test_dataset, tau, alpha=alpha, pga_n_epochs=pga_n_epochs)
                append_res(test_results, seed, 'EI', delta, lamb_key, 0., alpha, pred_loss, fair_loss, accuracy, ei_disparity, rei_disparity, deepcopy(ei_model))

                # NOTE(review): REI training receives params['pga_abstol'] as its
                # abstol, whereas EI training uses params['abstol'] — confirm intended.
                rei_model = _fit_ei_model(model, train_dataset, params, seed, lamb, alpha=alpha, stage=1, abstol=params['pga_abstol'])

                # REI metrics use their own names so the EI model's ei_disparity
                # above is not overwritten between alpha iterations.
                _, _, _, ei_disparity_r = _evaluate(rei_model, test_dataset, tau, alpha=0.)
                pred_loss_r, fair_loss_r, accuracy_r, rei_disparity_r = _evaluate(
                    rei_model, test_dataset, tau, alpha=alpha, pga_n_epochs=pga_n_epochs)
                append_res(test_results, seed, 'REI', delta, lamb_key, alpha, alpha, pred_loss_r, fair_loss_r, accuracy_r, ei_disparity_r, rei_disparity_r, deepcopy(rei_model))
            print()

In [None]:
# Seed torch's global RNG before any model is created or trained.
torch.manual_seed(0)

# ----- Dataset -----
# Alternatives:
#   dataset = GermanDataset(seed=0, z_blind=True)
#   dataset = IncomeDataset(num_samples=20000, seed=0, z_blind=True)
dataset = SyntheticDataset(num_samples=20000, seed=0)

# ----- Hyperparameters -----
# Alternative lambda grids:
#   np.array(np.linspace(0., 0.75, 11).tolist() + [0.8, 0.85, 0.9, 0.95, 0.99, 0.9999])
#   np.linspace(0, 0.9, 10).round(2)
params = {
    'lambda': 1-np.geomspace(0.001, 0.999, 20),
    'alpha': [0.1, 0.5, 1., 2.],
    'tau': 0.5,
    # Two-element settings: index 0 is read when training the alpha=0 (EI)
    # model, index 1 when training the REI model (see run_tradeoff).
    'learning_rate': [2.5/500, 2.5/1000],
    'n_epochs': [400, 400],
    'batch_size': 1024,
    'proxy': covariance_proxy,
    'effort': Optimal_Effort(dataset.delta),
    'abstol': 1e-5,
    'pga_abstol': 1e-7,
    'pga_n_epochs': 10,
}

seeds = range(5)

# ----- Run Experiment -----
# One empty list per result column; run_tradeoff fills them via append_res.
result_columns = [
    'seed', 'method', 'delta', 'lambda', 'train_alpha', 'alpha', 'loss',
    'pred_loss', 'fair_loss', 'accuracy', 'error', 'ei_disparity',
    'rei_disparity', 'ei_model',
]
test_results = {column: [] for column in result_columns}
run_tradeoff(dataset, params, seeds, test_results)

In [None]:
# Persist the hyperparameters together with the raw per-run results.
# NOTE(review): the file name is prefixed 'lr_' although run_tradeoff currently
# builds an NN model — confirm the prefix is still the intended one.
results_path = f'../results/error_disparity_tradeoff/lr_{dataset.name}_{params["proxy"].__name__}.pkl'
results = dict(params=params, test_results=test_results)
pdump(results_path, results)

In [None]:
# Column subset of interest for tabular inspection (not used in this cell).
columns = [
    'method', 'seed', 'lambda', 'alpha',
    'loss', 'pred_loss', 'fair_loss', 'error',
    'ei_disparity', 'rei_disparity',
    'theta', 'theta_adv',
]

# One row per (seed, method, lambda, alpha), ordered by alpha then lambda, with
# the rounded parameters of each stored model and of its `model_adv` attached.
df_results = (
    pd.DataFrame(test_results)
    .sort_values(['alpha', 'lambda'])
    .assign(
        theta=lambda frame: frame['ei_model'].apply(
            lambda model: model.model.get_theta().numpy().round(2)),
        theta_adv=lambda frame: frame['ei_model'].apply(
            lambda model: model.model_adv.get_theta().numpy().round(2)),
    )
)

In [None]:
# s = 1
# temp = df_results[df_results['seed']==s].copy().reset_index(drop=True)
# for i in range(len(df_results)):
#     method = temp.iloc[i]['method']
#     lamb = temp.iloc[i]['lambda']
#     alpha = temp.iloc[i]['alpha']
#     fig = px.scatter(vars(temp['ei_model'].iloc[i].train_history), y='total_loss', color='fair_loss', hover_data=['pred_loss', 'theta', 'theta_adv' if method=='REI' else None])
#     fig.update_layout(
#         title_text = f'{dataset.name.capitalize()} Dataset | {method} | alpha: {alpha} | lambda: {lamb}',
#         # template='plotly_dark',
#         **default_layout(), 
#                       )
#     fig.show()

In [None]:
# Fairness loss against lambda: one facet per evaluation alpha, one animation
# frame per seed, coloured by method. Rows with alpha == 0 are excluded.
fig = px.scatter(
    df_results[(df_results['alpha']>0)].sort_values(['method', 'lambda', 'alpha']),
    x='lambda',
    y='fair_loss',
    # hover_data is passed as a list, the documented form, rather than a bare string.
    hover_data=['lambda'],
    color='method',
    facet_col='alpha',
    animation_frame='seed',
    # NOTE(review): 'method' is categorical, so this continuous scale likely
    # has no visible effect — confirm and drop if so.
    color_continuous_scale=['lightblue', 'blue', 'purple', 'orange', 'red'],
)

fig.update_layout(
    title_text = f'{dataset.name.capitalize()} Dataset | Gradient Descent', 
    # template='plotly_dark',
    **default_layout())
fig.show()


In [None]:
# Column used as the fairness axis of the error/fairness trade-off plots.
x_val = 'fair_loss'
for alpha in params['alpha']:
    # Gather each method's frontier rows and concatenate once, instead of
    # growing a frame from an empty pd.DataFrame() inside the loop.
    frontier_parts = []
    for method in ['EI', 'REI']:
        temp = (
            df_results[(df_results['method']==method) & (df_results['alpha']==alpha)]
            .copy()
            .reset_index(drop=True)
            .sort_values(by=[x_val, 'error'], ascending=[True, True])
        )
        # The result is used as a positional selector on the sorted frame;
        # presumably a boolean mask or index array — verify against src.utils.
        mask = pareto_frontier(temp[x_val], temp['error'])
        frontier_parts.append(temp.iloc[mask])
    df_pareto = pd.concat(frontier_parts)

    fig = px.line(
        df_pareto.sort_values(['method', 'alpha', x_val]),
        x=x_val,
        y='error',
        color='method',
        # hover_data is passed as a list, the documented form, rather than a bare string.
        hover_data=['lambda'],
        facet_row='alpha',
        markers=True,
    )
    fig.update_layout(
        title_text = f'{dataset.name.capitalize()} Dataset | Gradient Descent | {params["proxy"].__name__} [BCE]', 
        template='plotly_dark',
        # width=None leaves the width unset so the figure sizes itself to its
        # container, without passing a non-finite number to the layout.
        **default_layout(width=None, color='white')
        )
    fig.show()