In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `src.*` imports in the next cell resolve when the notebook is run
# from a sub-folder of the repository -- TODO confirm the intended working dir.
sys.path.append('..')

In [2]:
import torch
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from copy import deepcopy
from tqdm import tqdm
from plotly.subplots import make_subplots

from src.model import LR
from src.data import FairnessDataset, Dataset, SyntheticDataset, GermanDataset, IncomeDataset
from src.effort import Optimal_Effort
from src.utils import model_performance, pdump, pareto_frontier
from src.methods_dev import EIModel, EIModel_S, fair_batch_proxy, covariance_proxy

In [3]:
def default_layout(width=720, height=540, color='#5d5d5d'):
    """Return the layout keyword arguments shared by the figures in this notebook.

    Args:
        width: Value stored under the ``width`` layout key.
        height: Value stored under the ``height`` layout key.
        color: Font color stored under ``font.color``.

    Returns:
        A new dict on every call, meant to be unpacked into
        ``fig.update_layout(**default_layout(...))``.
    """
    font_spec = {'family': 'Iosevka', 'color': color}
    title_spec = {'x': 0.5, 'font': {'size': 17}}
    legend_spec = {'font': {'size': 10}}
    return {
        'width': width,
        'height': height,
        'font': font_spec,
        'title': title_spec,
        'legend': legend_spec,
    }

In [4]:
def append_res(d, seed, method, delta, lamb, train_alpha, alpha, pred_loss, fair_loss, accuracy, ei_disparity, rei_disparity, ei_model):
    """Append one experiment record to the column-oriented results dict ``d``.

    ``d`` must already hold a list under every key written below. Two columns
    are derived rather than passed in:

    * ``loss``  = ``(1 - lamb) * pred_loss + lamb * fair_loss``
    * ``error`` = ``1 - accuracy``

    All other arguments are stored unchanged under the column of the same name
    (``lamb`` is stored under ``'lambda'``).
    """
    record = (
        ('seed', seed),
        ('method', method),
        ('delta', delta),
        ('lambda', lamb),
        ('train_alpha', train_alpha),
        ('alpha', alpha),
        ('loss', (1-lamb)*pred_loss + lamb*fair_loss),
        ('pred_loss', pred_loss),
        ('fair_loss', fair_loss),
        ('accuracy', accuracy),
        ('error', 1-accuracy),
        ('ei_disparity', ei_disparity),
        ('rei_disparity', rei_disparity),
        ('ei_model', ei_model),
    )
    # Columns are written in the same order as they are listed above.
    for column, value in record:
        d[column].append(value)

In [5]:
def _train_ei_model(base_model, train_dataset, params: dict, lamb, alpha, n_epochs):
    """Wrap a re-initialised copy of ``base_model`` in an ``EIModel`` and train it.

    ``base_model`` itself is never modified: it is deep-copied and the copy is
    re-initialised with ``xavier_init()`` before training.
    """
    ei_model = EIModel(
        model=deepcopy(base_model).xavier_init(),
        proxy=params['proxy'],
        effort=params['effort'],
        tau=params['tau'],
    )
    ei_model.train(
        train_dataset,
        lamb=lamb,
        alpha=alpha,
        lr=params['learning_rate'],
        n_epochs=n_epochs,
        batch_size=params['batch_size'],
        abstol=params['pga_abstol'],
        pga_n_iters=params['pga_n_iters'],
    )
    return ei_model


def _evaluate(ei_model, test_dataset, tau, alpha, **predict_kwargs):
    """Predict on ``test_dataset`` at ``alpha`` and score the predictions.

    The group labels handed to ``model_performance`` are always masked with the
    predictions produced by this very call, so the mask and the predictions
    cannot come from two different ``predict`` runs.

    Returns:
        ``(pred_loss, fair_loss, accuracy, disparity)``.
    """
    Y_hat, Y_hat_max, pred_loss, fair_loss = ei_model.predict(test_dataset, alpha=alpha, **predict_kwargs)
    accuracy, disparity = model_performance(
        test_dataset.Y.detach().numpy(),
        test_dataset.Z[(Y_hat < tau)].detach().numpy(),
        Y_hat,
        Y_hat_max,
        tau,
    )
    return pred_loss, fair_loss, accuracy, disparity


def run_tradeoff(dataset: Dataset, params: dict, seeds, results: dict) -> None:
    """Train EI and REI models over a grid of settings and record test metrics.

    For every fold in ``seeds`` and every ``lamb`` in ``params['lambda']`` one
    'EI' model is trained with ``alpha=0`` for ``params['n_epochs'][0]`` epochs.
    Then, for every ``alpha`` in ``params['alpha']``:

    * the EI model is evaluated at that ``alpha`` and an ``'EI'`` row is appended;
    * a fresh 'REI' model is trained with that ``alpha`` for
      ``params['n_epochs'][1]`` epochs, evaluated, and an ``'REI'`` row appended.

    In each row ``ei_disparity`` is the disparity of that row's own model
    evaluated at ``alpha=0`` and ``rei_disparity`` is its disparity at ``alpha``.

    Args:
        dataset: Provides ``tensor(fold=...)``, ``imp_feats`` and ``delta``.
        params: Hyperparameters; reads the keys ``tau``, ``lambda``, ``alpha``,
            ``proxy``, ``effort``, ``learning_rate``, ``n_epochs``,
            ``batch_size``, ``pga_abstol`` and ``pga_n_iters``.
        seeds: Iterable of fold indices (e.g. ``range(1)``), not a single int.
        results: Column-oriented dict of lists, extended in place via
            ``append_res``.
    """
    tau = params['tau']
    delta = dataset.delta

    for seed in seeds:
        # The validation split is returned by the dataset but not used here.
        train_tensors, _val_tensors, test_tensors = dataset.tensor(fold=seed)
        train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
        test_dataset = FairnessDataset(*test_tensors, dataset.imp_feats)

        model = LR(num_features=train_dataset.X.shape[1])

        for lamb in params['lambda']:
            # NOTE(review): the result of this call was never used. It is kept
            # because xavier_init() presumably draws from the global torch RNG,
            # so dropping it would change the initial weights (and therefore the
            # numbers) of every model trained below -- confirm, then remove.
            deepcopy(model).xavier_init()

            ei_model = _train_ei_model(model, train_dataset, params, lamb, 0., params['n_epochs'][0])

            # EI disparity of the EI model. It gets its own name so that the
            # REI evaluation further down cannot overwrite it between alphas.
            _, _, _, ei_model_ei_disparity = _evaluate(
                ei_model, test_dataset, tau, 0., abstol=params['pga_abstol'])

            for alpha in params['alpha']:
                pred_loss, fair_loss, accuracy, ei_model_rei_disparity = _evaluate(
                    ei_model, test_dataset, tau, alpha,
                    abstol=params['pga_abstol'], pga_n_iters=params['pga_n_iters'])
                append_res(results, seed, 'EI', delta, lamb, 0., alpha, pred_loss, fair_loss, accuracy,
                           ei_model_ei_disparity, ei_model_rei_disparity, deepcopy(ei_model))

                rei_model = _train_ei_model(model, train_dataset, params, lamb, alpha, params['n_epochs'][1])

                _, _, _, rei_model_ei_disparity = _evaluate(
                    rei_model, test_dataset, tau, 0., abstol=params['pga_abstol'])
                pred_loss_r, fair_loss_r, accuracy_r, rei_model_rei_disparity = _evaluate(
                    rei_model, test_dataset, tau, alpha,
                    abstol=params['pga_abstol'], pga_n_iters=params['pga_n_iters'])
                append_res(results, seed, 'REI', delta, lamb, alpha, alpha, pred_loss_r, fair_loss_r, accuracy_r,
                           rei_model_ei_disparity, rei_model_rei_disparity, deepcopy(rei_model))
            # Blank line between the progress output of consecutive lambdas.
            print()

In [14]:
torch.manual_seed(0)

# ----- Dataset -----
# Alternatives that have been used in place of the synthetic data:
#   GermanDataset(seed=0, z_blind=True)
#   IncomeDataset(num_samples=20000, seed=0, z_blind=True)
dataset = SyntheticDataset(num_samples=20000, seed=0)

# ----- Hyperparameters -----
# Other lambda grids that have been tried:
#   1-np.geomspace(0.001, 0.999, 20)
#   [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]
params = {
    'lambda': [0.9, 0.999],
    'alpha': [0.1],
    'tau': 0.5,
    'learning_rate': 0.01,
    # run_tradeoff uses the first entry for EI training and the second for REI training.
    'n_epochs': [100, 250],
    'batch_size': 1000,
    'proxy': fair_batch_proxy,
    'pga_abstol': 1e-7,
    'pga_n_iters': 20,
    'effort': Optimal_Effort(dataset.delta),
}

seeds = range(1)

# ----- Run Experiment -----
# One empty list per result column; run_tradeoff fills them in place.
results = {
    column: []
    for column in (
        'seed', 'method', 'delta', 'lambda', 'train_alpha', 'alpha', 'loss',
        'pred_loss', 'fair_loss', 'accuracy', 'error', 'ei_disparity',
        'rei_disparity', 'ei_model',
    )
}
run_tradeoff(dataset, params, seeds, results)

Training [alpha=0.000; lambda=0.90000; delta=0.500]: 100%|[38;2;0;145;255m██████████[0m| 100/100 [00:04<00:00, 21.68epochs/s]
Training [alpha=0.100; lambda=0.90000; delta=0.500]:  70%|[38;2;0;145;255m███████   [0m| 176/250 [00:11<00:06, 11.58epochs/s]

In [7]:
# pdump(f'../results/gradientdescent/lr_{dataset.name}_{params["proxy"].__name__}_mse_alpha{params["alpha"][0]}_eistart.pkl', results) 

In [None]:
# results['dataset'] = ['Income', 'Income', 'Synthetic', 'Synthetic']

In [9]:
# Columns shown in the summary tables, and the subset whose minimum is highlighted.
columns = ['method', 'seed', 'lambda', 'alpha', 'loss', 'pred_loss', 'fair_loss', 'error', 'ei_disparity', 'rei_disparity', 'theta', 'theta_adv']
highlight_columns = ['loss', 'fair_loss', 'error', 'ei_disparity', 'rei_disparity']

df_results = pd.DataFrame(results)

# Rounded parameter vectors of the stored model and of its `model_adv` attribute.
df_results['theta'] = df_results['ei_model'].apply(
    lambda ei: ei.model.get_theta().numpy().round(2)
)
df_results['theta_adv'] = df_results['ei_model'].apply(
    lambda ei: ei.model_adv.get_theta().numpy().round(2)
)

# Show the rows two at a time; run_tradeoff appends an EI row immediately
# followed by the matching REI row, so each table compares one such pair.
for start in range(0, len(df_results), 2):
    print(f'{dataset.name.capitalize()} Dataset | Gradient Descent | {params["proxy"].__name__} | alpha: {params["alpha"][0]}')
    pair = df_results[columns].iloc[start:start+2]
    display(pair.style.highlight_min(subset=highlight_columns, color='#D35400'))

Income Dataset | Gradient Descent | fair_batch_proxy | alpha: 0.1


Unnamed: 0,method,seed,lambda,alpha,loss,pred_loss,fair_loss,error,ei_disparity,rei_disparity
0,EI,0,0.9,0.1,1.832209,12.612544,0.634394,0.59,0.0,0.0
1,REI,0,0.9,0.1,0.074553,0.45336,0.032463,0.19325,0.011166,0.011166


Income Dataset | Gradient Descent | fair_batch_proxy | alpha: 0.1


Unnamed: 0,method,seed,lambda,alpha,loss,pred_loss,fair_loss,error,ei_disparity,rei_disparity
2,EI,0,0.999,0.1,0.027093,5.278867,0.021836,0.5895,0.0,0.0
3,REI,0,0.999,0.1,0.033285,0.449933,0.032868,0.193,0.007795,0.007795


In [None]:
# One scatter plot of the recorded training history per result row.
for i in range(len(df_results)):
    row = df_results.iloc[i]
    method = row['method']
    lamb = row['lambda']
    alpha = row['alpha']

    # Only REI rows get the `theta_advs` hover column. The list is built
    # conditionally so that no `None` placeholder is passed as a column
    # reference in `hover_data` for the other rows.
    hover_columns = ['pred_loss', 'thetas']
    if method == 'REI':
        hover_columns.append('theta_advs')

    fig = px.scatter(
        vars(row['ei_model'].train_history),
        y='total_loss',
        color='fair_loss',
        hover_data=hover_columns,
    )
    fig.update_layout(
        title_text=f'{dataset.name.capitalize()} Dataset | {method} | alpha: {alpha} | lambda: {lamb}',
        **default_layout(width=700, height=400),
    )
    fig.show()

In [10]:
# Fairness loss against lambda, one facet row per seed, restricted to rows
# evaluated with a strictly positive alpha.
positive_alpha_rows = df_results[df_results['alpha'] > 0]
plot_frame = positive_alpha_rows.sort_values(['method', 'lambda'])

fig = px.scatter(
    plot_frame,
    x='lambda',
    y='fair_loss',
    hover_data='lambda',
    color='method',
    facet_row='seed',
    color_continuous_scale=['lightblue', 'blue', 'purple', 'orange', 'red'],
)

figure_title = f'{dataset.name.capitalize()} Dataset | Gradient Descent | alpha: {params["alpha"][0]}'
fig.update_layout(title_text=figure_title, **default_layout())
fig.show()


In [None]:
# Error against fairness loss for every recorded run, colored by method.
rows_by_fair_loss = df_results.sort_values('fair_loss')
fig = px.scatter(
    rows_by_fair_loss,
    x='fair_loss',
    y='error',
    color='method',
    hover_data='lambda',
)
fig.update_layout(**default_layout())
fig.show()

In [None]:
# Keep, per method, only the rows selected by `pareto_frontier` on the
# (x_val, error) pairs. `x_val` is reused by the plotting cell below.
x_val = 'fair_loss'

# Collect the per-method frames and concatenate once, instead of repeatedly
# concatenating onto an initially empty DataFrame.
frontier_frames = []
for method in ['EI', 'REI']:
    temp = (
        df_results[df_results['method'] == method]
        .copy()
        .reset_index(drop=True)
        .sort_values(by=[x_val, 'error'], ascending=[True, True])
    )
    mask = pareto_frontier(temp[x_val], temp['error'])
    # `mask` is applied positionally to the sorted frame.
    frontier_frames.append(temp.iloc[mask])
df_pareto = pd.concat(frontier_frames)

In [None]:

# Error against `x_val` for the rows kept in `df_pareto`, colored by method.
pareto_sorted = df_pareto.sort_values(x_val)
fig = px.scatter(pareto_sorted, x=x_val, y='error', color='method', hover_data='lambda')

pareto_title = f'{dataset.name.capitalize()} Dataset | Gradient Descent | {params["proxy"].__name__} | alpha: {params["alpha"][0]}'
fig.update_layout(title_text=pareto_title, **default_layout())
fig.show()

In [None]:
# For every REI row, plot the recorded `theta_advs` trajectory (first two
# components) colored by iteration number.
for i in range(len(df_results)):
    row = df_results.iloc[i]
    method = row['method']
    if method != 'REI':
        continue

    lamb = row['lambda']
    alpha = row['alpha']
    train_history = vars(row['ei_model'].train_history)

    df_th = pd.DataFrame()
    # Indices 0, 1 and 2 of each recorded vector are labelled x, y and bias.
    # NOTE(review): this assumes three-element parameter vectors -- confirm
    # before using it with a dataset that has more than two features.
    for prefix, history_key in (('theta', 'thetas'), ('theta_adv', 'theta_advs')):
        for position, component in enumerate(('x', 'y', 'bias')):
            df_th[f'{prefix} {component}'] = [vec[position].item() for vec in train_history[history_key]]
    df_th['total_loss'] = train_history['total_loss']
    df_th['fair_loss'] = train_history['fair_loss']
    # Number the iterations from the recorded history length; a fixed
    # range(1, 101) fails whenever the history does not hold exactly 100 entries.
    df_th['iteration'] = range(1, len(df_th) + 1)

    fig = px.scatter(df_th, x='theta_adv x', y='theta_adv y', color='iteration')
    fig.update_layout(
        # Take the dataset name from `dataset`, as the other figures do,
        # rather than hard-coding one dataset in the title.
        title_text=f'{dataset.name.capitalize()} Dataset | {method} | alpha: {alpha} | lambda: {lamb}',
        **default_layout(),
    )
    fig.show()