In [1]:
import tqdm
import torch
import numpy as np
import pandas as pd
import plotly.express as px
from copy import deepcopy
from typing import Iterable

from model import LR
from data import FairnessDataset, Dataset, SyntheticDataset, GermanDataset, IncomeDataset
from ei_effort import Optimal_Effort, PGD_Effort
from ei_utils import model_performance, pareto_frontier
from ei_model import EIModel, fair_batch_proxy, covariance_proxy

ModuleNotFoundError: No module named 'ei_data'

In [59]:
def append_res(d, method, delta, lamb, alpha, total_loss, pred_loss, fair_loss, disparity, error, theta, theta_adv):
    """Append one result row to the column-oriented results dict ``d``.

    ``d`` maps column names to lists; each argument is appended to the list
    stored under its column. The dict is mutated in place and nothing is
    returned. A missing column raises ``KeyError``.
    """
    row = (
        ('method', method),
        ('delta', delta),
        ('lambda', lamb),
        ('alpha', alpha),
        ('loss', total_loss),
        ('pred_loss', pred_loss),
        ('fair_loss', fair_loss),
        ('disparity', disparity),
        ('error', error),
        ('theta', theta),
        ('theta_adv', theta_adv),
    )
    # Columns are filled in this fixed order, one value per column.
    for column, value in row:
        d[column].append(value)

In [60]:
def generate_grid(center, widths, n=15):
    """Return the unique points of a regular grid centred on ``center``.

    Each axis ``i`` is sampled with ``n`` evenly spaced values between
    ``center[i] - widths[i]`` and ``center[i] + widths[i]``. A scalar
    ``widths`` (int or float) is used for every axis.

    Returns:
        2-D array with one grid point per row; rows are de-duplicated and
        ordered as returned by ``np.unique(..., axis=0)``.
    """
    dims = len(center)
    if isinstance(widths, (int, float)):
        widths = [widths] * dims

    axes = []
    for i in range(dims):
        low = center[i] - widths[i]
        high = center[i] + widths[i]
        axes.append(np.linspace(low, high, n))

    mesh = np.meshgrid(*axes)
    # Flatten every coordinate array and place them side by side: one row per point.
    points = np.column_stack([coords.ravel() for coords in mesh])
    return np.unique(points, axis=0)

In [113]:
def run_gridsearch(dataset: Dataset, hyper_params: dict, results: dict, fold: int = 0) -> None:
    """Evaluate every theta on a fixed parameter grid and record the metrics in ``results``.

    For each candidate ``theta`` the function computes the BCE prediction loss
    on the training split, the fairness proxy on the training samples whose
    prediction is below ``tau`` (after the effort model has been applied), and
    then scans a grid of half-width ``alpha`` around ``theta`` for the
    ``theta_adv`` that maximises the fairness proxy.

    Args:
        dataset: Provides ``delta``, ``imp_feats`` and ``tensor(fold=..., z_blind=...)``.
        hyper_params: Must contain the keys 'tau', 'lambda', 'alpha', 'proxy',
            'optimal_effort' and 'z_blind'.
        results: Column-oriented dict of lists (the keys written by ``append_res``);
            mutated in place. Three rows are appended per candidate theta.
        fold: Forwarded to ``dataset.tensor``.

    Returns:
        None.
    """
    delta = dataset.delta
    tau = hyper_params['tau']
    lamb = hyper_params['lambda']
    alpha = hyper_params['alpha']
    proxy = hyper_params['proxy']
    # Choose the effort model; both are constructed from the dataset's delta.
    if hyper_params['optimal_effort']:
        effort = Optimal_Effort(delta)
    else:
        effort = PGD_Effort(delta)
    
    loss_fn = torch.nn.BCELoss(reduction='mean')
    # Candidate parameters: 15 values per axis in [-1, 1] for a 3-element theta (15**3 = 3375 points).
    # NOTE(review): the 3-element centre is hard-coded; presumably it must match what LR.set_theta expects — confirm.
    thetas = generate_grid(center=[0., 0., 0.,], widths=1., n=15)
    
    train_tensors, val_tensors, test_tensors = dataset.tensor(fold=fold, z_blind=hyper_params['z_blind'])
    train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
    # The validation and test datasets are built but not used below; only the training split is evaluated.
    val_dataset = FairnessDataset(*val_tensors, dataset.imp_feats)
    test_dataset = FairnessDataset(*test_tensors, dataset.imp_feats)
    
    model = LR(num_features=train_dataset.X.shape[1])
    model_adv = LR(num_features=train_dataset.X.shape[1])
    
    for i in tqdm.trange(len(thetas), desc=f"Grid Search [alpha={alpha:.3f}; lambda={lamb:.5f}; effort={effort.__class__.__name__}]", colour='#0091ff'):
        theta = thetas[i]
        # theta = np.array([1.,0., 0., -1])
        # set_theta is defined in model.LR (not shown here); presumably it loads theta into the model — confirm.
        model = model.set_theta(theta)
        Y_hat = model(train_dataset.X).reshape(-1)
        pred_loss =  loss_fn(Y_hat, train_dataset.Y).detach().item()
        
        # Keep only the samples whose prediction is below the threshold tau.
        X_e = train_dataset.X[(Y_hat<tau).reshape(-1),:]
        Z_e = train_dataset.Z[(Y_hat<tau)]
            
        # Effort-adjusted features for those samples (semantics defined in ei_effort, not shown here).
        X_hat_max = effort(model, train_dataset, X_e)
        Y_hat_max = model(X_hat_max).reshape(-1)
        fair_loss = proxy(Z_e, Y_hat_max).detach().item()
        # Convex combination of the prediction loss and the fairness proxy, weighted by lambda.
        total_loss = ((1-lamb) * pred_loss) + (lamb * fair_loss)
        
        # The same effort model applied to the full training set; used for the reported metrics.
        X_hat_max_full = effort(model, train_dataset, train_dataset.X)
        Y_hat_max = model(X_hat_max_full).reshape(-1)
        
        accuracy, ei_disparity = model_performance(train_dataset.Y.detach().numpy(), train_dataset.Z.detach().numpy(), Y_hat.clone().detach().numpy(), Y_hat_max.clone().detach().numpy(), tau)
        
        # Baseline row: alpha is recorded as 0 and theta_adv equals theta.
        append_res(results, 'EI', delta, lamb, 0., total_loss, pred_loss, fair_loss, ei_disparity, 1-accuracy, theta, theta)
        
        # Find theta adv that maximizes fair loss
        # The adversarial grid uses generate_grid's default n=15 per axis around the current theta.
        theta_advs = generate_grid(center=theta, widths=alpha)
        fair_losses = []
        # Same call as above, recomputed here; the effort is taken w.r.t. `model`, not `model_adv`.
        X_hat_max = effort(model, train_dataset, X_e)
        for theta_adv in theta_advs:
            model_adv = model_adv.set_theta(theta_adv)
            Y_hat_max = model_adv(X_hat_max).reshape(-1)
            fair_loss = proxy(Z_e, Y_hat_max)
            fair_losses.append(fair_loss.detach().item())
        
        # Worst case over the adversarial grid: the theta_adv with the largest fairness proxy.
        max_i = int(np.argmax(fair_losses))
        theta_adv = theta_advs[max_i]
        fair_loss = fair_losses[max_i]
        total_loss = ((1-lamb) * pred_loss) + (lamb * fair_loss)
        
        model_adv = model_adv.set_theta(theta_adv)
        Y_hat_max = model_adv(X_hat_max_full).reshape(-1)
        
        # Y_hat still comes from `model`; only the post-effort predictions use the adversarial model.
        accuracy, ei_disparity = model_performance(train_dataset.Y.detach().numpy(), train_dataset.Z.detach().numpy(), Y_hat.clone().detach().numpy(), Y_hat_max.clone().detach().numpy(), tau)
            
        # The same worst-case values are recorded under both method labels at this alpha.
        append_res(results, 'EI', delta, lamb, alpha, total_loss, pred_loss, fair_loss, ei_disparity, 1-accuracy, theta, theta_adv)
        append_res(results, 'REI', delta, lamb, alpha, total_loss, pred_loss, fair_loss, ei_disparity, 1-accuracy, theta, theta_adv)

In [114]:
# One seed shared by torch and the synthetic data generator.
SEED = 0
NUM_SAMPLES = 1000
torch.manual_seed(SEED)

# ----- Dataset -----
dataset = SyntheticDataset(num_samples=NUM_SAMPLES, seed=SEED)

In [115]:
# Cast the label columns to strings on a separate frame so plotly treats them as
# discrete categories. `astype` returns a new frame, so the frame returned by
# `dataset.data` is not written to (in case it is the dataset's own frame).
df = dataset.data.astype({'y': 'str', 'z': 'str'})
px.scatter(df, x='x1', y='x2', color='y', symbol='z', width=1000, height=700, title=f'{dataset.name.capitalize()} Dataset')

In [118]:
# ----- Hyperparameters -----
# Alternative kept for reference: a sweep over 'lambda' via 1-np.geomspace(0.001, 0.999, 20).
hyper_params = {
    'lambda': 1.,
    'alpha': 1.,
    'tau': 0.5,
    'proxy': covariance_proxy,
    'z_blind': True,
    'optimal_effort': True,
}

# ----- Run Experiment -----
# One empty list per result column; run_gridsearch fills them in place.
result_columns = ('method', 'delta', 'lambda', 'alpha', 'loss', 'pred_loss', 'fair_loss', 'disparity', 'error', 'theta', 'theta_adv')
results_xl = {column: [] for column in result_columns}
run_gridsearch(dataset, hyper_params, results_xl)

Grid Search [alpha=1.000; lambda=1.00000; effort=Optimal_Effort]:  64%|[38;2;0;145;255m██████▍   [0m| 2170/3375 [03:24<01:58, 10.18it/s]

In [117]:
# Collect the grid-search records and round the parameter vectors for display/comparison.
df = pd.DataFrame(results_xl)
df[['theta', 'theta_adv']] = df[['theta', 'theta_adv']].map(lambda x: x.round(4))

summary_columns = ['method', 'lambda', 'alpha', 'loss', 'pred_loss', 'fair_loss', 'disparity', 'error', 'theta', 'theta_adv']

# For every (method, alpha) pair keep the single row with the smallest loss.
# 'EI' rows with alpha > 0 are restricted to the theta chosen by the alpha == 0
# baseline, so they show how that fixed model behaves under perturbation.
best_rows = []
theta_ei = None  # reset on every run so a value from an earlier execution is never reused
for method in df['method'].unique():
    method_rows = df[df['method'] == method]
    for alpha in method_rows['alpha'].unique():
        candidates = method_rows[method_rows['alpha'] == alpha]
        if method == 'EI' and alpha > 0:
            if theta_ei is None:
                raise RuntimeError("EI rows with alpha > 0 were found before the alpha == 0 baseline row")
            # theta holds arrays, so rows are matched on their string representation.
            candidates = candidates[candidates['theta'].astype(str) == str(theta_ei)]
        best = candidates.iloc[[int(candidates['loss'].argmin())]][summary_columns]
        if method == 'EI' and alpha == 0.:
            # Take the baseline theta from the row just selected, not from whatever is first in the summary.
            theta_ei = best['theta'].iloc[0]
        best_rows.append(best)

# Concatenate once at the end instead of growing a frame inside the loop.
df_res = pd.concat(best_rows) if best_rows else pd.DataFrame()
df_res

Unnamed: 0,method,lambda,alpha,loss,pred_loss,fair_loss,disparity,error,theta,theta_adv
9942,EI,0.0,0.0,0.151356,0.151356,0.002895,0.035839,0.0,"[1.0, 0.4286, 1.0]","[1.0, 0.4286, 1.0]"
9943,EI,0.0,1.0,0.151356,0.151356,0.042533,0.624615,0.0,"[1.0, 0.4286, 1.0]","[0.2857, 1.4286, 0.1429]"
9944,REI,0.0,1.0,0.151356,0.151356,0.042533,0.624615,0.0,"[1.0, 0.4286, 1.0]","[0.2857, 1.4286, 0.1429]"


In [50]:
def generate_boundary(theta, label=-1):
    """Sample points along the line described by ``theta`` for plotting.

    ``theta[0]`` and ``theta[1]`` are the coefficients of ``x1`` and ``x2``. The
    last element of ``theta`` is used as the intercept, so both 3-element
    ``[a, b, c]`` vectors (as produced by the grid search) and 4-element
    ``[a, b, _, c]`` vectors are accepted.

    Args:
        theta: Indexable sequence with at least three numeric entries.
        label: Value written to both the ``z`` and ``y`` columns of every point,
            so the line can be distinguished from the data when plotted.

    Returns:
        DataFrame with columns ``x1``, ``x2``, ``z``, ``y``. It is empty when
        both coefficients are zero, because no line is defined in that case.

    Raises:
        IndexError: If ``theta`` has fewer than three entries.
    """
    if len(theta) < 3:
        raise IndexError("theta must contain at least two coefficients and an intercept")

    a = theta[0]
    b = theta[1]
    # The intercept is the last entry; a fixed index of 3 fails for 3-element thetas.
    c = theta[-1]
    columns = ['x1', 'x2', 'z', 'y']

    if a == 0 and b == 0:
        # Degenerate case: avoid dividing by zero and return no points.
        return pd.DataFrame(columns=columns)

    if a == 0:
        # Horizontal line: x2 is constant.
        x2 = -c/b
        points = [(x1, x2, label, label) for x1 in np.arange(-3, 3, 0.01).round(3)]
    elif b == 0:
        # Vertical line: x1 is constant.
        x1 = -c/a
        points = [(x1, x2, label, label) for x2 in np.arange(-3, 3, 0.01).round(3)]
    else:
        # NOTE(review): this branch solves a*x1 + b*x2 + c = 0.5 over x1 in [-5, 1), while the
        # axis-aligned branches solve ... = 0 over [-3, 3) — confirm which offset/range is intended.
        points = [(x1, (-a*x1 - c + 0.5) / b, label, label) for x1 in np.arange(-5, 1, 0.01).round(3)]
    return pd.DataFrame(points, columns=columns)

In [51]:
# Append one sampled boundary line per parameter vector of the last summary row:
# the adversarial theta is labelled 'rei', the plain theta 'ei'.
data = dataset.data
for column, tag in (('theta_adv', 'rei'), ('theta', 'ei')):
    boundary = generate_boundary(df_res[column].iloc[-1], tag)
    data = pd.concat((data, boundary))

In [52]:
# String-typed labels make plotly use discrete colours and symbols.
data = data.astype({'y': 'str', 'z': 'str'})
plot_title = f'{dataset.name.capitalize()} Dataset'
px.scatter(data, x='x1', y='x2', color='y', symbol='z', width=1000, height=700, title=plot_title)