In [1]:
import sys
sys.path.append('..')

In [2]:
import tqdm
import torch
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from copy import deepcopy
from typing import List

from src.model import LR
from src.utils import model_performance, pdump, pload
from src.effort import Optimal_Effort
from src.data import FairnessDataset, Dataset, SyntheticDataset, GermanDataset, IncomeDataset

In [3]:
default_layout = lambda width = 720, height = 540, color='#5d5d5d': dict(
    width = width,
    height = height,
    font=dict(
        family='Iosevka', 
        color=color
        ),
    title=dict(
        x=0.5,
        font=dict(size=17), 
        ),
    legend=dict(
        font=dict(size=10),
        )
    )

In [4]:
def generate_boundary(theta):
    a = theta[0]
    b = theta[1]
    c = theta[2]
    
    points = []
    if a==0 and b==0:
        for x1 in np.arange(-2, 2, 0.001).round(3):
            x2 = (-(a)*x1 - c) / (b)
            points.append((x1, x2))
    elif a == 0:
        x2 = -c/b
        for x1 in np.arange(-2,2, 0.01).round(3):
            points.append((x1, x2))
    elif b == 0:
        x1 = -c/a
        for x2 in np.arange(-2,2, 0.01).round(3):
            points.append((x1, x2))
    else:
        for x1 in np.arange(-2, 2, 0.001).round(3):
            x2 = (-(a)*x1 - c) / (b)
            points.append((x1, x2))
    return pd.DataFrame(points, columns=['x1', 'x2'])

In [5]:
def generate_Y_pred(model, data, tau=0.5):
    Y_hat_max = model(torch.from_numpy(data).float())
    Y_pred_max = (Y_hat_max >= tau) * 1
    return Y_pred_max

In [6]:
def generate_grid(center, widths, n=15):
    if isinstance(widths, int) or isinstance(widths, float):
        widths = [widths for _ in range(len(center))]
    axes = [np.linspace(center[i]-widths[i], center[i]+widths[i], n) for i in range(len(center))]
    grids = np.meshgrid(*axes)
    points = np.stack([grid.reshape(-1) for grid in grids]).T
    return np.unique(points, axis=0)

In [7]:
def fair_batch_proxy(Z: torch.tensor, Y_hat_max: torch.tensor, bce: bool = True):
    proxy_value = torch.tensor(0.)
    if bce:
        loss_fn = torch.nn.BCELoss(reduction='mean')
    else:
        loss_fn = torch.nn.MSELoss(reduction='mean')

    loss_mean = loss_fn(Y_hat_max, torch.ones(len(Y_hat_max)))

    for z in [0,1]:
        z = int(z)
        group_idx = (Z==z)
        if group_idx.sum() == 0:
            loss_z  = 0
        else:
            loss_z = loss_fn(Y_hat_max[group_idx], torch.ones(group_idx.sum()))
        proxy_value += torch.abs(loss_z - loss_mean)
    return proxy_value

In [8]:
def append_res(d, method, delta, lamb, alpha, total_loss, pred_loss, fair_loss, disparity, error, theta, theta_adv):
    d['method'].append(method)
    d['delta'].append(delta)
    d['lambda'].append(lamb)
    d['alpha'].append(alpha)
    d['loss'].append(total_loss)
    d['pred_loss'].append(pred_loss)
    d['fair_loss'].append(fair_loss)
    d['disparity'].append(disparity)
    d['error'].append(error)
    d['model'].append(theta)
    d['model_adv'].append(theta_adv)

In [15]:
def run_gridsearch(dataset: Dataset, params: dict, fold: int = 0):
    delta = dataset.delta
    tau = params['tau']
    proxy = params['proxy']
    effort = params['effort']
    
    for lamb in params['lambda']:
        for alpha in params['alpha']:
            run_results = {'method': [], 'delta': [], 'lambda': [], 'alpha': [], 'loss': [], 'pred_loss': [ ], 'fair_loss': [], 'disparity': [], 'error': [], 'model': [], 'model_adv': []}
            
            train_tensors, val_tensors, test_tensors = dataset.tensor(fold=fold)
            train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
            
            model = LR(num_features=train_dataset.X.shape[1])
            model_adv = LR(num_features=train_dataset.X.shape[1])
            
            loss_fn = torch.nn.BCELoss(reduction='mean')
            thetas = generate_grid(center=[0., 0., 0.,], widths=1.5, n=15)
            
            for theta in tqdm.tqdm(thetas, desc=f"Grid Search [alpha={alpha:.2f}; lambda={lamb:.2f}; effort={effort.name}]", colour='#0091ff'):
                model = model.set_theta(theta)
                
                Y_hat = model(train_dataset.X).reshape(-1)
                pred_loss =  loss_fn(Y_hat, train_dataset.Y).detach().item()
                
                if torch.sum(Y_hat<tau) > 0:
                    X_e = train_dataset.X[(Y_hat<tau)]
                    Z_e = train_dataset.Z[(Y_hat<tau)]
                        
                    X_hat_max = effort(model, train_dataset, X_e)
                    Y_hat_max = model(X_hat_max).reshape(-1)
                    fair_loss = proxy(Z_e, Y_hat_max, False).detach().item()
                    
                    total_loss = ((1-lamb) * pred_loss) + (lamb * fair_loss)
                    
                    accuracy, ei_disparity = model_performance(train_dataset.Y.detach().numpy(), Z_e.numpy(), Y_hat.detach().numpy(), Y_hat_max.detach().numpy(), tau)
                    append_res(run_results, 'EI', delta, lamb, 0., total_loss, pred_loss, fair_loss, ei_disparity, 1-accuracy, deepcopy(model), deepcopy(model))
                    
                    fair_losses = []
                    theta_advs = generate_grid(center=theta, widths=alpha, n=15)
                    for theta_adv in theta_advs:
                        model_adv = model_adv.set_theta(theta_adv)
                        Y_hat_max = model_adv(X_hat_max).reshape(-1)
                        fair_loss = proxy(Z_e, Y_hat_max, False)
                        fair_losses.append(fair_loss.detach().item())
                    
                    max_i = int(np.argmax(fair_losses))
                    theta_adv = theta_advs[max_i]
                    fair_loss = fair_losses[max_i]
                    total_loss = ((1-lamb) * pred_loss) + (lamb * fair_loss)
                    
                    model_adv = model_adv.set_theta(theta_adv)
                    Y_hat_max = model_adv(X_hat_max).reshape(-1)
                    
                    accuracy, rei_disparity = model_performance(train_dataset.Y.detach().numpy(), Z_e.numpy(), Y_hat.detach().numpy(), Y_hat_max.detach().numpy(), tau)
                    append_res(run_results, 'EI', delta, lamb, alpha, total_loss, pred_loss, fair_loss, rei_disparity, 1-accuracy, deepcopy(model), deepcopy(model_adv))
                    append_res(run_results, 'REI', delta, lamb, alpha, total_loss, pred_loss, fair_loss, rei_disparity, 1-accuracy, deepcopy(model), deepcopy(model_adv))
            
            pdump(f'../results/gridsearch/lr_synthetic_fb_alpha{alpha}_lambda{lamb.round(5)}_d1_sqloss.pkl', run_results) 
            

In [16]:
torch.manual_seed(0)

# ----- Dataset -----
dataset = SyntheticDataset(num_samples=1000, seed=0)

train_tensors, val_tensors, test_tensors = dataset.tensor(fold=0)
X_train, Y_train, Z_train = train_tensors

symbols = ['x', 'circle']
colors = ['#636EFA', '#EF553B']
fig = go.Figure()


for z in [0, 1]:
    for y in [0, 1]:
        fig.add_trace(go.Scatter(
            x=X_train[(Z_train==z) * (Y_train==y), 0],
            y=X_train[(Z_train==z) * (Y_train==y), 1],
            marker=dict(color=colors[y], symbol=symbols[z], size=5),
            mode='markers',
            name=f'{y}, {z}'
        ))
   
fig.update_layout(
    title_text=f'{dataset.name.capitalize()} Dataset',
    legend_title='y, z',
    xaxis_title='x1',
    yaxis_title='x2',
    **default_layout()
)
fig.show()

In [18]:
# ----- Hyperparameters -----
params = {}
params['lambda'] = 1-np.geomspace(0.001, 0.999, 20)
# params['lambda'] = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
params['alpha'] = [0.1]
params['tau'] = 0.5
params['proxy'] = fair_batch_proxy
params['z_blind'] = True
params['effort'] = Optimal_Effort(dataset.delta)

# ----- Run Experiment -----

run_gridsearch(dataset, params)

Grid Search [alpha=0.10; lambda=1.00; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:20<00:00,  3.67it/s]
Grid Search [alpha=0.10; lambda=1.00; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:24<00:00,  3.65it/s]
Grid Search [alpha=0.10; lambda=1.00; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:23<00:00,  3.65it/s]
Grid Search [alpha=0.10; lambda=1.00; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:24<00:00,  3.65it/s]
Grid Search [alpha=0.10; lambda=1.00; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:21<00:00,  3.66it/s]
Grid Search [alpha=0.10; lambda=0.99; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:20<00:00,  3.67it/s]
Grid Search [alpha=0.10; lambda=0.99; effort=Optimal-Effort]: 100%|[38;2;0;145;255m██████████[0m| 3375/3375 [15:20<00:00,  3.67it/s]
Grid Search [alpha=0.10; lambda=0.99; effort=Optimal-Ef

In [None]:
results = pload(f'../results/gridsearch/lr_synthetic_fb_alpha{params["alpha"][0]}_lambda{params["lambda"][0]}_d1.pkl', 'rb')

In [None]:
df = pd.DataFrame(results)
df[['theta', 'theta_adv']] = df[['model', 'model_adv']].map(lambda model: model.get_theta().numpy().round(4))

df_results = pd.DataFrame()
for lamb in params['lambda'][-1:]:
    for method in ['EI', 'REI']:
        for alpha in df['alpha'].sort_values().unique():
            if method =='EI' and alpha==0.:
                temp = df[(df['lambda']==lamb) & (df['method']=='EI') & (df['alpha']==0)]
                df_results = pd.concat((df_results, temp.iloc[[int(temp['loss'].argmin())]]))
                theta_ei = temp.iloc[[int(temp['loss'].argmin())]]['theta'].iloc[0]
            elif method == 'EI' and alpha > 0:
                temp = df[(df['lambda']==lamb) & (df['method']=='EI') & (df['alpha']==alpha) & (df['theta'].astype(str)==str(theta_ei))]
                df_results = pd.concat((df_results, temp.iloc[[int(temp['loss'].argmin())]]))
            elif method == 'REI' and alpha > 0:
                temp = df[(df['lambda']==lamb) & (df['method']=='REI') & (df['alpha']==alpha) & (df['fair_loss']!=0)]
                df_results = pd.concat((df_results, temp.iloc[[int(temp['loss'].argmin())]]))
            else:
                continue

columns = ['method', 'lambda', 'alpha', 'loss', 'pred_loss', 'fair_loss', 'error', 'disparity', 'theta', 'theta_adv']
print(f'[{dataset.name.capitalize()} Dataset][{params["proxy"].__name__}] Grid Search')
for i in range(0, len(df_results), 3):
    display(df_results[columns].iloc[i:i+3].style.highlight_min(subset=['fair_loss', 'error', 'disparity'], color='#D35400'))

In [None]:
ei_theta_boundary = generate_boundary(df_results['theta'].iloc[1])
ei_theta_adv_boundary = generate_boundary(df_results['theta_adv'].iloc[1])

rei_theta = generate_boundary(df_results['theta'].iloc[-1])
rei_theta_adv = generate_boundary(df_results['theta_adv'].iloc[-1])

In [None]:
train_tensors, val_tensors, test_tensors = dataset.tensor(fold=0)
train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
X_train, Y_train, Z_train = train_tensors

data = pd.DataFrame({'x1': X_train[:, 0], 'x2': X_train[:, 1], 'y': Y_train, 'z': Z_train})
m = 'ei'
i = 1 if m=='ei' else -1

data['y_pred'] = generate_Y_pred(df_results['model'].iloc[i], train_dataset.X.numpy())
X_hat_max = Optimal_Effort(dataset.delta)(df_results['model'].iloc[i], train_dataset, train_dataset.X)
data['y_pred_max'] = generate_Y_pred(df_results['model'].iloc[i], X_hat_max.detach().numpy())
data['y_pred_max_adv'] = generate_Y_pred(df_results['model_adv'].iloc[i], X_hat_max.detach().numpy())

colors = {
    0: {0: '#636EFA', # plotly blue
        1: '#0511a1' # dark blue
    }, 
    1: {
        0: '#EF553B', # plotly red
        1: '#EF553B', # plotly red    
        # 1: '#af260e', # dark red    
    }
}
symbols = {
    0: {
        0: {
            0: 'circle',
            1: 'circle-open'
            },
        1: {
            0: 'circle',
            1: 'circle'
        }
    },
    1: {
        0: {
            0: 'x',
            1: 'x-open'
            },
        1: {
            0: 'x',
            1: 'x'
        }
    }
}

fig = go.Figure()
for z in data['z'].sort_values().unique():
    for y_pred in data['y_pred'].sort_values().unique():
        for y_pred_max in data['y_pred_max'].unique():
            for y_pred_max_adv in data['y_pred_max_adv'].unique():
                z = int(z)
                y_pred = int(y_pred)
                y_pred_max = int(y_pred_max)
                y_pred_max_adv = int(y_pred_max_adv)
                df_im = data[(data['z']==z) & (data['y_pred']==y_pred) & (data['y_pred_max']==y_pred_max) & (data['y_pred_max_adv']==y_pred_max_adv)]
                fig.add_trace(go.Scatter(
                    x=df_im['x1'],
                    y=df_im['x2'],
                    marker = dict(color=colors[y_pred][y_pred_max], symbol=symbols[z][y_pred][y_pred_max_adv], size=4.5),
                    mode='markers',
                    name=f'{y_pred_max}, {y_pred}, {z}',
                ))
            

if m == 'ei':
    fig.add_trace(go.Scatter(
        x=ei_theta_boundary['x1'],
        y=ei_theta_boundary['x2'],
        marker=dict(color='lightgreen', size=3),
        mode='lines',
        name='ei_theta'
    ))

    fig.add_trace(go.Scatter(
        x=ei_theta_adv_boundary['x1'],
        y=ei_theta_adv_boundary['x2'],
        marker=dict(color='pink', size=3),
        mode='lines',
        name='ei_theta_adv'
    ))
else:
    fig.add_trace(go.Scatter(
        x=rei_theta['x1'],
        y=rei_theta['x2'],
        marker=dict(color='darkgreen', size=3),
        mode='lines',
        name='rei_theta'
    ))

    fig.add_trace(go.Scatter(
        x=rei_theta_adv['x1'],
        y=rei_theta_adv['x2'],
        marker=dict(color='hotpink', size=3),
        mode='lines',
        name='rei_theta_adv'
    ))


fig.update_layout(
    title_text = f'{dataset.name.capitalize()} Dataset | alpha: {params["alpha"][0]} | lambda: {params["lambda"][0]}', 
    legend_title = 'y_pred_max, y_pred, z',
    xaxis_title='x1',
    yaxis_title='x2',
    **default_layout(width=np.inf)
)

fig.show()

In [None]:
train_tensors, val_tensors, test_tensors = dataset.tensor(fold=0)
train_dataset = FairnessDataset(*train_tensors, dataset.imp_feats)
X_train, Y_train, Z_train = train_tensors

In [None]:
tau = 0.5

losses = []

for i in tqdm.trange(len(df)):    
    model = deepcopy(df.iloc[i]['model'])
    model_adv = deepcopy(df.iloc[i]['model_adv'])

    Y_hat = model_adv(X_train).reshape(-1)
    X_e = X_train[(Y_hat<tau)]
    Z_e = Z_train[(Y_hat<tau)]

    X_hat_max = Optimal_Effort(dataset.delta)(model, train_dataset, X_e)
    Y_hat_max = model_adv(X_hat_max).reshape(-1)
    fair_loss = fair_batch_proxy(Z_e, Y_hat_max, False)
    losses.append(fair_loss.item())
    
df['fair_loss (sq)'] = losses

In [None]:
losses = {'sq_loss': [], 'bce_loss': [], 'y_hat_max': [], 'z': []}

for z in [0, 1]:
    total = 0.
    for y_hat_max in Y_hat_max[Z_e==z]:
        bce_val = - (np.log(y_hat_max.item()))
        sq_val = (1-y_hat_max.item())**2
        total += bce_val
        losses['bce_loss'].append(bce_val)
        losses['sq_loss'].append(sq_val)
        losses['y_hat_max'].append(y_hat_max.item())
        losses['z'].append(str(z))
            
    print(total / len(Y_hat_max[Z_e==z]))
loss_mean = np.mean(losses['bce_loss'])
sq_loss_mean = np.mean(losses['sq_loss'])

In [None]:
fig = px.scatter(losses, x='sq_loss', y='bce_loss', facet_col='z', color='y_hat_max')
fig.add_hline(y=loss_mean, line_dash='dash', line_width=2, line_color='#3e3e3e', annotation_text='bce_loss mean')
fig.add_vline(x=sq_loss_mean, line_dash='dash', line_width=2, line_color='#3e3e3e', annotation_text='sq_loss mean')
fig.update_layout(
    **default_layout(width=np.inf)
)