In [219]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn.functional as F
import torch.optim as optim

import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../")))
from libs import data as dt, neuronshap as ns, sim
from cfgs.fedargs import *

from fairlearn.metrics import (
    demographic_parity_difference,
    demographic_parity_ratio,
    equalized_odds_difference,
    equalized_odds_ratio,
)
from libs.helpers.finance import bin_hours_per_week
from libs.helpers.metrics import (
    conditional_demographic_parity_difference,
    conditional_demographic_parity_ratio,
)
from libs.helpers.plot import group_box_plots

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [220]:
df = pd.read_csv('../../data/ccc/ccc.csv', index_col='ID')
df.rename(columns={'default payment next month':'DEFAULT'}, inplace=True)
df.rename(columns={'PAY_0': 'PAY_1'}, inplace=True)
df.rename(columns=lambda x: x.upper(), inplace=True)

df = df.drop(df[df['MARRIAGE']==0].index)
df = df.drop(df[df['EDUCATION']==0].index)
df = df.drop(df[df['EDUCATION']==5].index)
df = df.drop(df[df['EDUCATION']==6].index)

pay_features = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for p in pay_features:
    df.loc[df[p]<0, p] = -1
    df.loc[df[p]>=0, p] = df.loc[df[p]>=0, p] + 1
    df[p] = df[p].astype('int64')
    
df['GRAD_SCHOOL'] = (df['EDUCATION'] == 1).astype('int64')
df['UNIVERSITY'] = (df['EDUCATION'] == 2).astype('int64')
df['HIGH_SCHOOL'] = (df['EDUCATION'] == 3).astype('int64')
df.drop('EDUCATION', axis=1, inplace=True)

df['MALE'] = (df['SEX'] == 1).astype('int64')
df.drop('SEX', axis=1, inplace=True)

df['MARRIED'] = (df['MARRIAGE'] == 1).astype('int64')
df.drop('MARRIAGE', axis=1, inplace=True)

cts_features = ['LIMIT_BAL', 'AGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
ss = StandardScaler()

df[cts_features] = ss.fit_transform(df[cts_features])

df.head()

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT,GRAD_SCHOOL,UNIVERSITY,HIGH_SCHOOL,MALE,MARRIED
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.135512,-1.244325,1.646675,1.670289,-1.17951,-1.178579,-1.184503,-1.147751,-0.641203,-0.646339,...,-0.29569,-0.307347,-0.314553,-0.293444,1,0,1,0,0,1
2,-0.365937,-1.027242,-1.34018,1.670289,0.260241,0.302104,0.343007,1.871139,-0.657981,-0.665755,...,-0.238809,-0.243696,-0.314553,-0.180174,1,0,1,0,0,0
3,-0.59681,-0.158912,0.153248,0.23882,0.260241,0.302104,0.343007,0.361694,-0.296016,-0.492299,...,-0.238809,-0.243696,-0.248953,-0.010269,0,0,1,0,0,0
4,-0.90464,0.166712,0.153248,0.23882,0.260241,0.302104,0.343007,0.361694,-0.054075,-0.009999,...,-0.227433,-0.237331,-0.244427,-0.236809,0,0,1,0,0,1
5,-0.90464,2.337536,-1.34018,0.23882,-1.17951,0.302104,0.343007,0.361694,-0.577089,-0.610131,...,0.273118,0.265508,-0.269355,-0.254989,0,0,1,0,1,1


In [221]:
train_oh, test_oh = train_test_split(df, test_size=0.3, random_state=42)

In [222]:
test_oh.head(5)

Unnamed: 0_level_0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT,GRAD_SCHOOL,UNIVERSITY,HIGH_SCHOOL,MALE,MARRIED
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4239,-0.750725,-0.701618,0.153248,0.23882,0.260241,0.302104,0.343007,0.361694,-0.668244,-0.648511,...,-0.287499,-0.297799,-0.300777,-0.265126,0,0,1,0,0,1
9264,-0.442895,-0.593077,0.153248,0.23882,-1.17951,1.782787,1.870517,1.871139,-0.340163,-0.563179,...,-0.210369,-0.176099,-0.314553,0.115573,0,0,1,0,1,1
25370,0.32668,-0.918701,-1.34018,0.23882,0.260241,0.302104,1.870517,-1.147751,-0.157566,-0.144653,...,-0.221745,-0.307347,-0.180729,-0.062317,0,1,0,0,0,0
4629,0.634511,1.360665,0.899961,-1.192649,-1.17951,-1.178579,-1.184503,-1.147751,-0.694536,-0.690077,...,-0.29569,-0.307347,-0.314553,-0.293444,1,1,0,0,0,0
4515,-0.90464,0.492335,0.153248,0.23882,0.260241,0.302104,0.343007,0.361694,-0.048841,-0.020095,...,0.042523,-0.116395,0.144645,-0.259463,1,0,1,0,0,1


In [223]:
#https://github.com/MatteoM95/Default-of-Credit-Card-Clients-Dataset-Analisys/blob/main/Default_of_Credit_Card_Clients.ipynb

In [224]:
m_dh_oh = test_oh.loc[test_oh["MALE"] == 1]
m_dh_oh = m_dh_oh.head(10)
fm_dh_oh = test_oh.loc[test_oh["MALE"] == 0]
fm_dh_oh = fm_dh_oh.head(10)

In [225]:
X_train = train_oh.drop(columns="DEFAULT").values
Y_train = train_oh['DEFAULT'].values
X_test = test_oh.drop(columns="DEFAULT").values
Y_test = test_oh['DEFAULT'].values
X_m = m_dh_oh.drop(columns="DEFAULT").values
Y_m = m_dh_oh['DEFAULT'].values
X_fm = fm_dh_oh.drop(columns="DEFAULT").values
Y_fm = fm_dh_oh['DEFAULT'].values

#creating torch dataset and loader using original dataset. 
#to use resampled dataset, replace ex. xtrain with xtrain_over etc.
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(Y_train).long())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(Y_test).long())
m_data = torch.utils.data.TensorDataset(torch.tensor(X_m).float(), torch.tensor(Y_m).long())
fm_data = torch.utils.data.TensorDataset(torch.tensor(X_fm).float(), torch.tensor(Y_fm).long())

train_loader = torch.utils.data.DataLoader(train_data,batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))
m_loader = torch.utils.data.DataLoader(m_data, batch_size=1)
fm_loader = torch.utils.data.DataLoader(fm_data, batch_size=1)

In [226]:
class BasicNet(torch.nn.Module):
    
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.layers = 0
        
        self.lin1 = torch.nn.Linear(self.num_features,  150)        
        self.lin2 = torch.nn.Linear(50, 50)        
        self.lin3 = torch.nn.Linear(50, 50)
        
        self.lin4 = torch.nn.Linear(150, 150) 
        
        self.lin5 = torch.nn.Linear(50, 50)        
        self.lin6 = torch.nn.Linear(50, 50)
        self.lin10 = torch.nn.Linear(150, self.num_classes)
        
        self.prelu = torch.nn.PReLU()
        self.dropout = torch.nn.Dropout(0.25)

    def forward(self, xin):
        self.layers = 0
        
        x = F.relu(self.lin1(xin))
        self.layers += 1
        
        #x = F.relu(self.lin2(x))
        #self.layers += 1
        for y in range(8):
            x = F.relu(self.lin4(x)) 
            self.layers += 1
           
        x = self.dropout(x)
        
        x = F.relu(self.lin10(x)) 
        self.layers += 1
        return x

In [227]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    
    for inputs, target in train_loader:
      
        #inputs, target = inputs.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target.long())
        # Backprop
        loss.backward()
        optimizer.step()
        ###

In [228]:
def test(model, test_loader):
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
      
        for inputs, target in test_loader:
            
            #inputs, target = inputs.to(device), target.to(device)
            
            output = model(inputs)
            test_size += len(inputs)
            test_loss += test_loss_fn(output, target.long()).item() 
            pred = output.max(1, keepdim=True)[1] 
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

In [233]:
model = BasicNet(25, 2)
test_accuracy = []
train_loss = []
nbr_epochs = 5
lr = 0.0001# 
weight_decay = 0

# Surrogate loss used for training
loss_fn = torch.nn.CrossEntropyLoss()
test_loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

print('Training beginning...')
#start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(model, train_loader, optimizer, epoch)
    loss, acc = test(model, test_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
#end_time = time.time()
#print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

Training beginning...
Epoch  1 :

Test set: Average loss: 0.5260, Accuracy: 6859/8881 (77%)

Epoch  2 :

Test set: Average loss: 0.4959, Accuracy: 6859/8881 (77%)

Epoch  3 :

Test set: Average loss: 0.4636, Accuracy: 6859/8881 (77%)

Epoch  4 :

Test set: Average loss: 0.4593, Accuracy: 6867/8881 (77%)

Epoch  5 :

Test set: Average loss: 0.4519, Accuracy: 7170/8881 (81%)



In [234]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]

print(sum(Y_test), sum(Y_pred), sum(pred))


Accuracy: 7170/8881 (81%)

2022 tensor([1403]) tensor([1403])


<h1>Demographic Parity</h1>

<h2>Distribution of scores by sex</h2>

In [235]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.029
Demographic parity ratio: 0.833


<h2>Distribution of scores by marriage</h2>

In [236]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.021
Demographic parity ratio: 0.877


<h1>Conditional Demographic Parity</h1>

<h2>Distribution of scores by sex and hours worked per week</h2>

In [237]:
test_hpw_enum = test.hours_per_week.map(bin_hours_per_week)

cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

AttributeError: 'function' object has no attribute 'hours_per_week'

<h2>Distribution of scores by race and hours worked per week</h2>

In [158]:
cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.race, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.race, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 8881 but corresponding boolean dimension is 15060

<h1>Equalised Odds</h1>

<h2>Distribution of scores by sex for high and low earners</h2>

In [189]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.012
Equalised odds ratio: 0.796


<h2>Distribution of scores by race for high and low earners</h2>

In [169]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

ZeroDivisionError: float division by zero

<h1>Shapley based Neuron Pruning for Fairness</h1>

In [202]:
m_shapley_values = ns.calculate_shapley_values_fa(model, m_loader, 10)
print(m_shapley_values)
fm_shapley_values = ns.calculate_shapley_values_fa(model, fm_loader, 10)
print(fm_shapley_values)

[0.06606107 0.06768857 0.12627816 ... 4.874558   0.         0.        ]
[0.09096518 0.07978442 0.14642611 ... 5.3610454  0.         0.        ]


In [213]:
diff_shap_values = np.abs(m_shapley_values - fm_shapley_values)
max_diff_shap_values_ind = np.argpartition(diff_shap_values, -20)[-20:]
diff_shap_values[max_diff_shap_values_ind]

array([0.20136762, 0.20322788, 0.22048664, 0.21353096, 0.21179128,
       0.2123821 , 0.21049368, 0.22313958, 0.29447818, 0.2394085 ,
       0.25583547, 0.25395095, 0.28545606, 0.24384797, 0.26192316,
       0.22396778, 0.24085248, 0.4864874 , 0.23142874, 0.23466134],
      dtype=float32)

In [214]:
model_arr, model_slist = sim.get_net_arr(model)
model_arr[max_diff_shap_values_ind] = 0
updated_model = sim.get_arr_net(model, model_arr, model_slist)

In [215]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = updated_model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))
        

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]
test = pd.read_csv("../../data/adult/test.csv")


Accuracy: 7220/8881 (81%)



In [216]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.016
Demographic parity ratio: 0.866


In [217]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MALE,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.010
Equalised odds ratio: 0.801
