In [73]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn.functional as F
import torch.optim as optim

import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../")))
from libs import data as dt, neuronshap as ns, sim
from cfgs.fedargs import *

from fairlearn.metrics import (
    demographic_parity_difference,
    demographic_parity_ratio,
    equalized_odds_difference,
    equalized_odds_ratio,
)
from libs.helpers.finance import bin_hours_per_week
from libs.helpers.metrics import (
    conditional_demographic_parity_difference,
    conditional_demographic_parity_ratio,
)
from libs.helpers.plot import group_box_plots

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
df = pd.read_csv('../../data/kdd/kdd.csv')
df['income'] = [1 if v == ">50K" else 0 for v in df['income']]

le = preprocessing.LabelEncoder()
for i in df.columns:
    if df[i].dtypes == 'object':
        df[i] = le.fit_transform(df[i])

df.head()

Unnamed: 0,age,workclass,industry,occupation,education,wage-per-hour,enroll-in-edu-inst-last-wk,marital-status,major-industry,major-occupation,...,country-father,country-mother,country-birth,citizenship,own-business,fill-questionnaire,veterans-benefits,weeks-worked,year,income
0,73,3,0,0,12,0,2,6,14,6,...,39,39,39,4,0,1,2,0,95,0
1,58,6,4,34,16,0,2,0,4,8,...,39,39,39,4,0,1,2,52,94,0
2,18,3,0,0,0,0,1,4,14,6,...,40,40,40,0,0,1,2,0,95,0
3,9,3,0,0,10,0,2,4,14,6,...,39,39,39,4,0,1,0,0,94,0
4,10,3,0,0,10,0,2,4,14,6,...,39,39,39,4,0,1,0,0,94,0


In [75]:
train_oh, test_oh = train_test_split(df, test_size=0.2, random_state=42)

In [76]:
test_oh.head(5)

Unnamed: 0,age,workclass,industry,occupation,education,wage-per-hour,enroll-in-edu-inst-last-wk,marital-status,major-industry,major-occupation,...,country-father,country-mother,country-birth,citizenship,own-business,fill-questionnaire,veterans-benefits,weeks-worked,year,income
25305,4,3,0,0,10,0,2,4,14,6,...,39,39,39,4,0,1,0,0,95,0
129821,32,4,34,17,14,0,2,4,7,12,...,39,39,39,4,0,1,2,52,94,0
253617,23,4,42,30,1,0,2,4,12,7,...,39,39,39,4,0,1,2,43,95,0
16250,22,4,33,29,12,0,2,3,19,7,...,39,39,39,4,0,1,2,52,95,0
200353,54,7,41,6,14,0,2,3,9,10,...,2,2,2,0,0,1,2,52,94,0


In [77]:
#https://github.com/MatteoM95/Default-of-Credit-Card-Clients-Dataset-Analisys/blob/main/Default_of_Credit_Card_Clients.ipynb

In [78]:
m_dh_oh = test_oh.loc[test_oh["sex"] == 1]
m_dh_oh = m_dh_oh.head(100)
fm_dh_oh = test_oh.loc[test_oh["sex"] == 0]
fm_dh_oh = fm_dh_oh.head(100)

In [79]:
X_train = train_oh.drop(columns="income").values
Y_train = train_oh['income'].values
X_test = test_oh.drop(columns="income").values
Y_test = test_oh['income'].values
X_m = m_dh_oh.drop(columns="income").values
Y_m = m_dh_oh['income'].values
X_fm = fm_dh_oh.drop(columns="income").values
Y_fm = fm_dh_oh['income'].values

#creating torch dataset and loader using original dataset. 
#to use resampled dataset, replace ex. xtrain with xtrain_over etc.
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(Y_train).long())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(Y_test).long())
m_data = torch.utils.data.TensorDataset(torch.tensor(X_m).float(), torch.tensor(Y_m).long())
fm_data = torch.utils.data.TensorDataset(torch.tensor(X_fm).float(), torch.tensor(Y_fm).long())

train_loader = torch.utils.data.DataLoader(train_data,batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))
m_loader = torch.utils.data.DataLoader(m_data, batch_size=1)
fm_loader = torch.utils.data.DataLoader(fm_data, batch_size=1)

In [80]:
class BasicNet(torch.nn.Module):
    
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.layers = 0
        
        self.lin1 = torch.nn.Linear(self.num_features,  150)        
        self.lin2 = torch.nn.Linear(50, 50)        
        self.lin3 = torch.nn.Linear(50, 50)
        
        self.lin4 = torch.nn.Linear(150, 150) 
        
        self.lin5 = torch.nn.Linear(50, 50)        
        self.lin6 = torch.nn.Linear(50, 50)
        self.lin10 = torch.nn.Linear(150, self.num_classes)
        
        self.prelu = torch.nn.PReLU()
        self.dropout = torch.nn.Dropout(0.25)

    def forward(self, xin):
        self.layers = 0
        
        x = F.relu(self.lin1(xin))
        self.layers += 1
        
        #x = F.relu(self.lin2(x))
        #self.layers += 1
        for y in range(8):
            x = F.relu(self.lin4(x)) 
            self.layers += 1
           
        x = self.dropout(x)
        
        x = F.relu(self.lin10(x)) 
        self.layers += 1
        return x

In [81]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    
    for inputs, target in train_loader:
      
        #inputs, target = inputs.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target.long())
        # Backprop
        loss.backward()
        optimizer.step()
        ###

In [82]:
def test(model, test_loader):
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
      
        for inputs, target in test_loader:
            
            #inputs, target = inputs.to(device), target.to(device)
            
            output = model(inputs)
            test_size += len(inputs)
            test_loss += test_loss_fn(output, target.long()).item() 
            pred = output.max(1, keepdim=True)[1] 
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

In [83]:
model = BasicNet(36, 2)
test_accuracy = []
train_loss = []
nbr_epochs = 5
lr = 0.0025# 
weight_decay = 0

# Surrogate loss used for training
loss_fn = torch.nn.CrossEntropyLoss()
test_loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

print('Training beginning...')
#start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(model, train_loader, optimizer, epoch)
    loss, acc = test(model, test_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
#end_time = time.time()
#print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

Training beginning...
Epoch  1 :

Test set: Average loss: 0.1462, Accuracy: 53475/56912 (94%)

Epoch  2 :

Test set: Average loss: 0.1427, Accuracy: 53917/56912 (95%)

Epoch  3 :

Test set: Average loss: 0.1439, Accuracy: 53475/56912 (94%)

Epoch  4 :

Test set: Average loss: 0.1457, Accuracy: 53475/56912 (94%)

Epoch  5 :

Test set: Average loss: 0.1464, Accuracy: 53475/56912 (94%)



In [84]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]

print(sum(Y_test), sum(Y_pred), sum(pred))


Accuracy: 53475/56912 (94%)

3437 tensor([0]) tensor([0])


<h1>Demographic Parity</h1>

<h2>Distribution of scores by sex</h2>

In [62]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.016
Demographic parity ratio: 0.334


<h2>Distribution of scores by marriage</h2>

In [187]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.014
Demographic parity ratio: 0.889


<h1>Conditional Demographic Parity</h1>

<h2>Distribution of scores by sex and hours worked per week</h2>

In [188]:
test_hpw_enum = test.hours_per_week.map(bin_hours_per_week)

cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 8881 but corresponding boolean dimension is 15060

<h2>Distribution of scores by race and hours worked per week</h2>

In [158]:
cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.race, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.race, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 8881 but corresponding boolean dimension is 15060

<h1>Equalised Odds</h1>

<h2>Distribution of scores by sex for high and low earners</h2>

In [63]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.010
Equalised odds ratio: 0.622


<h2>Distribution of scores by race for high and low earners</h2>

In [169]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.MARRIED,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

ZeroDivisionError: float division by zero

<h1>Shapley based Neuron Pruning for Fairness</h1>

In [64]:
m_shapley_values = ns.calculate_shapley_values_fa(model, m_loader, 100)
print(m_shapley_values)
fm_shapley_values = ns.calculate_shapley_values_fa(model, fm_loader, 100)
print(fm_shapley_values)

[2.5755290e-03 2.4965693e-06 0.0000000e+00 ... 2.0104255e+03 5.8078485e+02
 0.0000000e+00]
[5.9698927e-03 6.5512427e-06 0.0000000e+00 ... 1.6270671e+03 3.1382614e+02
 0.0000000e+00]


In [68]:
diff_shap_values = np.abs(m_shapley_values - fm_shapley_values)
max_diff_shap_values_ind = np.argpartition(diff_shap_values, -100)[-100:]
diff_shap_values[max_diff_shap_values_ind]

array([ 22.555035,  22.612995,  22.807823,  22.957474,  22.94933 ,
        22.972733,  23.697567,  39.228905,  33.247314,  23.862822,
        31.285164,  39.440807,  24.934755,  23.096247,  34.458427,
        25.766638,  26.46446 ,  29.394264,  27.494488,  28.910027,
        26.42323 ,  32.40368 ,  39.00482 ,  33.743385,  25.526945,
        23.772076,  35.775436,  24.46581 ,  26.267853,  27.39611 ,
        23.459423,  37.937027,  24.335651,  27.772589,  34.54961 ,
        27.114605,  33.08581 ,  39.054413,  29.189547,  29.986631,
        33.915546,  25.542051,  25.880674,  32.75854 ,  24.621471,
        34.404816,  25.15136 ,  31.471264,  24.418486,  31.507065,
        26.1749  ,  32.225166,  23.676643,  39.42842 ,  33.505207,
        34.004772,  25.156286,  34.56465 ,  35.794533,  23.560934,
        34.353546,  23.37143 ,  24.555092,  34.908783,  28.327225,
        31.53728 ,  25.308632,  24.949059,  32.583786,  26.471794,
        31.647612,  23.808586,  39.763256,  57.393127,  43.729

In [69]:
model_arr, model_slist = sim.get_net_arr(model)
model_arr[max_diff_shap_values_ind] = 0
updated_model = sim.get_arr_net(model, model_arr, model_slist)

In [70]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = updated_model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))
        

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]
test = pd.read_csv("../../data/adult/test.csv")


Accuracy: 53795/56912 (95%)



In [71]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.008
Demographic parity ratio: 0.383


In [72]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.sex,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.006
Equalised odds ratio: 0.698
