In [110]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn.functional as F
import torch.optim as optim

import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../")))
from libs import data as dt, neuronshap as ns, sim
from cfgs.fedargs import *

from fairlearn.metrics import (
    demographic_parity_difference,
    demographic_parity_ratio,
    equalized_odds_difference,
    equalized_odds_ratio,
)
from libs.helpers.finance import bin_hours_per_week
from libs.helpers.metrics import (
    conditional_demographic_parity_difference,
    conditional_demographic_parity_ratio,
)
from libs.helpers.plot import group_box_plots

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [111]:
dataset = pd.read_csv('../../data/German_Credit_Data/german.data',header = None, delim_whitespace = True)
dataset.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [112]:
dataset.columns=["Existing-Account-Status","Month-Duration","Credit-History","Purpose","Credit-Amount","Saving-Acount","Present-Employment","Instalment-Rate","Sex","Guarantors","Residence","Property","Age","Installment","Housing","Existing-Credits","Job","Num-People","Telephone","Foreign-Worker","Status"]
dataset.head(5)

Unnamed: 0,Existing-Account-Status,Month-Duration,Credit-History,Purpose,Credit-Amount,Saving-Acount,Present-Employment,Instalment-Rate,Sex,Guarantors,...,Property,Age,Installment,Housing,Existing-Credits,Job,Num-People,Telephone,Foreign-Worker,Status
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [113]:
CategoricalFeatures=['Existing-Account-Status','Credit-History','Purpose','Saving-Acount', 'Present-Employment', 'Sex','Guarantors','Property','Installment','Housing','Job','Telephone','Foreign-Worker']
data_encode=dataset.copy()
data_visual=dataset.copy()
data_encode.head(5)

Unnamed: 0,Existing-Account-Status,Month-Duration,Credit-History,Purpose,Credit-Amount,Saving-Acount,Present-Employment,Instalment-Rate,Sex,Guarantors,...,Property,Age,Installment,Housing,Existing-Credits,Job,Num-People,Telephone,Foreign-Worker,Status
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [115]:
label_encoder = preprocessing.LabelEncoder()

for x in CategoricalFeatures:
    data_encode[x]=label_encoder.fit_transform(data_encode[x])
    data_encode[x].unique()

#data_encode[data_encode['Status'] == 1] = 0
data_encode[data_encode['Status'] == 2] = 0
data_encode[data_encode['Sex'] == 2] = 1

cts_features = ["Credit-Amount", "Age"]
ss = StandardScaler()
data_encode[cts_features] = ss.fit_transform(data_encode[cts_features])

data_encode.head(10)

Unnamed: 0,Existing-Account-Status,Month-Duration,Credit-History,Purpose,Credit-Amount,Saving-Acount,Present-Employment,Instalment-Rate,Sex,Guarantors,...,Property,Age,Installment,Housing,Existing-Credits,Job,Num-People,Telephone,Foreign-Worker,Status
0,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
1,0,0,0,0,-1.171394,0,0,0,0,0,...,0,-1.408943,0,0,0,0,0,0,0,0
2,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
3,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
4,0,0,0,0,-1.171394,0,0,0,0,0,...,0,-1.408943,0,0,0,0,0,0,0,0
5,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
6,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
7,1,1,1,1,0.896442,1,1,1,1,1,...,1,0.828824,1,1,1,1,1,1,1,1
8,3,12,2,4,0.228238,3,3,2,0,0,...,0,1.659191,2,1,1,1,1,0,0,1
9,0,0,0,0,-1.171394,0,0,0,0,0,...,0,-1.408943,0,0,0,0,0,0,0,0


In [78]:
# https://github.com/kaymomin/German-Credit-Rating/blob/master/German%20Rank.ipynb
train_oh, test_oh = train_test_split(data_encode, test_size=0.3, random_state=42)

In [79]:
m_dh_oh = test_oh.loc[test_oh["Sex"] == 1]
m_dh_oh = m_dh_oh.head(10)
fm_dh_oh = test_oh.loc[test_oh["Sex"] == 0]
fm_dh_oh = fm_dh_oh.head(10)

In [80]:
X_train = train_oh.drop(columns="Status").values
Y_train = train_oh['Status'].values
X_test = test_oh.drop(columns="Status").values
Y_test = test_oh['Status'].values
X_m = m_dh_oh.drop(columns="Status").values
Y_m = m_dh_oh['Status'].values
X_fm = fm_dh_oh.drop(columns="Status").values
Y_fm = fm_dh_oh['Status'].values

#creating torch dataset and loader using original dataset. 
#to use resampled dataset, replace ex. xtrain with xtrain_over etc.
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(Y_train).long())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(Y_test).long())
m_data = torch.utils.data.TensorDataset(torch.tensor(X_m).float(), torch.tensor(Y_m).long())
fm_data = torch.utils.data.TensorDataset(torch.tensor(X_fm).float(), torch.tensor(Y_fm).long())

train_loader = torch.utils.data.DataLoader(train_data,batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))
m_loader = torch.utils.data.DataLoader(m_data, batch_size=1)
fm_loader = torch.utils.data.DataLoader(fm_data, batch_size=1)

In [81]:
class BasicNet(torch.nn.Module):
    
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.layers = 0
        
        self.lin1 = torch.nn.Linear(self.num_features,  150)        
        self.lin2 = torch.nn.Linear(50, 50)        
        self.lin3 = torch.nn.Linear(50, 50)
        
        self.lin4 = torch.nn.Linear(150, 150) 
        
        self.lin5 = torch.nn.Linear(50, 50)        
        self.lin6 = torch.nn.Linear(50, 50)
        self.lin10 = torch.nn.Linear(150, self.num_classes)
        
        self.prelu = torch.nn.PReLU()
        self.dropout = torch.nn.Dropout(0.25)

    def forward(self, xin):
        self.layers = 0
        
        x = F.relu(self.lin1(xin))
        self.layers += 1
        
        #x = F.relu(self.lin2(x))
        #self.layers += 1
        for y in range(8):
            x = F.relu(self.lin4(x)) 
            self.layers += 1
           
        x = self.dropout(x)
        
        x = F.relu(self.lin10(x)) 
        self.layers += 1
        return x

In [82]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    
    for inputs, target in train_loader:
      
        #inputs, target = inputs.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target.long())
        # Backprop
        loss.backward()
        optimizer.step()
        ###

In [83]:
def test(model, test_loader):
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
      
        for inputs, target in test_loader:
            
            #inputs, target = inputs.to(device), target.to(device)
            
            output = model(inputs)
            test_size += len(inputs)
            test_loss += test_loss_fn(output, target.long()).item() 
            pred = output.max(1, keepdim=True)[1] 
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

In [84]:
model = BasicNet(20, 2)
test_accuracy = []
train_loss = []
nbr_epochs = 5
lr = 0.0025# 
weight_decay = 0

# Surrogate loss used for training
loss_fn = torch.nn.CrossEntropyLoss()
test_loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

print('Training beginning...')
#start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(model, train_loader, optimizer, epoch)
    loss, acc = test(model, test_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
#end_time = time.time()
#print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

Training beginning...
Epoch  1 :

Test set: Average loss: 0.4279, Accuracy: 209/300 (70%)

Epoch  2 :

Test set: Average loss: 0.3112, Accuracy: 209/300 (70%)

Epoch  3 :

Test set: Average loss: 0.2290, Accuracy: 209/300 (70%)

Epoch  4 :

Test set: Average loss: 0.2123, Accuracy: 209/300 (70%)

Epoch  5 :

Test set: Average loss: 0.2103, Accuracy: 300/300 (100%)



In [85]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]

print(sum(Y_test), sum(Y_pred), sum(pred))


Accuracy: 300/300 (100%)

209 tensor([209]) tensor([209])


<h1>Demographic Parity</h1>

<h2>Distribution of scores by sex</h2>

In [86]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.892
Demographic parity ratio: 0.108


<h1>Equalised Odds</h1>

<h2>Distribution of scores by sex for high and low earners</h2>

In [87]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)
'''
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)
'''

print(f"Equalised odds difference: {eod:.3f}")
#print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.000


<h1>Shapley based Neuron Pruning for Fairness</h1>

In [88]:
m_shapley_values = ns.calculate_shapley_values_fa(model, m_loader, 100)
print(m_shapley_values)
fm_shapley_values = ns.calculate_shapley_values_fa(model, fm_loader, 100)
print(fm_shapley_values)

[0.03342103 0.04455322 0.04611694 ... 0.009887   1.3468606  0.        ]
[9.4366414e-06 2.6361988e-04 0.0000000e+00 ... 1.0006590e+00 3.3067398e+00
 0.0000000e+00]


In [89]:
diff_shap_values = np.abs(m_shapley_values - fm_shapley_values)
max_diff_shap_values_ind = np.argpartition(diff_shap_values, -100)[-100:]
diff_shap_values[max_diff_shap_values_ind]

array([ 2.3024883,  2.3223257,  2.379363 ,  2.3277562,  2.3902104,
        2.3933585,  2.4477372,  2.484197 ,  2.5544975,  2.606608 ,
        2.8475337,  2.8283644,  2.5910747,  2.85245  ,  3.2894642,
        3.0080683,  3.2758923,  2.9312315,  2.918551 ,  2.8914022,
        3.2507064,  2.9012377,  2.9240448,  3.2674215,  3.392113 ,
        7.203474 , 12.726632 , 11.473837 ,  7.7141905,  8.39189  ,
       13.208869 , 11.354346 ,  6.901027 , 16.800743 , 12.575794 ,
       15.596204 ,  4.736683 , 10.767804 , 11.551165 ,  7.626538 ,
        9.0657425,  9.661372 ,  9.692471 ,  6.751466 ,  3.4327412,
       14.177499 ,  8.2977   ,  5.161647 ,  7.7474184,  3.712352 ,
        5.2523665,  9.352481 ,  6.9395304, 13.194591 ,  4.739308 ,
        8.077386 ,  7.7829876,  5.1630836,  3.537106 ,  6.0242143,
        8.715162 , 13.214752 ,  5.9832025,  9.424977 , 12.074395 ,
        6.7845573,  9.605252 ,  4.4549685, 12.42171  ,  3.7852285,
        7.584855 , 12.202214 ,  3.8586655,  8.215441 ,  6.6199

In [90]:
model_arr, model_slist = sim.get_net_arr(model)
model_arr[max_diff_shap_values_ind] = 0
updated_model = sim.get_arr_net(model, model_arr, model_slist)

In [91]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = updated_model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))
        

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]
test = pd.read_csv("../../data/adult/test.csv")


Accuracy: 300/300 (100%)



In [92]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.892
Demographic parity ratio: 0.108


In [21]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test_oh.Sex,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 1.000
Equalised odds ratio: 0.000
