In [47]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import shap
import torch
import torch.nn.functional as F
import torch.optim as optim

import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../")))
from libs import data as dt, neuronshap as ns, sim
from cfgs.fedargs import *

from fairlearn.metrics import (
    demographic_parity_difference,
    demographic_parity_ratio,
    equalized_odds_difference,
    equalized_odds_ratio,
    false_negative_rate,
    false_positive_rate,
    true_negative_rate,
    true_positive_rate,
)
from libs.helpers.finance import bin_hours_per_week
from libs.helpers.metrics import (
    conditional_demographic_parity_difference,
    conditional_demographic_parity_ratio,
)
from libs.helpers.plot import group_box_plots

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "salary",
]

In [49]:
def clean_string(s):
    """
    Helper function that strips leading / trailing whitespace, lower
    cases, and replaces hyphens with underscores.
    """
    return s.strip().lower().replace("-", "_")


def parse_native_country(country):
    """
    Group countries other than United-States and Mexico into single
    "other" category"
    """
    country = clean_string(country)
    if country == "united_states" or country == "mexico":
        return country
    return "other"

In [50]:
train = (
    pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None,
        na_values=[" ?"],
        names=names,
    )
    .drop(columns=["fnlwgt", "education_num"])
    # drop all rows with missing values
    .dropna()
    .reset_index(drop=True)
    # simple preprocessing on columns
    .assign(
        # clean all string columns
        education=lambda df: df.education.map(clean_string),
        marital_status=lambda df: df.marital_status.map(clean_string),
        occupation=lambda df: df.occupation.map(clean_string),
        race=lambda df: df.race.map(clean_string),
        relationship=lambda df: df.relationship.map(clean_string),
        workclass=lambda df: df.workclass.map(clean_string),
        # clean and aggregate native_country
        native_country=lambda df: df.native_country.map(parse_native_country),
        # encode binary features as integers
        salary=lambda df: (df.salary == " >50K").astype(np.int32),
        sex=lambda df: (df.sex == " Male").astype(np.int32),
    )
)

In [51]:
test = (
    pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
        header=None,
        na_values=[" ?"],
        skiprows=1,
        names=names,
    )
    .drop(columns=["fnlwgt", "education_num"])
    # drop all rows with missing values
    .dropna()
    .reset_index(drop=True)
    # simple preprocessing on columns
    .assign(
        # clean all string columns
        education=lambda df: df.education.map(clean_string),
        marital_status=lambda df: df.marital_status.map(clean_string),
        occupation=lambda df: df.occupation.map(clean_string),
        race=lambda df: df.race.map(clean_string),
        relationship=lambda df: df.relationship.map(clean_string),
        workclass=lambda df: df.workclass.map(clean_string),
        # clean and aggregate native_country
        native_country=lambda df: df.native_country.map(parse_native_country),
        # encode binary features as integers
        # note extra '.' in test set not present in train set
        salary=lambda df: (df.salary == " >50K.").astype(np.int32),
        sex=lambda df: (df.sex == " Male").astype(np.int32),
    )
)

In [52]:
assert set(train.education) == set(test.education)
assert set(train.race) == set(test.race)
assert set(train.relationship) == set(test.relationship)
assert set(train.marital_status) == set(test.marital_status)

In [53]:
one_hot_features = [
    "workclass",
    "education",
    "occupation",
    "race",
    "relationship",
    "marital_status",
    "native_country",
]

cts_features = ["age", "capital_gain", "capital_loss", "hours_per_week"]

binary_features = ["sex", "salary"]

In [54]:
train["race"].value_counts()

race
white                 25933
black                  2817
asian_pac_islander      895
amer_indian_eskimo      286
other                   231
Name: count, dtype: int64

In [55]:
train_df = pd.concat(
    [train, pd.get_dummies(train.loc[:, one_hot_features], dtype=np.int32)],
    axis=1,
)

test_df = pd.concat(
    [test, pd.get_dummies(test.loc[:, one_hot_features], dtype=np.int32)],
    axis=1,
)

In [56]:
test_df

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,...,marital_status_divorced,marital_status_married_af_spouse,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,native_country_mexico,native_country_other,native_country_united_states
0,25,private,11th,never_married,machine_op_inspct,own_child,black,1,0,0,...,0,0,0,0,1,0,0,0,0,1
1,38,private,hs_grad,married_civ_spouse,farming_fishing,husband,white,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2,28,local_gov,assoc_acdm,married_civ_spouse,protective_serv,husband,white,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,44,private,some_college,married_civ_spouse,machine_op_inspct,husband,black,1,7688,0,...,0,0,1,0,0,0,0,0,0,1
4,34,private,10th,never_married,other_service,not_in_family,white,1,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,private,bachelors,never_married,prof_specialty,own_child,white,1,0,0,...,0,0,0,0,1,0,0,0,0,1
15056,39,private,bachelors,divorced,prof_specialty,not_in_family,white,0,0,0,...,1,0,0,0,0,0,0,0,0,1
15057,38,private,bachelors,married_civ_spouse,prof_specialty,husband,white,1,0,0,...,0,0,1,0,0,0,0,0,0,1
15058,44,private,bachelors,divorced,adm_clerical,own_child,asian_pac_islander,1,5455,0,...,1,0,0,0,0,0,0,0,0,1


In [57]:
assert train_df.columns.tolist() == test_df.columns.tolist()

In [58]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [59]:
data_dir = "../../data/adult"

In [60]:
original_features = cts_features + one_hot_features + binary_features

train_df[original_features].to_csv("../../data/adult/train.csv", index=False)
val_df[original_features].to_csv("../../data/adult/val.csv", index=False)
test_df[original_features].to_csv("../../data/adult/test.csv", index=False)

In [61]:
ss = StandardScaler()

train_df[cts_features] = ss.fit_transform(train_df[cts_features])
val_df[cts_features] = ss.transform(val_df[cts_features])
test_df[cts_features] = ss.transform(test_df[cts_features])

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `

In [62]:
train_df.drop(columns=one_hot_features).to_csv("../../data/adult/train-one-hot.csv", index=False)
val_df.drop(columns=one_hot_features).to_csv("../../data/adult/val-one-hot.csv", index=False)
test_df.drop(columns=one_hot_features).to_csv("../../data/adult/test-one-hot.csv", index=False)

In [63]:
train = pd.read_csv("../../data/adult/train.csv")
val = pd.read_csv("../../data/adult/val.csv")
test = pd.read_csv("../../data/adult/test.csv")

train_oh = pd.read_csv("../../data/adult/train-one-hot.csv")
val_oh = pd.read_csv("../../data/adult/val-one-hot.csv")
test_oh = pd.read_csv("../../data/adult/test-one-hot.csv")

In [64]:
test_oh.head(5)

Unnamed: 0,age,sex,capital_gain,capital_loss,hours_per_week,salary,workclass_federal_gov,workclass_local_gov,workclass_private,workclass_self_emp_inc,...,marital_status_divorced,marital_status_married_af_spouse,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,native_country_mexico,native_country_other,native_country_united_states
0,-1.015917,1,-0.147741,-0.218133,-0.079269,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
1,-0.029378,1,-0.147741,-0.218133,0.752765,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,-0.788255,1,-0.147741,-0.218133,-0.079269,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0.425948,1,0.872159,-0.218133,-0.079269,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
4,-0.332929,1,-0.147741,-0.218133,-0.911303,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1


In [65]:
#https://github.com/ritvikkhanna09/Census-classifier-comparison

In [66]:
'''
race_amer_indian_eskimo               15060 non-null  int64  
race_asian_pac_islander               15060 non-null  int64  
race_black                            15060 non-null  int64  
race_other                            15060 non-null  int64  
race_white
 
'''

mr_dh_oh = test_oh.loc[(test_oh["race_asian_pac_islander"] == 1) | (test_oh["race_white"] == 1)]
mr_dh_oh = mr_dh_oh.head(100)
fmr_dh_oh = test_oh.loc[(test_oh["race_amer_indian_eskimo"] == 1) | (test_oh["race_black"] == 1) | (test_oh["race_other"] == 1)]
fmr_dh_oh = fmr_dh_oh.head(100)


m_dh_oh = test_oh.loc[test_oh["sex"] == 1]
m_dh_oh = m_dh_oh.head(100)
fm_dh_oh = test_oh.loc[test_oh["sex"] == 0]
fm_dh_oh = fm_dh_oh.head(100)

In [67]:
X_train = train_oh.drop(columns="salary").values
Y_train = train_oh['salary'].values
X_test = test_oh.drop(columns="salary").values
Y_test = test_oh['salary'].values
X_m = m_dh_oh.drop(columns="salary").values
Y_m = m_dh_oh['salary'].values
X_fm = fm_dh_oh.drop(columns="salary").values
Y_fm = fm_dh_oh['salary'].values

#creating torch dataset and loader using original dataset. 
#to use resampled dataset, replace ex. xtrain with xtrain_over etc.
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(Y_train).long())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(Y_test).long())
m_data = torch.utils.data.TensorDataset(torch.tensor(X_m).float(), torch.tensor(Y_m).long())
fm_data = torch.utils.data.TensorDataset(torch.tensor(X_fm).float(), torch.tensor(Y_fm).long())

train_loader = torch.utils.data.DataLoader(train_data,batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))
m_loader = torch.utils.data.DataLoader(m_data, batch_size=1)
fm_loader = torch.utils.data.DataLoader(fm_data, batch_size=1)
mr_loader = torch.utils.data.DataLoader(m_data, batch_size=1)
fmr_loader = torch.utils.data.DataLoader(fm_data, batch_size=1)

In [68]:
class BasicNet(torch.nn.Module):
    
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        self.layers = 0
        
        self.lin1 = torch.nn.Linear(self.num_features,  150)        
        self.lin2 = torch.nn.Linear(50, 50)        
        self.lin3 = torch.nn.Linear(50, 50)
        
        self.lin4 = torch.nn.Linear(150, 150) 
        
        self.lin5 = torch.nn.Linear(50, 50)        
        self.lin6 = torch.nn.Linear(50, 50)
        self.lin10 = torch.nn.Linear(150, self.num_classes)
        
        self.prelu = torch.nn.PReLU()
        self.dropout = torch.nn.Dropout(0.25)

    def forward(self, xin):
        self.layers = 0
        
        x = F.relu(self.lin1(xin))
        self.layers += 1
        
        #x = F.relu(self.lin2(x))
        #self.layers += 1
        for y in range(8):
            x = F.relu(self.lin4(x)) 
            self.layers += 1
           
        x = self.dropout(x)
        
        x = F.relu(self.lin10(x)) 
        self.layers += 1
        return x

In [69]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    
    for inputs, target in train_loader:
      
        #inputs, target = inputs.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target.long())
        # Backprop
        loss.backward()
        optimizer.step()
        ###

In [70]:
def test(model, test_loader):
    model.eval()
    
    test_loss = 0
    correct = 0
    test_size = 0
    
    with torch.no_grad():
      
        for inputs, target in test_loader:
            
            #inputs, target = inputs.to(device), target.to(device)
            
            output = model(inputs)
            test_size += len(inputs)
            test_loss += test_loss_fn(output, target.long()).item() 
            pred = output.max(1, keepdim=True)[1] 
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_size
    accuracy = correct / test_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_size,
        100. * accuracy))
    
    return test_loss, accuracy

In [71]:
model = BasicNet(63, 2)
test_accuracy = []
train_loss = []
nbr_epochs = 5
lr = 0.0025# 
weight_decay = 0

# Surrogate loss used for training
loss_fn = torch.nn.CrossEntropyLoss()
test_loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)
#optimizer = optim.SGD(model.parameters(), lr=lr ,weight_decay=weight_decay)
#optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

print('Training beginning...')
#start_time = time.time()

for epoch in range(1, nbr_epochs+1):
    print('Epoch ', epoch, ':')
    train(model, train_loader, optimizer, epoch)
    loss, acc = test(model, test_loader)
    
    # save results every epoch
    test_accuracy.append(acc)
    train_loss.append(loss)
    
#end_time = time.time()
#print('Training on ' + str(nbr_epochs) + ' epochs done in ', str(end_time-start_time),' seconds')

Training beginning...
Epoch  1 :

Test set: Average loss: 0.3206, Accuracy: 12791/15060 (85%)

Epoch  2 :

Test set: Average loss: 0.3252, Accuracy: 12673/15060 (84%)

Epoch  3 :

Test set: Average loss: 0.3254, Accuracy: 12805/15060 (85%)

Epoch  4 :

Test set: Average loss: 0.3344, Accuracy: 12805/15060 (85%)

Epoch  5 :

Test set: Average loss: 0.3224, Accuracy: 12779/15060 (85%)



In [142]:
for inputs, target in test_loader:
    inputs = inputs[1:100]
    print(inputs.shape)
    e = shap.DeepExplainer(model, inputs)
    
    m_impact_of_sex = []
    for i, _ in m_loader:
        shap_values =  e.shap_values(i)
        #print(np.array(shap_values[1]))
        impact_of_sex = np.array(shap_values[1])[0]
        impact_of_sex[impact_of_sex<0] = 0
        #print(impact_of_sex)
        m_impact_of_sex.append(impact_of_sex)
        
        
    fm_impact_of_sex = []
    for i, _ in fm_loader:
        shap_values =  e.shap_values(i)
        #print(np.array(shap_values[1]))
        impact_of_sex = np.array(shap_values[1])[0]
        print(impact_of_sex)
        impact_of_sex[impact_of_sex>0] = 0
        impact_of_sex = impact_of_sex * -1
        fm_impact_of_sex.append(impact_of_sex)        
        
    break
    
diff = np.array(m_impact_of_sex) - np.array(fm_impact_of_sex)

#print(m_impact_of_sex[0])

#print(m_impact_of_sex[0], fm_impact_of_sex[0], diff[0])

s_diff = [sum(arr) for arr in diff]
m_impact_of_sex = [sum(arr) for arr in m_impact_of_sex]
fm_impact_of_sex = [sum(arr) for arr in fm_impact_of_sex]
#print(s_diff)

#print(m_impact_of_sex)

m_ind = np.argpartition(m_impact_of_sex, -50)[-50:]
f_ind = np.argpartition(fm_impact_of_sex, -50)[-50:]
#print(ind)

torch.Size([99, 63])
[ 9.00074840e-02 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
  3.19386646e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00  4.75395359e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02 -3.43054198e-02
  3.94927710e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  1.92718394e-02 -2.01605484e-01
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -4.70414804e-03  0.00000000e+00  0.00000000e+00 -4.12643813e-02
 -1.80634353e-02  0.00000000e+00  0.00000000e+00 -9.20194387e-02
  0.00000000e+00 -8.11003447e-02  0.00000000e+00  0.00000000e+00
  0.

[ 1.24396645e-01 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
  3.19386646e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
 -2.57996887e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00  4.75395359e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02  9.62056313e-03
 -2.46728281e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  1.92718394e-02  1.49389654e-02
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -4.70414804e-03  0.00000000e+00  0.00000000e+00 -4.12643813e-02
 -1.80634353e-02 -1.61552638e-01  0.00000000e+00 -9.20194387e-02
  0.00000000e+00  7.83996098e-03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000

In [113]:
update_m = []
update_fm = []

for index, (i, m) in enumerate(m_loader):
    if index in m_ind:
        update_m.append(tuple([i[0], m[0]]))
        
for index, (j, f) in enumerate(fm_loader):
    if index in f_ind:
        update_fm.append(tuple([j[0], f[0]]))       
        
um_loader = torch.utils.data.DataLoader(update_m, batch_size=1)
ufm_loader = torch.utils.data.DataLoader(update_fm, batch_size=1)

In [121]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]

print(sum(Y_test), sum(Y_pred), sum(pred))

test = pd.read_csv("../../data/adult/test.csv")


Accuracy: 12779/15060 (85%)

3700 tensor([3661]) tensor([3661])


<h1>Demographic Parity</h1>

<h2>Distribution of scores by sex</h2>

In [122]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test.sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test.sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.214
Demographic parity ratio: 0.315


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


<h2>Distribution of scores by race</h2>

In [76]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test.race,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test.race,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.226
Demographic parity ratio: 0.263


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


<h1>Conditional Demographic Parity</h1>

<h2>Distribution of scores by sex and hours worked per week</h2>

In [77]:
test_hpw_enum = test.hours_per_week.map(bin_hours_per_week)

cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

Conditional demographic parity difference: 0.200
Conditional demographic parity ratio: 0.496


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map inst

<h2>Distribution of scores by race and hours worked per week</h2>

In [78]:
cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.race, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.race, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

Conditional demographic parity difference: 0.306
Conditional demographic parity ratio: 0.107


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map inst

<h1>Equalised Odds</h1>

<h2>Distribution of scores by sex for high and low earners</h2>

In [79]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test.sex,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test.sex,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.106
Equalised odds ratio: 0.254


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


<h2>Distribution of scores by race for high and low earners</h2>

In [80]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test.race,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test.race,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

Equalised odds difference: 0.419
Equalised odds ratio: 0.172


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


<h1>Shapley based Neuron Pruning for Fairness</h1>

In [133]:
m_shapley_values = ns.calculate_shapley_values_fa(model, um_loader, 100)
print(m_shapley_values)
fm_shapley_values = ns.calculate_shapley_values_fa(model, ufm_loader, 100)
print(fm_shapley_values)

[  1.2874211    2.974666     0.31111678 ... 214.10822    246.08226
   0.        ]
[1.3887310e+00 0.0000000e+00 2.0470287e-01 ... 2.6251630e+02 2.5253532e+01
 0.0000000e+00]


In [134]:
diff_shap_values = m_shapley_values - fm_shapley_values
max_diff_shap_values_ind = np.argpartition(diff_shap_values, -50)[-50:]
diff_shap_values[max_diff_shap_values_ind]

array([ 12.821328,  13.037482,  13.06879 ,  13.396547,  13.121091,
        13.601552,  13.848332,  14.943618,  13.933263,  14.207593,
        14.069859,  15.283588,  20.148027,  20.540659,  16.61804 ,
        19.85488 ,  22.82862 ,  36.814438,  67.26234 ,  16.733624,
        18.175508,  15.791675,  16.668663,  20.8291  ,  21.520172,
        17.76001 ,  15.917422,  25.213995,  21.500858,  16.0526  ,
        17.43257 ,  16.80828 ,  21.917643,  20.746685,  20.868774,
        23.394451, 112.742134,  56.731434,  35.975353,  16.110249,
        18.491457,  58.12367 ,  22.970488,  22.444593,  22.68563 ,
        20.445074,  21.750399,  16.03243 , 220.82874 ,  25.486677],
      dtype=float32)

In [135]:
model_arr, model_slist = sim.get_net_arr(model)
model_arr[max_diff_shap_values_ind] = 0
updated_model = sim.get_arr_net(model, model_arr, model_slist)

In [136]:
with torch.no_grad():
    for inputs, target in test_loader:
        outputs = updated_model(inputs)
        pred = outputs.max(1, keepdim=True)[1] 
        correct = pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / len(inputs)
        print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, len(inputs), 100. * accuracy))
        

Y_prob = F.softmax(outputs, dim=1)[:, 1]
Y_pred = outputs.max(1, keepdim=True)[1]

print(sum(Y_test), sum(Y_pred), sum(pred))

test = pd.read_csv("../../data/adult/test.csv")


Accuracy: 12386/15060 (82%)

3700 tensor([1258]) tensor([1258])


In [137]:
dpd = demographic_parity_difference(
    Y_test, Y_pred, sensitive_features=test.sex,
)
dpr = demographic_parity_ratio(
    Y_test, Y_pred, sensitive_features=test.sex,
)

print(f"Demographic parity difference: {dpd:.3f}")
print(f"Demographic parity ratio: {dpr:.3f}")

Demographic parity difference: 0.084
Demographic parity ratio: 0.240


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


In [104]:
test_hpw_enum = test.hours_per_week.map(bin_hours_per_week)

cdpd = conditional_demographic_parity_difference(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)
cdpr = conditional_demographic_parity_ratio(
    Y_test, Y_pred, test.sex, test_hpw_enum,
)

print(f"Conditional demographic parity difference: {cdpd:.3f}")
print(f"Conditional demographic parity ratio: {cdpr:.3f}")

DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map inst

Conditional demographic parity difference: 0.117
Conditional demographic parity ratio: 0.432


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


In [41]:
eod = equalized_odds_difference(
    Y_test, Y_pred, sensitive_features=test.sex,
)
eor = equalized_odds_ratio(
    Y_test, Y_pred, sensitive_features=test.sex,
)

print(f"Equalised odds difference: {eod:.3f}")
print(f"Equalised odds ratio: {eor:.3f}")

DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


Equalised odds difference: 0.001
Equalised odds ratio: 0.804


DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.
DataFrame.applymap has been deprecated. Use DataFrame.map instead.


In [None]:
[ 9.00074840e-02 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
  3.19386646e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00  4.75395359e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02 -3.43054198e-02
  3.94927710e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  1.92718394e-02 -2.01605484e-01
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -4.70414804e-03  0.00000000e+00  0.00000000e+00 -4.12643813e-02
 -1.80634353e-02  0.00000000e+00  0.00000000e+00 -9.20194387e-02
  0.00000000e+00 -8.11003447e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00]
[ 9.38284919e-02 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
  2.65410505e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00 -6.34444803e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02  9.62056313e-03
 -2.46728281e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  1.92718394e-02  1.49389654e-02
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -2.31849048e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -1.80634353e-02  0.00000000e+00  0.00000000e+00 -9.20194387e-02
  0.00000000e+00 -8.11003447e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00]
[ 1.26307175e-01 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
 -2.20374744e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00 -6.34444803e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02  9.62056313e-03
 -2.46728281e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  1.92718394e-02  1.49389654e-02
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -4.70414804e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.33578435e-01  0.00000000e+00  0.00000000e+00  1.73560332e-03
  0.00000000e+00  7.83996098e-03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00]
[ 1.14844121e-01 -8.45844597e-02 -4.55300391e-01  0.00000000e+00
 -7.60136098e-02 -3.55740963e-03  5.62841073e-03 -7.49419024e-03
 -1.01426260e-04  1.03654610e-02  5.60003240e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.66253392e-03
 -4.56439011e-04 -1.12660863e-02  0.00000000e+00 -6.34444803e-02
 -2.30722371e-02  0.00000000e+00 -2.24308446e-02  9.62056313e-03
  3.94927710e-03  0.00000000e+00  1.61199516e-03 -2.18056217e-02
  0.00000000e+00  0.00000000e+00  4.19547223e-03  1.49389654e-02
  0.00000000e+00 -1.73117965e-02 -4.02477337e-03 -5.68673771e-04
 -8.22058786e-03  3.41295614e-03  0.00000000e+00  0.00000000e+00
  5.93788223e-03  0.00000000e+00 -4.87497170e-03  4.92350291e-03
 -4.70414804e-03  0.00000000e+00  0.00000000e+00 -4.12643813e-02
 -1.80634353e-02  0.00000000e+00  0.00000000e+00 -9.20194387e-02
  0.00000000e+00  7.83996098e-03  0.00000000e+00 -9.31027234e-02
  0.00000000e+00  0.00000000e+00  0.00000000e+00]