In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
from load_data import *
from bias_functions import *
from utility_functions import *

In [3]:
def getDataset(setSelection):
    """Load one of the supported datasets and append a constant column of ones.

    Parameters
    ----------
    setSelection : str
        Name of the dataset to load. Supported values are 'toy', 'adult',
        'bank', 'german' and 'mortgage' (matched exactly, case-sensitive).

    Returns
    -------
    X : numpy.ndarray
        The features returned by the corresponding loader with one extra
        trailing column of ones (so a linear model can learn an affine function).
        Assumes the loaders return a 2-D array-like -- TODO confirm in load_data.
    y
        The labels exactly as returned by the loader.

    Raises
    ------
    ValueError
        If ``setSelection`` is not one of the supported names. (Previously the
        function only printed a message and then crashed with an
        UnboundLocalError on the undefined ``X``.)
    """
    if setSelection == 'toy':
        X,y = generate_toy_data(1000,200,2)
    elif setSelection == 'adult':
        protectedAttributes={'race':'White','gender':'Male'}
        X,y = load_adult(protectedAttributes=protectedAttributes)
    elif setSelection == 'bank':
        X,y = load_bank(smaller=True)
    elif setSelection == 'german':
        X,y = load_german()
    elif setSelection == 'mortgage':
        protectedCategoricalAttributes={'applicant_ethnicity_name':'Not Hispanic or Latino',
                                'applicant_race_name_1':'White','applicant_sex_name':'Male'}
        protectedNumericalAttributes=['minority_population']
        X,y = load_mortgage(protectedCategoricalAttributes=protectedCategoricalAttributes, \
                            protectedNumericalAttributes=protectedNumericalAttributes)
    else:
        # Fail fast: without a loaded dataset there is nothing to return.
        raise ValueError(
            'dataset not recognised: {!r} (expected one of '
            "'toy', 'adult', 'bank', 'german', 'mortgage')".format(setSelection)
        )

    X = np.hstack([X, np.ones((X.shape[0],1))]) ## add ones to solve for affine functions

    return X,y

In [6]:
def computeUnconstrainedMeasures_initOnly(trainxs,testxs,trainys,testys,sensitiveAttributeIndex=0,noOfSamplingRuns=20):
    """Summarise accuracy and fairness gaps of unconstrained fits on a fixed split.

    The model is fitted ``noOfSamplingRuns`` times by minimising ``logisticLoss``
    from a fresh random starting point each time, so the summary averages over
    logistic regression initialisations only (the train/test split is fixed).

    Parameters
    ----------
    trainxs, testxs
        Feature matrices for the training and test split; ``trainxs.shape[1]``
        determines the length of the weight vector.
    trainys, testys
        Labels for the training and test split.
    sensitiveAttributeIndex : int, default 0
        Forwarded unchanged to the three fairness helpers.
    noOfSamplingRuns : int, default 20
        Number of random initialisations to fit.

    Returns
    -------
    numpy.ndarray of shape (2, 4, 2)
        1st dimension: train/test.
        2nd dimension: accuracy, fairnessMeasure1, fairnessMeasure2, fairnessMeasure3.
        3rd dimension: col0 mean, col1 standard deviation over the runs.
        NaN entries of individual runs are ignored when averaging.
    """
    # resultsArray: 1st dimension - train/test; 2nd dimension - run;
    # 3rd dimension - col0:accuracy, col1:fairnessMeasure1, col2:fairnessMeasure2, col3:fairnessMeasure3
    resultsArray = np.zeros((2,noOfSamplingRuns,4))

    # index 0 = training data, index 1 = test data (matches resultsArray's 1st dimension)
    splits = ((trainxs, trainys), (testxs, testys))

    for run in range(noOfSamplingRuns):
        w = minimize(fun = logisticLoss,
            x0 = np.random.rand(trainxs.shape[1],),
            args = (trainxs, trainys),
            method = 'SLSQP',
            options = {"maxiter":100000},
            constraints = []
            )

        # accuracy on both splits first (same call order as before)
        for splitIndex, (xs, ys) in enumerate(splits):
            resultsArray[splitIndex,run,0] = 1-errorRate(w.x, xs, ys)

        # fairness measures: each helper returns a pair whose absolute
        # difference is stored (presumably one value per group -- confirm in bias_functions)
        for splitIndex, (xs, ys) in enumerate(splits):
            fairnessMeasure1 = differenceDisparateImpactModel(w.x,xs,sensitiveAttributeIndex=sensitiveAttributeIndex)
            fairnessMeasure2 = differenceEqualOpportunity(w.x,xs,ys,sensitiveAttributeIndex=sensitiveAttributeIndex)
            fairnessMeasure3 = differenceDisparateMistreatment(w.x,xs,ys,sensitiveAttributeIndex=sensitiveAttributeIndex,type='OMR')

            resultsArray[splitIndex,run,1:] = [np.abs(measure[0]-measure[1])
                                               for measure in (fairnessMeasure1, fairnessMeasure2, fairnessMeasure3)]

    # resultsSummary:
    # 1st dimension: train/test
    # 2nd dimension: accuracy, fairnessMeasures (x3)
    # 3rd dimension: col0: mean, col1: std
    resultsSummary = np.zeros((2,4,2))
    # NaN-aware statistics, matching computeUnconstrainedMeasures_fullRandomisation:
    # a single undefined fairness gap must not turn the whole column into NaN.
    resultsSummary[:,:,0] = np.nanmean(resultsArray, axis=1)
    resultsSummary[:,:,1] = np.nanstd(resultsArray, axis=1)

    return resultsSummary

In [9]:
def computeUnconstrainedMeasures_fullRandomisation(X,y,sensitiveAttributeIndex=0,noOfSamplingRuns=20,train_size=0.8):
    """Summarise accuracy and fairness gaps over random splits and initialisations.

    Every run draws a new train/test split of ``(X, y)`` via ``train_test_split``
    and fits an unconstrained model by minimising ``logisticLoss`` from a fresh
    random starting point, so the summary averages over both sources of randomness.

    Parameters
    ----------
    X, y
        Full feature matrix and labels to be split in every run.
    sensitiveAttributeIndex : int, default 0
        Forwarded unchanged to the three fairness helpers.
    noOfSamplingRuns : int, default 20
        Number of split + fit repetitions.
    train_size : float, default 0.8
        Forwarded to ``train_test_split``.

    Returns
    -------
    numpy.ndarray of shape (2, 4, 2)
        1st dimension: train/test.
        2nd dimension: accuracy, fairnessMeasure1, fairnessMeasure2, fairnessMeasure3.
        3rd dimension: col0 mean, col1 standard deviation over the runs
        (NaN entries are ignored).
    """
    # perRunResults[split, run, measure]; split 0 = train, split 1 = test;
    # measure 0 = accuracy, measures 1..3 = the three fairness gaps
    perRunResults = np.zeros((2,noOfSamplingRuns,4))

    for runIndex in range(noOfSamplingRuns):

        trainxs, testxs, trainys, testys  = train_test_split(X,y,train_size=train_size)

        initialWeights = np.random.rand(trainxs.shape[1],)
        fit = minimize(fun = logisticLoss,
            x0 = initialWeights,
            args = (trainxs, trainys),
            method = 'SLSQP',
            options = {"maxiter":100000},
            constraints = []
            )
        weights = fit.x

        partitions = [(trainxs, trainys), (testxs, testys)]

        # accuracies for train and test are evaluated before any fairness measure
        for part, (features, labels) in enumerate(partitions):
            perRunResults[part,runIndex,0] = 1-errorRate(weights,features, labels)

        # fairness gaps: absolute difference between the two entries each helper returns
        for part, (features, labels) in enumerate(partitions):
            measurePairs = (
                differenceDisparateImpactModel(weights,features,sensitiveAttributeIndex=sensitiveAttributeIndex),
                differenceEqualOpportunity(weights,features,labels,sensitiveAttributeIndex=sensitiveAttributeIndex),
                differenceDisparateMistreatment(weights,features,labels,sensitiveAttributeIndex=sensitiveAttributeIndex,type='OMR'),
            )
            perRunResults[part,runIndex,1:] = [np.abs(pair[0]-pair[1]) for pair in measurePairs]

    # summary[split, measure, statistic]; statistic 0 = mean, statistic 1 = std
    summary = np.zeros((2,4,2))
    summary[:,:,0] = np.nanmean(perRunResults, axis=1)
    summary[:,:,1] = np.nanstd(perRunResults, axis=1)

    return summary

## Sensitive attributes list (column index - attribute name, per dataset):

adult: 8 - race, 9 - gender ;  variables = 15 ; 45222 samples

bank: 0 - age ; variables = 21 ; 41188 samples

german: 6 - gender, 9 - age, 14 - foreign worker ; variables = 25 ; 1000 samples

mortgage: 12 - ethnicity, 14 - race, 16 - gender, 24 - minority population ; variables = 30 ; 200000 samples

In [17]:
# Full experiment grid (currently disabled; swap with the single-entry lists below to run it):
# sensitiveAttributeList = ['Age','Age','Gender','Gender','Gender','Race','Race']
# datasetList = ['Bank','German','Adult','German','Mortgage','Adult','Mortgage']
# sensitiveAttributeIndexList = [0,9,9,6,16,8,14]

# The three lists are parallel: entry k describes one (attribute, dataset, column index) experiment.
sensitiveAttributeList = ['Age']
datasetList = ['Bank']
sensitiveAttributeIndexList = [0]

assert len(sensitiveAttributeList) == len(datasetList) == len(sensitiveAttributeIndexList), \
    'experiment lists must have the same length'

# One row per experiment: attribute, dataset, then 'mean $\pm$ std' for each
# train measure followed by each test measure.
cell_text = []

for sensitiveAttribute, dataset, sensitiveAttributeIndex in zip(
        sensitiveAttributeList, datasetList, sensitiveAttributeIndexList):
    # getDataset matches lower-case names only
    X,y = getDataset(dataset.lower())

    # resultsSummary:
    # 1st dimension: train/test
    # 2nd dimension: accuracy, fairnessMeasures (x3)
    # 3rd dimension: col0: mean, col1: std
    resultsSummary = computeUnconstrainedMeasures_fullRandomisation(X,y,sensitiveAttributeIndex=sensitiveAttributeIndex)

    cell_text_row = [sensitiveAttribute, dataset]
    # reshape(-1, 2) walks the summary train-first, measure by measure, yielding (mean, std) pairs.
    # Raw string: '\p' is not a valid escape sequence, so the backslash of the
    # LaTeX '\pm' must be written literally.
    cell_text_row.extend(
        r'{:.4f} $\pm$ {:.4f}'.format(mean, std)
        for mean, std in resultsSummary.reshape(-1, 2)
    )

    print(cell_text_row)

    cell_text.append(cell_text_row)
    

(41188, 21)
(41188, 20)
age: 0
A smaller version of the dataset is loaded...


  p_yHatEqualY_z1 = n_yHatEqualY_z1 / n_z1


['Age', 'Bank', '0.9793 $\\pm$ 0.0013', '0.0213 $\\pm$ 0.0174', '0.2661 $\\pm$ 0.0588', '0.0723 $\\pm$ 0.0325', '0.9806 $\\pm$ 0.0043', '0.0099 $\\pm$ 0.0039', '0.2840 $\\pm$ 0.1569', '0.0779 $\\pm$ 0.1183']
