In [None]:
import os
import pandas as pd

from aif360.datasets import BinaryLabelDataset, CompasDataset

from humancompatible.repair.methods.data_analysis import rdata_analysis
from humancompatible.repair.preprocess.baseline_preprocess import Baselinepreprocess

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [2]:
import warnings
# Ignore every FutureWarning raised from this point on, which keeps the
# outputs of the experiment cells free of those messages.
warnings.simplefilter(action='ignore', category=FutureWarning)

# COMPAS dataset

In [3]:
# Protected attribute used for the COMPAS experiment.
pa = 'race'

# Display names for the encoded label / protected-attribute values; they are
# only forwarded to the dataset as metadata.
label_map = {
    1.0: 'Did recid.',
    0.0: 'No recid.',
}
protected_attribute_maps = {
    1.0: 'Caucasian',
    0.0: 'Not Caucasian',
}

# Group selectors keyed by the protected attribute (1 = privileged, 0 = unprivileged).
unprivileged_groups = [{pa: 0}]
privileged_groups = [{pa: 1}]

# NOTE(review): two privileged-class lists are supplied for a single protected
# attribute -- confirm whether the second entry ([1]) is actually needed.
compas_options = dict(
    protected_attribute_names=[pa],
    privileged_classes=[['Caucasian'], [1]],
    metadata={
        'label_map': label_map,
        'protected_attribute_maps': protected_attribute_maps,
    },
    features_to_drop=['age', 'sex', 'c_charge_desc'],
)
cd = CompasDataset(**compas_options)

In [4]:
# Baseline pre-processing methods assessed on every random split.
methods = ['origin', 'RW', 'DIremover', 'LFR']
report_columns = ['DI of train', 'DI', 'f1 macro', 'f1 micro', 'f1 weighted', 'method']

# The per-method result frames are collected in a list and concatenated once
# after the loop; growing `report` with pd.concat on every iteration re-copies
# all previous rows each time. The empty, labelled frame stays first so the
# column order of the final report is the same as before.
report_frames = [pd.DataFrame(columns=report_columns)]
for ignore in range(10):
    # NOTE(review): this was annotated as a 4:2:4 train/valid/test split. If the
    # fractions are cut points (as documented for aif360's `split`), 0.4 followed
    # by 0.3 of the remaining 60% yields roughly 40% / 18% / 42% -- confirm which
    # is intended. No seed is passed, so each run draws different splits, and
    # `valid` is not used in this cell.
    train, test = cd.split([0.4], shuffle=True)
    valid, test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train, test)
    for method in methods:
        report_frames.append(prepro.assess(method))

report = pd.concat(report_frames, ignore_index=True)

# index=False is the documented way to omit the row index from the CSV.
report.to_csv('../data/report_preprocess_compas_' + str(pa) + '.csv', index=False)

Disparate Impact of train 0.88268198865032
Disparate Impact of origin 0.7990410000946503
f1 macro of origin 0.6606574329136953
Disparate Impact of train 1
Disparate Impact of RW 0.7826524620417751
f1 macro of RW 0.6545518585569539
Disparate Impact of train 0.88268198865032
Disparate Impact of DIremover 0.9068771372318745
f1 macro of DIremover 0.6380307369175431
Disparate Impact of train 0.9224133615871055
Disparate Impact of LFR 0.8785597034899163
f1 macro of LFR 0.6639718528842544
Disparate Impact of train 0.8036080534443975
Disparate Impact of origin 0.7933092665979047
f1 macro of origin 0.6609621777941983
Disparate Impact of train 1
Disparate Impact of RW 0.7671817527586758
f1 macro of RW 0.6686182076847955
Disparate Impact of train 0.8036080534443975
Disparate Impact of DIremover 0.8868412177698747
f1 macro of DIremover 0.6691445795313495
Disparate Impact of train 0.9683940526910703
Disparate Impact of LFR 1.0373354850277927
f1 macro of LFR 0.663497173954508
Disparate Impact of tra

# Adult dataset

In [5]:
def load_data(data_path,var_list,pa):
    """
    Read the Adult train/test files, keep the requested columns, binarise the
    protected attribute and the label, and bucket four numeric attributes
    (age, hours-per-week, capital-gain, capital-loss) into ordinal bins.

    Parameters:
        data_path (str): Directory containing 'adult.data' and 'adult.test'.
        var_list (list of str): Names of the non-protected attributes to keep.
            Every one of them is cast to int64, and the list must include
            'age', 'hours-per-week', 'capital-gain' and 'capital-loss' because
            those columns are discretized unconditionally.
        pa (str): Name of the protected attribute; it is returned as column 'S'.

    Returns:
        pd.DataFrame: Columns var_list + ['S', 'Y', 'W'], where 'S' and 'Y' are
        0/1 integers and 'W' is the constant 1 on every row. Rows whose
        protected attribute does not map to 0 or 1 are dropped.
    """
    column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation', 'relationship',
                'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'Y']

    # Encodings for the protected attribute (sex or race values) and for the
    # income label, which appears both with and without a trailing dot.
    pa_dict = {'Male':1,'Female':0,'White':1,'Black':0}
    label_dict = {'>50K.':1,'>50K':1,'<=50K.':0,'<=50K':0}

    # Options shared by both reads; '?' marks a missing value.
    read_options = dict(names=column_names, skipinitialspace=True, na_values=['?'])
    train = pd.read_csv(os.path.join(data_path, 'adult.data'), header=None, **read_options)
    # header=0 together with names: the first line of the test file is treated
    # as a header row and discarded in favour of column_names.
    test = pd.read_csv(os.path.join(data_path, 'adult.test'), header=0, **read_options)

    keep_cols = var_list + [pa, 'Y']
    messydata = pd.concat([test, train], ignore_index=True)[keep_cols].rename(columns={pa: 'S'})
    messydata['S'] = messydata['S'].replace(pa_dict)
    messydata['Y'] = messydata['Y'].replace(label_dict)

    # Keep only the rows whose protected attribute was mapped to 0 or 1.
    in_binary_group = (messydata['S'] == 0) | (messydata['S'] == 1)
    messydata = messydata[in_binary_group]
    messydata = messydata.astype({col: 'int64' for col in var_list + ['S', 'Y']})
    messydata['W'] = 1

    # Bin thresholds per attribute, applied in this order.
    bin_edges = {
        'age': [26,36,46,56],
        'hours-per-week': [21,36,46,61],
        'capital-gain': [100,3500,7500,10000],
        'capital-loss': [100,1600,1900,2200],
    }
    for col, edges in bin_edges.items():
        messydata = categerise(messydata, col, edges)

    return messydata

def categerise(df,col,bins):
    """
    Replace the values of df[col] by the index of the bin they fall into.

    With ascending edges b0 < b1 < ... < bk the codes are:
        0      for values <  b0
        i      for values in [b(i-1), b(i))
        k + 1  for values >= bk
    Values for which every comparison is False (e.g. NaN) are left unchanged.

    Parameters:
        df (pd.DataFrame): Frame to discretize; it is modified in place.
        col (str): Name of the column to discretize.
        bins (list of numbers): Non-empty, ascending list of bin edges.

    Returns:
        pd.DataFrame: The same `df` object, with `col` overwritten by bin codes.
    """
    # Every mask is built from a snapshot of the raw values. Comparing against
    # the column while it is being overwritten would re-bin already written
    # codes whenever an edge is <= such a code (e.g. edges [-5, 0, 1]).
    raw = df[col].copy()
    n_edges = len(bins)
    for code in range(n_edges + 1):
        if code == 0:
            mask = raw < bins[0]
        elif code == n_edges:
            mask = raw >= bins[-1]
        else:
            mask = (raw >= bins[code - 1]) & (raw < bins[code])
        df.loc[mask, col] = code
    return df

In [6]:
def choose_x(var_list,messydata,threshold=0.1):
    """
    Select the non-protected attributes to repair based on their
    protected-attribute-wise Total Variation (TV) distance.

    For every attribute, `rdata_analysis` is asked for the distributions stored
    under the keys 'x_0' and 'x_1' (presumably one per protected group --
    confirm against `rdata_analysis`), and the TV distance is computed as half
    the sum of their absolute differences. An attribute is selected when its
    distance is strictly greater than `threshold`.

    Parameters:
        var_list (list of str): List of non-protected attribute names.
        messydata (pd.DataFrame): The cleaned dataset; it must contain a 'W' column.
        threshold (float): Minimum TV distance (exclusive) for an attribute to
            be selected. Defaults to 0.1.

    Returns:
        x_list (list of str): Attributes whose TV distance exceeds `threshold`.
        tv_dist (dict): Maps each non-protected attribute to its TV distance.
    """
    tv_dist = {}
    for x_name in var_list:
        # Distinct values of the attribute, read from the index of a pivot table over 'W'.
        x_range_single = list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
        dist = rdata_analysis(messydata, x_range_single, x_name)
        tv_dist[x_name] = sum(abs(dist['x_0'] - dist['x_1'])) / 2

    x_list = [name for name, distance in tv_dist.items() if distance > threshold]
    return x_list, tv_dist

In [7]:
# Location of the raw Adult files and the settings of this experiment.
data_path='..//data//adult'
var_list=['hours-per-week','age','capital-gain','capital-loss','education-num']
pa='race'
favorable_label = 1
var_dim=len(var_list)

messydata = load_data(data_path, var_list, pa)
x_list, tv_dist = choose_x(var_list, messydata)

# load_data returns the protected attribute as column 'S'; give it back its
# real name before building the dataset.
messydata = messydata.rename(columns={'S': pa})

# label_names is documented as an iterable of column names, so a list is
# passed: a bare string only behaves correctly for a one-character name.
# NOTE(review): the constant 'W' column is still part of df and is presumably
# treated as an ordinary feature -- confirm whether it should be passed as
# instance_weights_name or dropped.
cd = BinaryLabelDataset(
    favorable_label=favorable_label,
    unfavorable_label=0,
    df=messydata,
    label_names=['Y'],
    protected_attribute_names=[pa],
)

In [8]:
# Baseline pre-processing methods assessed on every random split.
methods = ['origin', 'RW', 'DIremover', 'LFR']
report_columns = ['DI of train', 'DI', 'f1 macro', 'f1 micro', 'f1 weighted', 'method']

# The per-method result frames are collected in a list and concatenated once
# after the loop; growing `report` with pd.concat on every iteration re-copies
# all previous rows each time. The empty, labelled frame stays first so the
# column order of the final report is the same as before.
report_frames = [pd.DataFrame(columns=report_columns)]
for ignore in range(10):
    # NOTE(review): this was annotated as a 4:2:4 train/valid/test split. If the
    # fractions are cut points (as documented for aif360's `split`), 0.4 followed
    # by 0.3 of the remaining 60% yields roughly 40% / 18% / 42% -- confirm which
    # is intended. No seed is passed, so each run draws different splits, and
    # `valid` is not used in this cell.
    train, test = cd.split([0.4], shuffle=True)
    valid, test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train, test)
    for method in methods:
        report_frames.append(prepro.assess(method))

report = pd.concat(report_frames, ignore_index=True)

# index=False is the documented way to omit the row index from the CSV.
report.to_csv('../data/report_preprocess_adult_' + str(pa) + '.csv', index=False)

Disparate Impact of train 0.5047295646456601
Disparate Impact of origin 0.43384447781112045
f1 macro of origin 0.6873499495899884
Disparate Impact of train 1
Disparate Impact of RW 0.424028841499607
f1 macro of RW 0.6802092930363968
Disparate Impact of train 0.5047295646456601
Disparate Impact of DIremover 0.4079993090603388
f1 macro of DIremover 0.6738015015640795
Disparate Impact of train 0.8233927395305995
Disparate Impact of LFR 0.8527292796915192
f1 macro of LFR 0.7036787732472928
Disparate Impact of train 0.46625508009785355
Disparate Impact of origin 0.46116931191508526
f1 macro of origin 0.6869889279851386
Disparate Impact of train 0.9999999999999999
Disparate Impact of RW 0.4671465020563257
f1 macro of RW 0.6794516354102634
Disparate Impact of train 0.46625508009785355
Disparate Impact of DIremover 0.44687666949114396
f1 macro of DIremover 0.688378873582636
Disparate Impact of train 0.8333115118577075
Disparate Impact of LFR 0.8760194063951693
f1 macro of LFR 0.694463012783399

In [9]:
# Display the Adult results table assembled by the experiment loop.
report

Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,0.50473,0.433844,0.68735,0.817828,0.792008,origin
1,1.0,0.424029,0.680209,0.818238,0.789075,RW
2,0.50473,0.407999,0.673802,0.816956,0.785775,DIremover
3,0.823393,0.852729,0.703679,0.812343,0.796661,LFR
4,0.466255,0.461169,0.686989,0.818802,0.79297,origin
5,1.0,0.467147,0.679452,0.816751,0.788911,RW
6,0.466255,0.446877,0.688379,0.819109,0.793689,DIremover
7,0.833312,0.876019,0.694463,0.809678,0.792357,LFR
8,0.448994,0.561786,0.674593,0.815419,0.785847,origin
9,1.0,0.566771,0.675858,0.816136,0.786679,RW
