In [1]:
import os
import pandas as pd
import numpy as np

from aif360.algorithms.preprocessing import DisparateImpactRemover,Reweighing,LFR
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

from sklearn.ensemble import RandomForestClassifier
from aif360.datasets import CompasDataset, AdultDataset
from sklearn.metrics import f1_score

from humancompatible.repair.methods.cost import c_generate, c_generate_higher
from humancompatible.repair.methods.data_analysis import rdata_analysis
from humancompatible.repair.methods.coupling_utils import projection, projection_higher
from humancompatible.repair.group_blind_repair import GroupBlindRepair

# if you need "OptimPreproc"
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions\
            import get_distortion_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools


pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [19]:
class Projpreprocess:
    
    def __init__(self,traindata,x_list,var_list,K,e):

        self.K=K
        self.e=e
        self.x_list=x_list
        self.var_list=var_list
            
        print("var_list ====== ")
        print(self.var_list)
        if 'W' in self.var_list:
            self.var_list.remove('W')
      
        self.var_dim=len(self.var_list)
        self.arg_list=[elem for elem in self.var_list if elem not in x_list]
        self.train = traindata.copy()
        self.df = self.train.convert_to_dataframe()[0]
        self.pa = self.train.protected_attribute_names[0]
        self.pa_index = self.train.feature_names.index(self.pa)
        self.label_name = self.train.label_names[0]
        self.df=self.df.rename(columns={self.pa:'S',self.label_name:'Y'})

        self.df['W'] = self.train.instance_weights
        for col in self.var_list+['S','Y']:
            self.df[col]=self.df[col].astype('int64')
        self.df=self.df[self.var_list+['S','W','Y']]
        if len(x_list)>1:
            self.df['X'] = list(zip(*[self.df[c] for c in x_list]))
            self.x_range=sorted(set(self.df['X']))
            weight=list(1/(self.df[x_list].max()-self.df[x_list].min())) # because ranges of attributes differ
            self.C=c_generate_higher(self.x_range,weight)
        else:
            self.df['X']=self.df[x_list]
            self.x_range=sorted(set(self.df['X']))
            self.C=c_generate(self.x_range)
        
        # DEBUGGING
        # print(type(self.df))
        # print(self.arg_list)
        # print(self.df.columns)
        # print(self.df)
        # print(self.df['W'])
        # print(check_duplicate_columns_equal(self.df))
        # self.df = remove_duplicate_columns(self.df)
        # print(type(self.df))
        # print(self.arg_list)
        # print(self.df.columns)
        # print(self.df)
        # print(self.df['W'])
        # print(check_duplicate_columns_equal(self.df))


        self.df = self.df[list(set(self.arg_list+['X','S','Y','W']))].groupby(by=list(set(self.arg_list+['X','S','Y'])),as_index=False).sum()
        self.distribution_generator()
        
    def distribution_generator(self):
        bin=len(self.x_range)
        dist=rdata_analysis(self.df,self.x_range,'X')
        dist['v']=[(dist['x_0'][i]-dist['x_1'][i])/dist['x'][i] for i in range(bin)]
        dist['t_x']=dist['x'] # #dist['x'] #dist['x_0']*0.5+dist['x_1']*0.5 
        self.px=np.matrix(dist['x']).T
        self.ptx=np.matrix(dist['t_x']).T
        if np.any(dist['x_0']==0): 
            self.p0=np.matrix((dist['x_0']+1.0e-9)/sum(dist['x_0']+1.0e-9)).T
        else:
            self.p0=np.matrix(dist['x_0']).T 
        if np.any(dist['x_1']==0):
            self.p1=np.matrix((dist['x_1']+1.0e-9)/sum(dist['x_1']+1.0e-9)).T
        else:
            self.p1=np.matrix(dist['x_1']).T 
        self.V=np.matrix(dist['v']).T
        # self.tv_origin=sum(abs(dist['x_0']-dist['x_1']))/2
        # return px,ptx,V,p0,p1
    
    def _run_method(self, method, C, eps, px, ptx, K, V=None, theta=None):
        group_blind = GroupBlindRepair(C, px, ptx, V=V, epsilon=eps, K=K)
        if method == "baseline":
            group_blind.fit_baseline()
        elif method == "partial_repair":
            group_blind.fit_partial(theta)
        elif method == "total_repair":
            group_blind.fit_total()
        return group_blind.coupling_matrix()

    def coupling_generator(self,method,Theta=1e-2):
        if method == 'unconstrained':
            coupling=self._run_method(method="baseline", C=self.C, eps=self.e, px=self.px, ptx=self.ptx, K=self.K)
        elif method == 'barycentre':
            coupling=self._run_method(method="baseline", C=self.C, eps=self.e, px=self.p0, ptx=self.p1, K=self.K)
        elif method == 'partial':
            coupling=self._run_method(method="partial_repair", C=self.C, eps=self.e, px=self.px, ptx=self.ptx, V=self.V, theta=Theta, K=self.K)
        return coupling

    def preprocess(self,method,Theta=1e-2):
        coupling = self.coupling_generator(method,Theta)
        if len(self.x_list)>1:
            df_proj=projection_higher(self.df,coupling,self.x_range,self.x_list,self.var_list)
        else:
            df_proj=projection(self.df,coupling,self.x_range,self.x_list[0],self.var_list)
        df_proj = df_proj.groupby(by=self.arg_list+['X','S','Y'],as_index=False).sum()
        X=list(zip(*df_proj['X']))
        df_proj = df_proj.assign(**{self.x_list[i]:X[i] for i in range(len(self.x_list))})
        df_proj=df_proj.drop('X',axis=1)
        df_proj=df_proj.rename(columns={'S':self.pa,'Y':self.label_name})
        binaryLabelDataset = BinaryLabelDataset(
            favorable_label=0,
            unfavorable_label=1,
            df=df_proj.drop('W',axis=1), 
            label_names=self.train.label_names,
            protected_attribute_names=self.train.protected_attribute_names,
            privileged_protected_attributes=[np.array([1.0])],unprivileged_protected_attributes=[np.array([0.])])
        binaryLabelDataset.instance_weights = df_proj['W'].tolist()
        # return binaryLabelDataset.align_datasets(self.train)

        # print("preprocess function ===================")
        # print(df_proj)
        # print(df_proj['W'])
        # print("binaryLabelDataset ===================")
        # print(type(binaryLabelDataset))
        # print(binaryLabelDataset)
        # print("self.train ===================")")
        # print(type(self.train))
        # print(self.train)

        return self.train.align_datasets(binaryLabelDataset)

In [4]:
class Baselinepreprocess:

    def __init__(self,train,test):
        self.train = train
        self.test = test
        self.pa = train.protected_attribute_names[0]
        self.pa_index = train.feature_names.index(self.pa)
        self.prigroups = [{self.pa: 1}]
        self.unprigroups = [{self.pa: 0}]

    def preprocessing(self,method):
        test_tranf = self.test.copy()
        if method == 'RW':
            RW = Reweighing(privileged_groups = self.prigroups,unprivileged_groups = self.unprigroups) #DisparateImpactRemover(repair_level = 1)
            RW.fit(self.train)
            train_tranf = RW.transform(self.train)
        elif method == 'DIremover':
            di = DisparateImpactRemover(repair_level = 1,sensitive_attribute=self.pa)
            train_tranf = di.fit_transform(self.train)
            test_tranf = di.fit_transform(self.test)
        elif method == 'LFR':
            TR = LFR(privileged_groups = self.prigroups,unprivileged_groups = self.unprigroups,
                     Az = 1, Ax = 0.01, Ay = 1,verbose=0)
            TR = TR.fit(self.train)
            train_tranf = TR.transform(self.train)
            test_tranf = TR.transform(self.test)
        elif method == 'OP':
            optim_options = {
                "distortion_fun": get_distortion_adult,
                "epsilon": 0.05,
                "clist": [0.99, 1.99, 2.99],
                "dlist": [.1, 0.05, 0]
            }
            OP = OptimPreproc(OptTools, optim_options)
            OP = OP.fit(self.train)
            train_tranf = OP.transform(self.train, transform_Y=True)
        return train_tranf, test_tranf

    def prediction(self,method,para=None):
        test_tranf = self.test.copy()
        if method == 'origin':
            train_tranf = self.train
        elif method in ['RW','DIremover','LFR','OP']:
            train_tranf,test_tranf = self.preprocessing(method)
        else:
            K=200
            e=0.01
            var_list=self.train.feature_names.copy()
            var_list.remove(self.pa)
            projpre=Projpreprocess(self.train,para['x_list'],var_list,K,e)
            train_tranf=projpre.preprocess(method,para['Theta'])

        di=self.DisparateImpact(train_tranf)
        print('Disparate Impact of train',di)

        if method != 'LFR':
            X_train = np.delete(train_tranf.features, self.pa_index, axis=1)
            y_train = train_tranf.labels.ravel()
            weight_train = train_tranf.instance_weights
            model=RandomForestClassifier(max_depth=5).fit(X_train,y_train, sample_weight=weight_train)

            X_test = np.delete(test_tranf.features, self.pa_index, axis=1)
            y_pred = model.predict(X_test)
        else:
            y_pred = test_tranf.labels
        return y_pred,di
    
    def DisparateImpact(self,data):
        di = pd.DataFrame({'S':data.protected_attributes.ravel().tolist(),
            'Y':data.labels.ravel().tolist(),
            'W':list(data.instance_weights)},columns=['S','Y','W'])
        privileged = self.train.privileged_protected_attributes[0][0]
        unprivileged = self.train.unprivileged_protected_attributes[0][0]
        numerator=sum(di[(di['S']==unprivileged)&(di['Y']==data.favorable_label)]['W'])/sum(di[di['S']==unprivileged]['W'])
        denominator=sum(di[(di['S']==privileged)&(di['Y']==data.favorable_label)]['W'])/sum(di[di['S']==privileged]['W'])
        if numerator==denominator:
            return 1
        return numerator/denominator

    def assess(self,method,para=None):
        if para != None:
            y_pred,di_train = self.prediction(method,para)
        else:
            y_pred,di_train = self.prediction(method)
        y_test_pred = self.test.copy()
        y_test_pred.labels = y_pred

        di=self.DisparateImpact(y_test_pred)
        f1_macro = f1_score(self.test.labels, y_pred, average='macro',sample_weight=self.test.instance_weights)
        f1_micro = f1_score(self.test.labels, y_pred, average='micro',sample_weight=self.test.instance_weights)
        f1_weighted = f1_score(self.test.labels, y_pred, average='weighted',sample_weight=self.test.instance_weights)
        print('Disparate Impact of '+str(method),di)
        print('f1 macro of '+str(method),f1_macro)

        new_row=pd.Series({'DI of train':di_train,'DI':di,'f1 macro':f1_macro,'f1 micro':f1_micro,'f1 weighted':f1_weighted,'method':method})
        return new_row.to_frame().T

## Compas

In [5]:
pa = 'race'
label_map = {1.0: 'Did recid.', 0.0: 'No recid.'}
protected_attribute_maps = {1.0: 'Caucasian', 0.0: 'Not Caucasian'}
privileged_groups = [{pa: 1}]
unprivileged_groups = [{pa: 0}]
cd = CompasDataset(protected_attribute_names=[pa],privileged_classes=[['Caucasian'],[1]], 
                    metadata={'label_map': label_map,'protected_attribute_maps': protected_attribute_maps},
                    features_to_drop=['age', 'sex', 'c_charge_desc'])
train,test = cd.split([0.6], shuffle=True) #len(test.instance_names) = 2057
var_list = cd.feature_names.copy()
var_list.remove(pa)
var_dim=len(var_list)

# df_train = df.loc[train.instance_names,:].reset_index(drop=True)
# df_test = df.loc[test.instance_names,:].reset_index(drop=True)
df=cd.convert_to_dataframe()[0]
df=df.rename(columns={pa:'S',cd.label_names[0]:'Y'})
df['W'] = cd.instance_weights
for col in var_list+['S','Y']:
    df[col]=df[col].astype('int64')
df=df[var_list+['S','W','Y']]

tv_dist=dict()
for x_name in var_list:
    x_range_single=list(pd.pivot_table(df,index=x_name,values=['W'],observed=False)[('W')].index) 
    dist=rdata_analysis(df,x_range_single,x_name)
    tv_dist[x_name]=sum(abs(dist['x_0']-dist['x_1']))/2
    print(x_name, tv_dist[x_name])
x_list=[]
for key,val in tv_dist.items():
    if val>0.1:
        x_list+=[key]

juv_fel_count 0.03210337325453563
juv_misd_count 0.04323143324022939
juv_other_count 0.021763780679615215
priors_count 0.12622233191661625
age_cat=25 - 45 0.054431947619680315
age_cat=Greater than 45 0.13519019921101838
age_cat=Less than 25 0.08075825159133806
c_charge_degree=F 0.07840757396162046
c_charge_degree=M 0.07840757396162046


In [6]:
methods=['origin','RW','DIremover','LFR'] 
report=pd.DataFrame(columns=['DI of train','DI','f1 macro','f1 micro','f1 weighted','method'])
for ignore in range(10):
    # train val test 4:2:4
    train,test = cd.split([0.4], shuffle=True) 
    valid,test = test.split([0.3], shuffle=True)
    
    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report = pd.concat([report,prepro.assess(method)], ignore_index=True)

report.to_csv('../data/report_preprocess_compas_'+str(pa)+'.csv',index=None)

Disparate Impact of train 0.8402770892329867
Disparate Impact of origin 0.7920019090772499
f1 macro of origin 0.6661448830604686
Disparate Impact of train 1.0000000000000002
Disparate Impact of RW 0.7977096816672113
f1 macro of RW 0.6619634977938227
Disparate Impact of train 0.8402770892329867
Disparate Impact of DIremover 0.8765711943309157
f1 macro of DIremover 0.6694498322659348
Disparate Impact of train 0.9182176686120846
Disparate Impact of LFR 0.9658723272211144
f1 macro of LFR 0.6704781999734366
Disparate Impact of train 0.8414590862964464
Disparate Impact of origin 0.783236210503632
f1 macro of origin 0.661457485134364
Disparate Impact of train 1
Disparate Impact of RW 0.784047014034174
f1 macro of RW 0.6610096760777926
Disparate Impact of train 0.8414590862964464
Disparate Impact of DIremover 0.7808610487655581
f1 macro of DIremover 0.6627885764081127
Disparate Impact of train 0.936318521857122
Disparate Impact of LFR 0.8947374197990362
f1 macro of LFR 0.6612939836228457
Dispa

## Adult

In [7]:
def load_data(data_path,var_list,pa):
    column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation', 'relationship',
                'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'Y']
    na_values=['?']
    pa_dict={'Male':1,'Female':0,'White':1,'Black':0}
    label_dict={'>50K.':1,'>50K':1,'<=50K.':0,'<=50K':0}
    train_path = os.path.join(data_path, 'adult.data')
    test_path = os.path.join(data_path, 'adult.test')
    train = pd.read_csv(train_path, header=None,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    test = pd.read_csv(test_path, header=0,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    messydata = pd.concat([test, train], ignore_index=True)[var_list+[pa,'Y']]
    messydata=messydata.rename(columns={pa:'S'})
    messydata['S']=messydata['S'].replace(pa_dict)
    messydata['Y']=messydata['Y'].replace(label_dict)
    messydata=messydata[(messydata['S']==0)|(messydata['S']==1)]
    for col in var_list+['S','Y']:
        messydata[col]=messydata[col].astype('int64')
    messydata['W']=1
    bins_capitalgain=[100,3500,7500,10000]
    bins_capitalloss=[100,1600,1900,2200]
    bins_age=[26,36,46,56]
    bins_hours=[21,36,46,61]

    messydata=categerise(messydata,'age',bins_age)
    messydata=categerise(messydata,'hours-per-week',bins_hours)
    messydata=categerise(messydata,'capital-gain',bins_capitalgain)
    messydata=categerise(messydata,'capital-loss',bins_capitalloss)
    
    return messydata

def categerise(df,col,bins):
    for i in range(len(bins)+1):
        if i == 0:
            df.loc[df[col] < bins[i], col] = i
        elif i == len(bins):
            df.loc[df[col] >= bins[i-1], col] = i
        else:
            df.loc[(df[col] >= bins[i-1])& (df[col] < bins[i]), col] = i        
    return df

def choose_x(var_list,messydata):
    tv_dist=dict()
    for x_name in var_list:
        x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index) 
        dist=rdata_analysis(messydata,x_range_single,x_name)
        tv_dist[x_name]=sum(abs(dist['x_0']-dist['x_1']))/2
    x_list=[]
    for key,val in tv_dist.items():
        if val>0.1:
            x_list+=[key]  
    return x_list,tv_dist

In [8]:
data_path='..//data//adult'
var_list=['hours-per-week','age','capital-gain','capital-loss','education-num'] #,'education-num'
pa='race'
favorable_label = 1
var_dim=len(var_list)

messydata = load_data(data_path,var_list,pa)
x_list,tv_dist = choose_x(var_list,messydata)
messydata=messydata.rename(columns={'S':pa})
cd=BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=messydata,label_names='Y',protected_attribute_names=[pa])
train,test = cd.split([0.4], shuffle=True) 
valid,test = test.split([0.3], shuffle=True)

In [9]:
para={'x_list':x_list,'Theta':1e-2}
methods=['origin','RW','DIremover','LFR'] 
report=pd.DataFrame(columns=['DI of train','DI','f1 macro','f1 micro','f1 weighted','method'])
for ignore in range(10):
    # train val test 4:2:4
    train,test = cd.split([0.4], shuffle=True) 
    valid,test = test.split([0.3], shuffle=True)
    
    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report = pd.concat([report,prepro.assess(method)], ignore_index=True)

report.to_csv('../data/report_preprocess_adult_'+str(pa)+'.csv',index=None)

Disparate Impact of train 0.4172208918780259
Disparate Impact of origin 0.42191947586702916
f1 macro of origin 0.6824367895587713
Disparate Impact of train 1
Disparate Impact of RW 0.427539518219476
f1 macro of RW 0.6806524556317192
Disparate Impact of train 0.4172208918780259
Disparate Impact of DIremover 0.45325924090779085
f1 macro of DIremover 0.6770051707768875
Disparate Impact of train 0.7957295462799464
Disparate Impact of LFR 0.9057899639263494
f1 macro of LFR 0.6926326573082515
Disparate Impact of train 0.4821497649060907
Disparate Impact of origin 0.43924953095684804
f1 macro of origin 0.6817760750473485
Disparate Impact of train 1
Disparate Impact of RW 0.47618272790139526
f1 macro of RW 0.6775300833076211
Disparate Impact of train 0.4821497649060907
Disparate Impact of DIremover 0.4709819543480859
f1 macro of DIremover 0.6810161760758444
Disparate Impact of train 0.8128851988115321
Disparate Impact of LFR 0.891769203267237
f1 macro of LFR 0.7069633280183247
Disparate Impact

In [10]:
report

Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,0.417221,0.421919,0.682437,0.815829,0.788811,origin
1,1.0,0.42754,0.680652,0.815367,0.787852,RW
2,0.417221,0.453259,0.677005,0.814855,0.786063,DIremover
3,0.79573,0.90579,0.692633,0.810344,0.790942,LFR
4,0.48215,0.43925,0.681776,0.820288,0.792351,origin
5,1.0,0.476183,0.67753,0.818545,0.789842,RW
6,0.48215,0.470982,0.681016,0.819622,0.79176,DIremover
7,0.812885,0.891769,0.706963,0.814496,0.800456,LFR
8,0.490168,0.466148,0.686451,0.817366,0.791185,origin
9,1.0,0.480087,0.684429,0.816649,0.790023,RW


In [11]:
report.to_csv('../data/report_preprocess_compas_'+str(pa)+'_'+str(para['Theta'])+'.csv',index=None)

In [12]:
Baselinepreprocess(train,test).assess('partial',para=para)

ValueError: Grouper for 'W' not 1-dimensional

In [20]:
Baselinepreprocess(train,test).assess('partial',para=para)

['hours-per-week', 'age', 'capital-gain', 'capital-loss', 'education-num', 'W']


ValueError: feature_names, label_names, and protected_attribute_names should match between this and other dataset.

In [None]:
Baselinepreprocess(train,test).assess('LFR')

Disparate Impact of train 0.8456240675896421
Disparate Impact of LFR 0.93713336144422
f1 macro of LFR 0.6988885331213838


Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,0.845624,0.937133,0.698889,0.812548,0.795925,LFR


In [None]:
Baselinepreprocess(train,test).assess('RW')

Disparate Impact of train 0.9999999999999999
Disparate Impact of RW 0.48223483793688754
f1 macro of RW 0.6893390802908452


Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,1.0,0.482235,0.689339,0.818802,0.794531,RW


In [None]:
Baselinepreprocess(train,test).assess('DIremover')

In [None]:
Baselinepreprocess(train,test).assess('origin')