In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from fairdata import FairData
import folktables

In [2]:
data_source = folktables.ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
df = data_source.get_data(states=["CA"], download=True)
features = ['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX', 'RAC1P']
target = 'PINCP'
df = folktables.adult_filter(df)
df = df[features + [target]]
df[target] = (df[target] > 50000).astype(int)

In [3]:
# Encode categorical variables
categorical = ['COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'SEX', 'RAC1P']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
df

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,30,5,13,0,517,4,16,40.0,0,7,0
6,21,3,15,4,115,4,17,20.0,0,0,0
7,65,1,21,4,130,4,17,8.0,0,0,0
10,33,0,13,2,517,32,16,40.0,0,0,0
13,18,1,18,4,62,4,17,18.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
378801,38,0,21,0,62,109,0,40.0,0,5,1
378802,39,0,21,0,62,109,1,40.0,1,5,1
378807,61,0,18,0,325,13,0,45.0,0,0,1
378811,69,6,23,0,130,107,0,45.0,0,5,0


In [4]:
a = df.drop(['SEX', 'RAC1P', 'PINCP'], axis=1)
s = df[['SEX', 'RAC1P']]
y = df[['PINCP']]
a_train, a_test, s_train, s_test, y_train, y_test = train_test_split(
    a, s, y, test_size=0.2, random_state=0)

In [5]:
# Feature scaling
scaler = preprocessing.StandardScaler()
a_train = pd.DataFrame(scaler.fit_transform(a_train), columns=a_train.columns)
a_test = pd.DataFrame(scaler.transform(a_test), columns=a_test.columns)
a_train.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP
0,0.218604,-0.076323,0.895725,-0.895674,-0.906178,-0.772527,-0.564248,0.547399
1,0.151449,-0.605931,0.642048,0.187269,0.437733,-0.687528,2.365575,0.930903
2,-0.184328,-0.605931,0.642048,1.270213,-1.203324,1.097451,0.562607,0.163895
3,1.091623,-0.605931,1.403079,-0.895674,-1.493717,1.012452,-0.564248,0.163895
4,-1.325968,-0.605931,0.134694,1.270213,-0.203833,-0.772527,1.914833,0.163895


In [6]:
# PCA
pca = PCA()
a_train = pd.DataFrame(pca.fit_transform(a_train), columns=a_train.columns)
a_test = pd.DataFrame(pca.transform(a_test), columns=a_test.columns)

In [7]:
# Combine sensitive attributes
s_train = pd.DataFrame({'RACE_SEX':s_train.RAC1P * 2 + s_train.SEX})
s_test = pd.DataFrame({'RACE_SEX':s_test.RAC1P * 2 + s_test.SEX})

In [8]:
fairdata_ortho = FairData(s_train, a_train, y_train, preprocess_method='o')
fairdata_mdm = FairData(s_train, a_train, y_train, preprocess_method='m')



In [27]:
fairdata_mdm_eval = fairdata_mdm.evaluate(
    a_test, s_test, y_test, metrics=['cfb', 'cfbm', 'mae'], p_range=0, b=1
)
pd.DataFrame(fairdata_mdm_eval, index=['CFB', 'CFBM', 'MAE'], columns=['ML', 'FTU', 'FL', 'AA', 'FLAP-1', 'FLAP-2'])

Unnamed: 0,ML,FTU,FL,AA,FLAP-1,FLAP-2
CFB,0.997793,0.862082,0.893325,0.896558,0.647318,0.656356
CFBM,0.997793,0.862082,0.893325,0.896558,0.647318,0.656356
MAE,0.298178,0.305165,0.323217,0.32375,0.327116,0.325726


In [28]:
fairdata_mdm_eval = fairdata_mdm.evaluate(
    a_test, s_test, y_test, metrics=['cfb', 'cfbm', 'mae'], p_range=0.025, b=25
)
pd.DataFrame(fairdata_mdm_eval, index=['CFB', 'CFBM', 'MAE'], columns=['ML', 'FTU', 'FL', 'AA', 'FLAP-1', 'FLAP-2'])

Unnamed: 0,ML,FTU,FL,AA,FLAP-1,FLAP-2
CFB,0.997793,0.862082,0.893325,0.896558,0.748182,0.757392
CFBM,0.997793,0.783978,0.860876,0.86507,0.573927,0.579623
MAE,0.298178,0.305165,0.323217,0.32375,0.327116,0.325726


In [10]:
fairdata_mdm_eval = fairdata_mdm.evaluate(
    a_test, s_test, y_test, metrics=['cf', 'cfbm', 'acc'], p_range=0.05, b=50
)
pd.DataFrame(fairdata_mdm_eval, index=['CF', 'CFBM', 'ACC'], columns=['ML', 'FTU', 'FL', 'AA', 'FLAP-1', 'FLAP-2'])

Unnamed: 0,ML,FTU,FL,AA,FLAP-1,FLAP-2
CF,0.523789,0.23761,0.136297,0.13853,0.031211,0.028565
CFBM,0.997793,0.782205,0.860086,0.86372,0.573927,0.579623
ACC,0.701822,0.694835,0.676783,0.67625,0.672884,0.674274


In [9]:
fairdata_ortho_eval = fairdata_ortho.evaluate(
    a_test, s_test, y_test, metrics=['cf', 'cfbm', 'acc'], methods=['FLAP-1', 'FLAP-2'], p_range=0.05, b=50
)
pd.DataFrame(fairdata_ortho_eval, index=['CF', 'CFBM', 'ACC'], columns=['FLAP-1', 'FLAP-2'])

Unnamed: 0,FLAP-1,FLAP-2
CF,0.137289,0.136297
CFBM,0.854256,0.856019
ACC,0.674489,0.676783


In [12]:
res = np.concatenate([np.array(fairdata_mdm_eval), np.array(fairdata_ortho_eval)], axis=1)
res[:, [0,1,2,3,6,7,4,5]].round(4)

array([[0.5238, 0.2376, 0.1363, 0.1385, 0.1373, 0.1363, 0.0312, 0.0286],
       [0.9978, 0.7822, 0.8601, 0.8637, 0.8543, 0.856 , 0.5739, 0.5796],
       [0.7018, 0.6948, 0.6768, 0.6763, 0.6745, 0.6768, 0.6729, 0.6743]])