In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from fairdata import FairData

In [2]:
df_train_raw = pd.read_csv(
    'data/Adult/adult.data', names=[
        'age',              # continuous.
        'workclass',        # Private, Self-emp-not-inc, Self-emp-inc, Federal-gov Local-gov, State-gov, Without-pay, Never-worked.
        'fnlwgt',           # continuous.
        'education',        # Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
        'education-num',    # continuous.
        'marital-status',   # Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
        'occupation',       # Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
        'relationship',     # Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
        'race',             # White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
        'sex',              # Female, Male.
        'capital-gain',     # continuous.
        'capital-loss',     # continuous.
        'hours-per-week',   # continuous.
        'native-country',   # United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
        'income',           # <=50K, >50K
    ]
)
df_test_raw = pd.read_csv('data/Adult/adult.test', skiprows=1, names=df_train_raw.columns)
df_train_raw[df_train_raw == '?'] = np.nan
df_test_raw[df_test_raw == '?'] = np.nan
df_train_raw

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Impute missing values with mode
for col in ['workclass', 'occupation', 'native-country']:
    mode = df_train_raw[col].mode()[0]
    df_train_raw[col].fillna(mode, inplace=True)
    df_test_raw[col].fillna(mode, inplace=True)
x_train = df_train_raw.drop(['income'], axis=1)
x_test = df_test_raw.drop(['income'], axis=1)
y_train = pd.DataFrame({'income': df_train_raw['income'] == ' >50K'}).astype(int)
y_test = pd.DataFrame({'income': df_test_raw['income'] == ' >50K'}).astype(int)

In [4]:
# Encode categorical variables
categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    x_train[feature] = le.fit_transform(x_train[feature])
    x_test[feature] = le.transform(x_test[feature])

In [5]:
# Feature scaling
a_train = x_train.drop(['race', 'sex'], axis=1)
a_test = x_test.drop(['race', 'sex'], axis=1)
scaler = preprocessing.StandardScaler()
a_train = pd.DataFrame(scaler.fit_transform(a_train), columns=a_train.columns)
a_test = pd.DataFrame(scaler.transform(a_test), columns=a_test.columns)
a_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,native-country
0,0.030671,2.150579,-1.063611,-0.335437,1.134739,0.921634,-1.317809,-0.277805,0.148453,-0.21666,-0.035429,0.291569
1,0.837109,1.463736,-1.008707,-0.335437,1.134739,-0.406212,-0.608387,-0.900181,-0.14592,-0.21666,-2.222153,0.291569
2,-0.042642,0.09005,0.245079,0.181332,-0.42006,-1.734058,-0.135438,-0.277805,-0.14592,-0.21666,-0.035429,0.291569
3,1.057047,0.09005,0.425801,-2.402511,-1.197459,-0.406212,-0.135438,-0.900181,-0.14592,-0.21666,-0.035429,0.291569
4,-0.775768,0.09005,1.408176,-0.335437,1.134739,-0.406212,0.810458,2.211698,-0.14592,-0.21666,-0.035429,-4.054223


In [6]:
# PCA
pca = PCA()
a_train = pd.DataFrame(pca.fit_transform(a_train), columns=a_train.columns)
a_test = pd.DataFrame(pca.transform(a_test), columns=a_test.columns)

In [7]:
# Combine sensitive attributes
s_train = pd.DataFrame({'race-sex':x_train.race * 2 + x_train.sex})
s_test = pd.DataFrame({'race-sex':x_test.race * 2 + x_test.sex})

In [8]:
fairdata_ortho = FairData(s_train, a_train, y_train, preprocess_method='o')
fairdata_mdm = FairData(s_train, a_train, y_train, preprocess_method='m')

In [9]:
fairdata_ortho_eval = fairdata_ortho.evaluate(
    a_test, s_test, y_test, metrics=['aa', 'cf', 'lb', 'ub', 'mae']
)
pd.DataFrame(fairdata_ortho_eval, index=['AA', 'CF', 'LB', 'UB', 'MAE'], columns=['ML', 'FTU', 'FL', 'AA', 'FLAP-1', 'FLAP-2'])

Unnamed: 0,ML,FTU,FL,AA,FLAP-1,FLAP-2
AA,0.303389,0.25741,0.0,0.0,2.775558e-17,0.0
CF,0.277902,0.233775,0.022756,0.026795,0.02801999,0.022756
LB,-0.238096,-0.238541,-0.238296,-0.233885,-0.2433122,-0.238296
UB,0.761186,0.760423,0.759366,0.764423,0.7547639,0.759366
MAE,0.238814,0.239577,0.240634,0.235577,0.2452361,0.240634


In [10]:
fairdata_mdm_eval = fairdata_mdm.evaluate(
    a_test, s_test, y_test, metrics=['aa', 'cf', 'lb', 'ub', 'mae']
)
pd.DataFrame(fairdata_mdm_eval, index=['AA', 'CF', 'LB', 'UB', 'MAE'], columns=['ML', 'FTU', 'FL', 'AA', 'FLAP-1', 'FLAP-2'])

Unnamed: 0,ML,FTU,FL,AA,FLAP-1,FLAP-2
AA,0.303389,0.25741,0.0,0.0,0.017783,0.015902
CF,0.277902,0.233775,0.022756,0.026795,0.001978,0.002265
LB,-0.238096,-0.238541,-0.238296,-0.233885,-0.241299,-0.2376
UB,0.761186,0.760423,0.759366,0.764423,0.756968,0.759915
MAE,0.238814,0.239577,0.240634,0.235577,0.243032,0.240085
