# Training Model for Binary Classification Problem with Bias Mitigators 

In [16]:
# sys path
import sys
sys.path.append('../../')

In [17]:
# Imports
from sklearn.model_selection import train_test_split
from holisticai.pipeline import Pipeline
from holisticai.datasets import load_adult
from holisticai.bias.metrics import classification_bias_metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Settings
np.random.seed(0)
import warnings
warnings.filterwarnings("ignore")

In [19]:
# Dataset
data = load_adult()

# Dataframe
df = pd.concat([data["data"], data["target"]], axis=1)
protected_variables = ["sex", "race"]
output_variable = ["class"]

# Simple preprocessing
y = df[output_variable].replace({">50K": 1, "<=50K": 0})
X = pd.get_dummies(df.drop(protected_variables + output_variable, axis=1))
group = ["sex"]
group_a = df[group] == "Female"
group_b = df[group] == "Male"
data_ = [X, y, group_a, group_b]

# Train test split
dataset = train_test_split(*data_, test_size=0.2, shuffle=True)
train_data = dataset[::2]
test_data = dataset[1::2]

In [20]:
# the dataframe
data['frame']

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
48838,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
48839,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
48840,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [18]:
# efficacy metrics from sklearn
from sklearn import metrics

# dictionnary of metrics
metrics_dict={
        "Accuracy": metrics.accuracy_score,
        "Balanced accuracy": metrics.balanced_accuracy_score,
        "Precision": metrics.precision_score,
        "Recall": metrics.recall_score,
        "F1-Score": metrics.f1_score}

# efficacy metrics dataframe helper tool
def metrics_dataframe(y_pred, y_true, metrics_dict=metrics_dict):
    metric_list = [[pf, fn(y_true, y_pred)] for pf, fn in metrics_dict.items()]
    return pd.DataFrame(metric_list, columns=["Metric", "Value"]).set_index("Metric")

# Baseline

In [21]:
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ])

X, y, group_a, group_b = train_data
pipeline.fit(X, y)

X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)
df_baseline = classification_bias_metrics(group_b.to_numpy().ravel(), 
                            group_a.to_numpy().ravel(), 
                            y_pred.ravel(), 
                            y.to_numpy().ravel(), metric_type='both')

df_eff_baseline = metrics_dataframe(y, y_pred)

In [22]:
df_baseline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.178353,0
Disparate Impact,3.235544,1
Four Fifths Rule,0.309067,1
Cohen D,0.457755,0
Equality of Opportunity Difference,0.059581,0
False Positive Rate Difference,0.082695,0
Average Odds Difference,0.071138,0
Accuracy Difference,-0.122388,0


In [23]:
df_eff_baseline

Unnamed: 0_level_0,Value
Metric,Unnamed: 1_level_1
Accuracy,0.850241
Balanced accuracy,0.804536
Precision,0.600681
Recall,0.728822
F1-Score,0.658576


# Utils

In [24]:
def fit_and_evaluate_pipeline(pipeline, data_cls=None):
    
    X, y, group_a, group_b = train_data
    fit_params = {
        'bm__group_a': group_a,
        'bm__group_b': group_b
    }
    pipeline.fit(X, y, **fit_params)
    
    X, y, group_a, group_b = test_data
    predict_params = {
        'bm__group_a': group_a,
        'bm__group_b': group_b,
    }
    y_pred = pipeline.predict(X, **predict_params)
    
    df = classification_bias_metrics(group_b.to_numpy().ravel(), 
                                group_a.to_numpy().ravel(), 
                                y_pred.ravel(), 
                                y.to_numpy().ravel(), metric_type='both')
    df_eff = metrics_dataframe(y, y_pred)
    return df,df_eff

def format_result_colum(name,df):
    return df.rename(columns={'Value':name}).iloc[:,0]

def show_bias_result_table(configurations, df_baseline):
    table = pd.concat([df_baseline.iloc[:,0]] + [format_result_colum(name,config['result']['bias']) 
            for name,config in configurations.items()] + [df_baseline.iloc[:,1]],axis=1)
    return table.rename(columns={'Value':'Baseline'})

def show_efficacy_result_table(configurations, df_baseline):
    table = pd.concat([df_baseline.iloc[:,0]] + [format_result_colum(name,config['result']['efficacy']) 
            for name,config in configurations.items()],axis=1)
    return table.rename(columns={'Value':'Baseline'})

# Pre-processing

In [25]:
from collections import defaultdict
configurations = defaultdict(dict)

from holisticai.bias.mitigation import Reweighing
configurations['Reweighing']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('bm_preprocessing', Reweighing()),
    ('classifier', LogisticRegression()),
    ])

from holisticai.bias.mitigation import LearningFairRepresentation
configurations['Learning Fair Representation']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('bm_preprocessing', LearningFairRepresentation(k=10, Ax=0.1, Ay=1.0, Az=2.0, verbose=1, print_interval=250)),
    ('classifier', LogisticRegression()),
    ])

### Run Configurations

In [26]:
for config_name,config in configurations.items():
    bias_metrics,eff_metrics = fit_and_evaluate_pipeline(config['pipeline'])
    config['result'] = {'bias':bias_metrics, 'efficacy':eff_metrics}

step: 250	loss: 0.8824	L_x: 2.5230	L_y: 0.6244	L_z: 0.0029
step: 500	loss: 0.8824	L_x: 2.5230	L_y: 0.6244	L_z: 0.0029
step: 750	loss: 0.8824	L_x: 2.5230	L_y: 0.6244	L_z: 0.0029
step: 1000	loss: 0.8284	L_x: 2.5220	L_y: 0.5707	L_z: 0.0028
step: 1250	loss: 0.8284	L_x: 2.5220	L_y: 0.5707	L_z: 0.0028
step: 1500	loss: 0.8284	L_x: 2.5220	L_y: 0.5707	L_z: 0.0028
step: 1750	loss: 0.8284	L_x: 2.5220	L_y: 0.5707	L_z: 0.0028
step: 2000	loss: 0.7961	L_x: 2.5188	L_y: 0.5389	L_z: 0.0026
step: 2250	loss: 0.7961	L_x: 2.5188	L_y: 0.5389	L_z: 0.0026
step: 2500	loss: 0.7961	L_x: 2.5188	L_y: 0.5389	L_z: 0.0026
step: 2750	loss: 0.7961	L_x: 2.5188	L_y: 0.5389	L_z: 0.0026
step: 3000	loss: 0.7943	L_x: 2.5173	L_y: 0.5375	L_z: 0.0026
step: 3250	loss: 0.7943	L_x: 2.5173	L_y: 0.5375	L_z: 0.0026
step: 3500	loss: 0.7943	L_x: 2.5173	L_y: 0.5375	L_z: 0.0026
step: 3750	loss: 0.7943	L_x: 2.5173	L_y: 0.5375	L_z: 0.0026
step: 4000	loss: 0.7724	L_x: 2.4854	L_y: 0.5189	L_z: 0.0025
step: 4250	loss: 0.7724	L_x: 2.4854	L_y: 0.

In [27]:
show_bias_result_table(configurations, df_baseline)

Unnamed: 0_level_0,Baseline,Reweighing,Learning Fair Representation,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Statistical Parity,0.178353,0.096208,0.075485,0
Disparate Impact,3.235544,1.805986,1.735582,1
Four Fifths Rule,0.309067,0.553714,0.576175,1
Cohen D,0.457755,0.250423,0.210877,0
Equality of Opportunity Difference,0.059581,-0.165055,0.023775,0
False Positive Rate Difference,0.082695,0.030316,0.008878,0
Average Odds Difference,0.071138,-0.06737,0.016327,0
Accuracy Difference,-0.122388,-0.121556,-0.104798,0


In [28]:
show_efficacy_result_table(configurations, df_eff_baseline)

Unnamed: 0_level_0,Baseline,Reweighing,Learning Fair Representation
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Accuracy,0.850241,0.843484,0.802539
Balanced accuracy,0.804536,0.799103,0.736226
Precision,0.600681,0.555556,0.406982
Recall,0.728822,0.72905,0.640751
F1-Score,0.658576,0.630587,0.497787


# Post Processing

In [29]:
from collections import defaultdict
configurations = defaultdict(dict)

from holisticai.bias.mitigation import EqualizedOdds
configurations['Equalized Odds']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ('bm_postprocessing', EqualizedOdds()),
    ])

from holisticai.bias.mitigation import CalibratedEqualizedOdds
configurations['Calibrated Equalized Odds']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ('bm_postprocessing', CalibratedEqualizedOdds()),
    ])

from holisticai.bias.mitigation import RejectOptionClassification
configurations['Reject Option Classification']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ('bm_postprocessing', RejectOptionClassification(metric_name="Statistical parity difference", verbose=1)),
    ])

### Run Configurations

In [30]:
for config_name,config in configurations.items():
    bias_metrics,eff_metrics = fit_and_evaluate_pipeline(config['pipeline'])
    config['result'] = {'bias':bias_metrics, 'efficacy':eff_metrics}

Progress: 100/100

In [31]:
show_bias_result_table(configurations, df_baseline)

Unnamed: 0_level_0,Baseline,Equalized Odds,Calibrated Equalized Odds,Reject Option Classification,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Statistical Parity,0.178353,0.095813,0.108962,0.059864,0
Disparate Impact,3.235544,1.70078,2.365774,1.182539,1
Four Fifths Rule,0.309067,0.587965,0.422695,0.845638,1
Cohen D,0.457755,0.24093,0.306568,0.124368,0
Equality of Opportunity Difference,0.059581,-0.020744,-0.109945,-0.148833,0
False Positive Rate Difference,0.082695,0.010227,0.057379,-0.038114,0
Average Odds Difference,0.071138,-0.005258,-0.026283,-0.093473,0
Accuracy Difference,-0.122388,-0.080495,-0.156621,0.016622,0


In [32]:
show_efficacy_result_table(configurations, df_eff_baseline)

Unnamed: 0_level_0,Baseline,Equalized Odds,Calibrated Equalized Odds,Reject Option Classification
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accuracy,0.850241,0.82598,0.827516,0.779609
Balanced accuracy,0.804536,0.765959,0.784805,0.726811
Precision,0.600681,0.554704,0.457642,0.806301
Recall,0.728822,0.665815,0.723419,0.527283
F1-Score,0.658576,0.605202,0.560626,0.637603


# Inprocessing

In [33]:
from collections import defaultdict
configurations = defaultdict(dict)

from holisticai.bias.mitigation import GridSearchReduction

model = LogisticRegression()
inprocessing_model = GridSearchReduction(constraints="DemographicParity", grid_size=20, verbose=1).transform_estimator(model)

configurations['GridSearch Reduction']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('bm_inprocessing', inprocessing_model),
    ])

from holisticai.bias.mitigation import ExponentiatedGradientReduction

model = LogisticRegression()
inprocessing_model = ExponentiatedGradientReduction(constraints="DemographicParity", verbose=1).transform_estimator(model)

configurations['ExponentiatedGradient Reduction']['pipeline'] = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('bm_inprocessing', inprocessing_model),
    ])

### Run Configurations

In [34]:
for config_name,config in configurations.items():
    bias_metrics,eff_metrics = fit_and_evaluate_pipeline(config['pipeline'])
    config['result'] = {'bias':bias_metrics, 'efficacy':eff_metrics}

steps: 5	Best gap:6.41039

In [35]:
show_bias_result_table(configurations, df_baseline)

Unnamed: 0_level_0,Baseline,GridSearch Reduction,ExponentiatedGradient Reduction,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Statistical Parity,0.178353,0.005191,0.010549,0
Disparate Impact,3.235544,1.034025,1.070555,1
Four Fifths Rule,0.309067,0.967095,0.934095,1
Cohen D,0.457755,0.014306,0.029036,0
Equality of Opportunity Difference,0.059581,-0.341247,-0.324293,0
False Positive Rate Difference,0.082695,-0.032478,-0.029755,0
Average Odds Difference,0.071138,-0.186862,-0.177024,0
Accuracy Difference,-0.122388,-0.113938,-0.113159,0


In [36]:
show_efficacy_result_table(configurations, df_eff_baseline)

Unnamed: 0_level_0,Baseline,GridSearch Reduction,ExponentiatedGradient Reduction
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Accuracy,0.850241,0.829358,0.830484
Balanced accuracy,0.804536,0.786316,0.788188
Precision,0.600681,0.469562,0.472967
Recall,0.728822,0.723753,0.726619
F1-Score,0.658576,0.569584,0.572976
