# **Mitigating Bias in a Regression Setting with the holisticai Pipeline**


In [1]:
# Append the directory two levels above the working directory to the module
# search path (presumably so the in-repo holisticai package is importable —
# TODO confirm).
import sys
sys.path.append('../../')

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from holisticai.bias.metrics import regression_bias_metrics
from holisticai.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Data Preprocessing

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from holisticai.datasets import load_us_crime

def load_dataset():
    """Build the train/test splits of the US crime data used by this notebook.

    Steps, in order:

    1. Load the dataset with ``load_us_crime`` and append the target as the
       last column.
    2. Keep only columns with fewer than 1000 missing values, then drop every
       row that still has a missing value.
    3. Derive the protected groups from ``racePctWhite``: ``group_a`` marks
       rows with a value above 0.5 and ``group_b`` is ``1 - group_a``.
    4. Remove every column whose name starts with ``race`` or ``age`` and
       then the first three remaining columns.
    5. Standardise all remaining columns (features and target alike) and
       split 80/20 with ``random_state=42``.

    Returns:
        A pair ``(train_data, test_data)``. Each element is a tuple
        ``(X, y, group_a, group_b)`` where ``X`` holds every standardised
        column but the last, ``y`` is the standardised last column (the
        target), and the two group vectors are aligned row-wise with ``X``.
    """
    dataset = load_us_crime(return_X_y=False, as_frame=True)
    df = pd.concat([dataset["data"], dataset["target"]], axis=1)

    # Positional boolean mask, so the selection does not depend on column
    # labels being unique.
    has_few_missing = df.isna().sum(axis=0).to_numpy() < 1000
    df_clean = df.iloc[:, has_few_missing].dropna()

    # group_b is by construction the complement of group_a, so every row
    # belongs to exactly one group and no extra row filtering is required.
    # `1 - group_a` is kept (rather than `~group_a`) so group_b stays numeric,
    # exactly as before.
    group_a = df_clean['racePctWhite'] > 0.5
    group_b = 1 - group_a

    cols = [c for c in df_clean.columns if not c.startswith(('race', 'age'))]
    # NOTE(review): the first three remaining columns are dropped by position;
    # presumably they are non-predictive identifiers — confirm against the
    # dataset schema.
    df_clean = df_clean[cols].iloc[:, 3:]

    # NOTE(review): the scaler is fit on every row before the split, so the
    # held-out rows contribute to the scaling statistics of both X and y.
    # Left unchanged so the outputs recorded in this notebook stay reproducible.
    scalar = StandardScaler()
    df_t = scalar.fit_transform(df_clean)
    X = df_t[:, :-1]
    y = df_t[:, -1]  # the target was concatenated last, so it is the last column

    (X_train, X_test,
     y_train, y_test,
     group_a_tr, group_a_ts,
     group_b_tr, group_b_ts) = train_test_split(
        X, y, group_a, group_b, test_size=0.2, random_state=42
    )
    train_data = X_train, y_train, group_a_tr, group_b_tr
    test_data = X_test, y_test, group_a_ts, group_b_ts
    return train_data, test_data

# Build the splits once; each experiment cell unpacks these same two tuples
# as (X, y, group_a, group_b).
train_data, test_data = load_dataset()

## Baseline

In [4]:
# Baseline: feature scaling followed by a plain linear regression, with no
# bias-mitigation step in the pipeline.
pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("model", LinearRegression()),
])

# Fit on the training split.
X, y, group_a, group_b = train_data
pipeline.fit(X, y)

# Predict on the test split and compute the bias metrics for the two groups.
X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under baseline-specific names.
y_baseline = y_pred.copy()
df_baseline = df.copy()
df_baseline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.016953,1
Disparate Impact Q80,0.100673,1
Disparate Impact Q50,0.424518,1
Statistical Parity Q50,-0.703821,0
No Disparate Impact Level,-0.825434,-
Average Score Difference,-1.492622,0
Z Score Difference,-2.465747,0
Max Statistical Parity,0.768248,0
Statistical Parity AUC,0.439341,0
RMSE Ratio,0.651463,1


## Grid Search

In [5]:
from holisticai.bias.mitigation import GridSearchReduction

# In-processing mitigation: wrap a linear regressor with GridSearchReduction
# and use the wrapped estimator as the final pipeline step.
model = LinearRegression()
inprocessing_model = GridSearchReduction(
    constraints="BoundedGroupLoss",
    loss='Square',
    min_val=-0.1,
    max_val=1.3,
    grid_size=20,
).transform_estimator(model)

pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("bm_inprocessing", inprocessing_model),
])

# Training split: group membership is handed to the pipeline under the
# "bm__" keys.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Test split: predict with the test-set groups, then compute the bias metrics.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under method-specific names.
y_grid_search = y_pred.copy()
df_grid_search = df.copy()
df_grid_search

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.036898,1
Disparate Impact Q80,0.112587,1
Disparate Impact Q50,0.424518,1
Statistical Parity Q50,-0.668775,0
No Disparate Impact Level,-0.850086,-
Average Score Difference,-1.480181,0
Z Score Difference,-2.234824,0
Max Statistical Parity,0.704611,0
Statistical Parity AUC,0.42287,0
RMSE Ratio,0.67644,1


## Exponentiated Gradient

In [6]:
from holisticai.bias.mitigation import ExponentiatedGradientReduction

# In-processing mitigation: wrap a linear regressor with
# ExponentiatedGradientReduction and use it as the final pipeline step.
model = LinearRegression()
inprocessing_model = ExponentiatedGradientReduction(
    constraints="BoundedGroupLoss",
    loss='Square',
    min_val=-0.1,
    max_val=1.3,
    upper_bound=0.001,
).transform_estimator(model)

pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("bm_inprocessing", inprocessing_model),
])

# Training split: group membership is handed to the pipeline under the
# "bm__" keys.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Test split: predict with the test-set groups, then compute the bias metrics.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under method-specific names.
y_exp_grad = y_pred.copy()
df_exp_grad = df.copy()
df_exp_grad

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.052273,1
Disparate Impact Q80,0.139394,1
Disparate Impact Q50,0.434266,1
Statistical Parity Q50,-0.598682,0
No Disparate Impact Level,-0.918579,-
Average Score Difference,-1.437193,0
Z Score Difference,-1.870203,0
Max Statistical Parity,0.63307,0
Statistical Parity AUC,0.390744,0
RMSE Ratio,0.745835,1


## Correlation Remover

In [7]:
from holisticai.bias.mitigation import CorrelationRemover

# Pre-processing mitigation: CorrelationRemover sits between the scaler and
# the linear regressor.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("bm_preprocessing", CorrelationRemover()),
    ("model", model),
])

# Training split: group membership is handed to the pipeline under the
# "bm__" keys.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Test split: predict with the test-set groups, then compute the bias metrics.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under method-specific names.
y_correm = y_pred.copy()
df_correm = df.copy()
df_correm

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.209091,1
Disparate Impact Q80,0.219814,1
Disparate Impact Q50,0.499692,1
Statistical Parity Q50,-0.440975,0
No Disparate Impact Level,-0.763841,-
Average Score Difference,-0.943252,0
Z Score Difference,-1.388035,0
Max Statistical Parity,0.529644,0
Statistical Parity AUC,0.305583,0
RMSE Ratio,0.565712,1


## Wasserstein Barycenters

In [8]:
from holisticai.bias.mitigation import WassersteinBarycenter

# Post-processing mitigation: WassersteinBarycenter is placed after the
# linear regressor in the pipeline.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("model", model),
    ("bm_postprocessing", WassersteinBarycenter()),
])

# Training split: group membership is handed to the pipeline under the
# "bm__" keys.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Test split: predict with the test-set groups, then compute the bias metrics.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under method-specific names.
y_wb = y_pred.copy()
df_wb = df.copy()
df_wb

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.985714,1
Disparate Impact Q80,0.906061,1
Disparate Impact Q50,0.86317,1
Statistical Parity Q50,-0.020422,0
No Disparate Impact Level,1.021537,-
Average Score Difference,-0.063446,0
Z Score Difference,-0.103833,0
Max Statistical Parity,0.112516,0
Statistical Parity AUC,0.042851,0
RMSE Ratio,0.391934,1


In [11]:
from holisticai.bias.mitigation import PluginEstimationAndCalibration

# Post-processing mitigation: PluginEstimationAndCalibration is placed after
# the linear regressor in the pipeline.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ("model", model),
    ("bm_postprocessing", PluginEstimationAndCalibration()),
])

# Training split: group membership is handed to the pipeline under the
# "bm__" keys.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Test split: predict with the test-set groups, then compute the bias metrics.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies of the predictions and metrics under method-specific names.
y_pec = y_pred.copy()
df_pec = df.copy()
df_pec

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.743434,1
Disparate Impact Q80,0.985714,1
Disparate Impact Q50,1.0,1
Statistical Parity Q50,-0.002899,0
No Disparate Impact Level,0.740741,-
Average Score Difference,0.032142,0
Z Score Difference,0.102848,0
Max Statistical Parity,0.28195,0
Statistical Parity AUC,0.047021,0
RMSE Ratio,0.516212,1


In [12]:
# Side-by-side comparison of every method. The per-method frames are joined
# column-wise; the even positions 0..10 and the final position 11 are kept
# and relabelled with one column per method plus 'Reference'.
metric_frames = [df_baseline, df_grid_search, df_exp_grad, df_correm, df_wb, df_pec]
keep_positions = [*range(0, 11, 2), 11]
result = pd.concat(metric_frames, axis=1).iloc[:, keep_positions]
result.columns = [
    'Baseline',
    'GridSearch',
    "Exp Grad",
    "Corr. Remv.",
    "WassersteinBarycenter",
    "PluginEstimationAndCalibration",
    'Reference',
]
result

Unnamed: 0_level_0,Baseline,GridSearch,Exp Grad,Corr. Remv.,WassersteinBarycenter,PluginEstimationAndCalibration,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Disparate Impact Q90,0.016953,0.036898,0.052273,0.209091,0.985714,0.743434,1
Disparate Impact Q80,0.100673,0.112587,0.139394,0.219814,0.906061,0.985714,1
Disparate Impact Q50,0.424518,0.424518,0.434266,0.499692,0.86317,1.0,1
Statistical Parity Q50,-0.703821,-0.668775,-0.598682,-0.440975,-0.020422,-0.002899,0
No Disparate Impact Level,-0.825434,-0.850086,-0.918579,-0.763841,1.021537,0.740741,-
Average Score Difference,-1.492622,-1.480181,-1.437193,-0.943252,-0.063446,0.032142,0
Z Score Difference,-2.465747,-2.234824,-1.870203,-1.388035,-0.103833,0.102848,0
Max Statistical Parity,0.768248,0.704611,0.63307,0.529644,0.112516,0.28195,0
Statistical Parity AUC,0.439341,0.42287,0.390744,0.305583,0.042851,0.047021,0
RMSE Ratio,0.651463,0.67644,0.745835,0.565712,0.391934,0.516212,1
