In [1]:
# Append the directory two levels above the current working directory to the
# module search path (presumably so a local checkout of `holisticai` can be
# imported without installing it — confirm against the repository layout).
import sys
sys.path.append('../../')

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from holisticai.bias.metrics import regression_bias_metrics
from holisticai.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Preprocessing

In [3]:
def load_us_crime(return_X_y=False, as_frame=True, test_size=0.2, random_state=None):
  """Load the OpenML ``us_crime`` dataset and build train/test splits.

  The features and target are concatenated, columns with 1000 or more
  missing values are discarded, and rows that still contain a missing value
  are dropped. Group membership is derived from ``racePctWhite > 0.5``;
  columns whose name starts with ``race`` or ``age`` are then removed, as
  are the first three remaining columns. The remaining table (last column
  taken as the target) is standardised and split.

  Parameters
  ----------
  return_X_y : bool, default False
      Forwarded to ``fetch_openml``. Both return shapes (Bunch or
      ``(data, target)`` tuple) are handled; the value does not change
      what this function returns.
  as_frame : bool, default True
      Forwarded to ``fetch_openml``. The processing below relies on pandas
      objects with named columns, so anything other than a frame is
      expected to fail in ``pd.concat``.
  test_size : float, default 0.2
      Forwarded to ``train_test_split``.
  random_state : int or None, default None
      Forwarded to ``train_test_split``; pass an integer to make the split
      reproducible across kernel restarts.

  Returns
  -------
  train_data, test_data : tuple
      Each is ``(X, y, group_a, group_b)``. ``X`` and ``y`` are the
      standardised feature matrix and target column; ``group_a`` is the
      boolean ``racePctWhite > 0.5`` indicator and ``group_b`` its 0/1
      integer complement.
  """
  dataset = fetch_openml(
      name="us_crime",
      return_X_y=return_X_y,
      as_frame=as_frame,
  )
  # With return_X_y=True fetch_openml hands back a (data, target) tuple
  # instead of a Bunch, so key-based access would raise TypeError.
  if return_X_y:
    features, target = dataset
  else:
    features, target = dataset["data"], dataset["target"]

  df = pd.concat([features, target], axis=1)

  # Keep columns with fewer than 1000 missing values, then drop every row
  # that still has a missing entry.
  keep_column = (df.isna().sum(axis=0) < 1000).to_numpy()
  df_clean = df.loc[:, keep_column].dropna()

  # The two groups are complementary by construction, so every row belongs
  # to exactly one of them and no additional row filtering is required.
  group_a = df_clean['racePctWhite'] > 0.5
  group_b = 1 - group_a  # integer 0/1 complement of the boolean indicator

  # Remove the race/age columns so the group attribute is not a feature.
  cols = [c for c in df_clean.columns if not c.startswith(('race', 'age'))]
  # NOTE(review): the first three remaining columns are skipped as well;
  # presumably identifier-like fields — confirm against the dataset schema.
  df_clean = df_clean[cols].iloc[:, 3:]

  # NOTE(review): the scaler is fitted on the full table (target included)
  # before splitting, so test-set statistics influence the training data.
  # Behaviour is kept as-is here; consider fitting on the training split.
  scalar = StandardScaler()
  df_t = scalar.fit_transform(df_clean)
  X = df_t[:, :-1]
  y = df_t[:, -1]

  X_train, X_test, y_train, y_test, group_a_tr, group_a_ts, group_b_tr, group_b_ts = \
      train_test_split(X, y, group_a, group_b,
                       test_size=test_size, random_state=random_state)
  train_data = X_train, y_train, group_a_tr, group_b_tr
  test_data = X_test, y_test, group_a_ts, group_b_ts
  return train_data, test_data

In [4]:
# Build the splits once; each tuple is (X, y, group_a, group_b) and is
# unpacked again by every experiment cell below.
train_data, test_data = load_us_crime()

  warn(


# Baseline

In [5]:
# Baseline: scale the features and fit an ordinary linear regression with no
# bias mitigation step. The metrics from this cell are the reference point
# for the mitigation experiments that follow.
X, y, group_a, group_b = train_data

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])
pipeline.fit(X, y)

# The same names are rebound to the held-out split for evaluation.
X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)

df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies: later cells overwrite `y_pred` and `df`.
y_baseline = y_pred.copy()
df_baseline = df.copy()
df_baseline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.034343,1
Disparate Impact Q80,0.093701,1
Disparate Impact Q50,0.404192,1
Statistical Parity Q50,-0.752925,0
No Disparate Impact Level,-0.76053,-
Average Score Difference,-1.498944,0
Z Score Difference,-2.349888,0
Max Statistical Parity,0.782773,0
Statistical Parity AUC,0.4456,0
RMSE Ratio,0.565407,1


# Grid Search

In [6]:
from holisticai.bias.mitigation import GridSearchReduction

# In-processing mitigation: wrap a linear regression in GridSearchReduction
# configured with the "BoundedGroupLoss" constraint and an absolute loss.
model = LinearRegression()
inprocessing_model = GridSearchReduction(
    constraints="BoundedGroupLoss",
    loss='Absolute',
    min_val=-0.1,
    max_val=1.3,
    grid_size=20,
).transform_estimator(model)

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("bm_inprocessing", inprocessing_model),
])

# Group membership is handed to the pipeline through `bm__`-prefixed keyword
# arguments (presumably routed to the `bm_` step by holisticai's Pipeline —
# confirm against its documentation).
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Evaluate on the held-out split.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)

df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies for the final comparison table.
y_grid_search = y_pred.copy()
df_grid_search = df.copy()
df_grid_search

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.048653,1
Disparate Impact Q80,0.110661,1
Disparate Impact Q50,0.423201,1
Statistical Parity Q50,-0.697789,0
No Disparate Impact Level,-0.823903,-
Average Score Difference,-1.562475,0
Z Score Difference,-2.097624,0
Max Statistical Parity,0.773376,0
Statistical Parity AUC,0.429467,0
RMSE Ratio,0.666666,1


# Exponentiated Gradient

In [7]:
from holisticai.bias.mitigation import ExponentiatedGradientReduction

# In-processing mitigation: wrap a linear regression in
# ExponentiatedGradientReduction with the "BoundedGroupLoss" constraint, an
# absolute loss and an upper bound of 0.001.
model = LinearRegression()
inprocessing_model = ExponentiatedGradientReduction(
    constraints="BoundedGroupLoss",
    loss='Absolute',
    min_val=-0.1,
    max_val=1.3,
    upper_bound=0.001,
).transform_estimator(model)

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("bm_inprocessing", inprocessing_model),
])

# Group membership is supplied via `bm__`-prefixed keyword arguments.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Evaluate on the held-out split.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)

df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies for the final comparison table.
y_exp_grad = y_pred.copy()
df_exp_grad = df.copy()
df_exp_grad

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.041281,1
Disparate Impact Q80,0.093701,1
Disparate Impact Q50,0.413548,1
Statistical Parity Q50,-0.752925,0
No Disparate Impact Level,-0.77547,-
Average Score Difference,-1.545291,0
Z Score Difference,-2.234763,0
Max Statistical Parity,0.782358,0
Statistical Parity AUC,0.438166,0
RMSE Ratio,0.614082,1


In [8]:
from holisticai.bias.mitigation import CorrelationRemover

# Pre-processing mitigation: a CorrelationRemover step sits between the
# scaler and an otherwise unchanged linear regression.
model = LinearRegression()
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("bm_preprocessing", CorrelationRemover()),
    ("model", model),
])

# Group membership is supplied via `bm__`-prefixed keyword arguments.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Evaluate on the held-out split.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)

df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep copies for the final comparison table.
y_correm = y_pred.copy()
df_correm = df.copy()
df_correm

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.194611,1
Disparate Impact Q80,0.226169,1
Disparate Impact Q50,0.465087,1
Statistical Parity Q50,-0.440488,0
No Disparate Impact Level,-0.703893,-
Average Score Difference,-0.918003,0
Z Score Difference,-1.281099,0
Max Statistical Parity,0.539936,0
Statistical Parity AUC,0.324587,0
RMSE Ratio,0.481846,1


In [10]:
# Side-by-side comparison of the four experiments. After concatenation the
# even positions 0, 2, 4, 6 pick one column from each metrics table and
# position 7 picks the final column, which supplies the shared reference.
result = pd.concat(
    [df_baseline, df_grid_search, df_exp_grad, df_correm],
    axis=1,
).iloc[:, [0, 2, 4, 6, 7]]
result.columns = [
    'Baseline',
    'GridSearch',
    "Exp Grad",
    "Corr. Remv.",
    'Reference',
]
result

Unnamed: 0_level_0,Baseline,GridSearch,Exp Grad,Corr. Remv.,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Disparate Impact Q90,0.034343,0.048653,0.041281,0.194611,1
Disparate Impact Q80,0.093701,0.110661,0.093701,0.226169,1
Disparate Impact Q50,0.404192,0.423201,0.413548,0.465087,1
Statistical Parity Q50,-0.752925,-0.697789,-0.752925,-0.440488,0
No Disparate Impact Level,-0.76053,-0.823903,-0.77547,-0.703893,-
Average Score Difference,-1.498944,-1.562475,-1.545291,-0.918003,0
Z Score Difference,-2.349888,-2.097624,-2.234763,-1.281099,0
Max Statistical Parity,0.782773,0.773376,0.782358,0.539936,0
Statistical Parity AUC,0.4456,0.429467,0.438166,0.324587,0
RMSE Ratio,0.565407,0.666666,0.614082,0.481846,1
