In [2]:
# Extend the module search path with the directory two levels above the
# current working directory (the path is relative, so it depends on where
# the kernel was started).
# NOTE(review): presumably this makes a local `holisticai` checkout importable
# without installing it — confirm.
import sys
sys.path.append('../../')

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from holisticai.bias.metrics import regression_bias_metrics
from holisticai.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Preprocessing

In [4]:
def load_us_crime(return_X_y=False, as_frame=True, random_state=None):
  """Load the OpenML ``us_crime`` dataset and split it for the bias experiments.

  Steps: fetch the data, keep columns with fewer than 1000 missing values,
  drop rows that still contain a missing value, derive the two group
  vectors from ``racePctWhite``, remove the ``race*``/``age*`` columns,
  standardise everything, and make an 80/20 train/test split.

  Args:
    return_X_y: Forwarded to ``fetch_openml``. Both the Bunch result and the
      ``(data, target)`` tuple result are handled; the value returned by
      this function is the same either way.
    as_frame: Forwarded to ``fetch_openml``. The loader selects columns by
      name, so the fetched data must be a pandas DataFrame.
    random_state: Forwarded to ``train_test_split``. The default ``None``
      keeps the split unseeded; pass an int for a reproducible split.

  Returns:
    ``(train_data, test_data)`` where each element is the tuple
    ``(X, y, group_a, group_b)``. ``X`` and ``y`` are NumPy arrays taken from
    the standardised table (``y`` is its last column). ``group_a`` is a
    boolean Series that is True where ``racePctWhite > 0.5``; ``group_b`` is
    its 0/1 integer complement.

  Raises:
    TypeError: If ``fetch_openml`` did not return a pandas DataFrame.
  """
  dataset = fetch_openml(
      name="us_crime",
      return_X_y=return_X_y,
      as_frame=as_frame,
  )
  # fetch_openml returns a (data, target) tuple when return_X_y is true and a
  # Bunch otherwise; indexing the tuple with "data" used to raise.
  if return_X_y:
    data, target = dataset
  else:
    data, target = dataset["data"], dataset["target"]
  if not isinstance(data, pd.DataFrame):
    raise TypeError(
        "load_us_crime needs pandas output from fetch_openml; "
        "call it with as_frame=True"
    )

  df = pd.concat([data, target], axis=1)

  # Keep only columns with fewer than 1000 missing values, then drop every
  # row that still has a gap. The mask is positional (NumPy) so the selection
  # matches the original index-based one even if labels repeat.
  sparse_enough = (df.isna().sum(axis=0) < 1000).to_numpy()
  df_clean = df.loc[:, sparse_enough].dropna()

  # Group vectors come from a single column, so group_b is simply the
  # complement of group_a (kept as 0/1 integers, as before). Because the two
  # groups are exact complements, the former XOR row filter selected every
  # row and has been removed.
  group_a = df_clean['racePctWhite'] > 0.5
  group_b = 1 - group_a

  # Exclude every race*/age* column from the table, then skip the first
  # three remaining columns.
  # NOTE(review): those three are presumably identifier-like columns rather
  # than features — confirm against the dataset description.
  cols = [c for c in df_clean.columns if not c.startswith(('race', 'age'))]
  df_clean = df_clean[cols].iloc[:, 3:]

  # NOTE(review): the scaler is fitted on all rows before the split and also
  # standardises the target (last column), so test-set statistics leak into
  # the training data. Left unchanged to keep the numeric behaviour.
  scaler = StandardScaler()
  df_t = scaler.fit_transform(df_clean)
  X = df_t[:, :-1]
  y = df_t[:, -1]

  X_train, X_test, y_train, y_test, group_a_tr, group_a_ts, group_b_tr, group_b_ts = \
      train_test_split(X, y, group_a, group_b, test_size=0.2,
                       random_state=random_state)
  train_data = X_train, y_train, group_a_tr, group_b_tr
  test_data = X_test, y_test, group_a_ts, group_b_ts
  return train_data, test_data

In [5]:
# Build the two splits once; each is the tuple (X, y, group_a, group_b).
# NOTE: load_us_crime() draws an unseeded random split here, so the split and
# every metric computed from it change from run to run.
train_data, test_data = load_us_crime()

  warn(


# Baseline

In [6]:
# Reference model without any bias mitigation: a scaler followed by a plain
# linear regression.
pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", LinearRegression())])

# Fit on the training split only.
X, y, group_a, group_b = train_data
pipeline.fit(X, y)

# Rebind the working names to the held-out split and score it.
X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Snapshot the results under dedicated names, because `y_pred` and `df` are
# reassigned by the next experiment.
y_baseline, df_baseline = y_pred.copy(), df.copy()
df_baseline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.016363,1
Disparate Impact Q80,0.102807,1
Disparate Impact Q50,0.40973,1
Statistical Parity Q50,-0.709719,0
No Disparate Impact Level,-0.773619,-
Average Score Difference,-1.497405,0
Z Score Difference,-2.605345,0
Max Statistical Parity,0.747662,0
Statistical Parity AUC,0.443752,0
RMSE Ratio,0.595058,1


# GridSearchReduction

In [7]:
from holisticai.bias.mitigation import GridSearchReduction

# In-processing mitigation: wrap a linear regressor with GridSearchReduction
# using the "BoundedGroupLoss" constraint and the 'Absolute' loss.
model = LinearRegression()
inprocessing_model = GridSearchReduction(
    constraints="BoundedGroupLoss",
    loss='Absolute',
    min_val=-0.1,
    max_val=1.3,
    grid_size=50,
).transform_estimator(model)

pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("bm_inprocessing", inprocessing_model),
])

# Train on the training split, passing the group vectors alongside X and y.
# NOTE(review): the "bm__" prefix presumably routes these arguments to the
# "bm_inprocessing" step — confirm against the holisticai Pipeline docs.
X, y, group_a, group_b = train_data
fit_params = dict(bm__group_a=group_a, bm__group_b=group_b)
pipeline.fit(X, y, **fit_params)

# Score the held-out split with the same metric set as the baseline.
X, y, group_a, group_b = test_data
predict_params = dict(bm__group_a=group_a, bm__group_b=group_b)
y_pred = pipeline.predict(X, **predict_params)
df = regression_bias_metrics(group_a, group_b, y_pred, y, metric_type='both')

# Keep dedicated copies for the comparison table.
y_grid_search, df_grid_search = y_pred.copy(), df.copy()
df_grid_search

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Disparate Impact Q90,0.067269,1
Disparate Impact Q80,0.127674,1
Disparate Impact Q50,0.42884,1
Statistical Parity Q50,-0.63797,0
No Disparate Impact Level,-0.889079,-
Average Score Difference,-1.504871,0
Z Score Difference,-2.046602,0
Max Statistical Parity,0.667011,0
Statistical Parity AUC,0.401659,0
RMSE Ratio,0.700248,1


In [14]:
# Side-by-side comparison. After the column-wise concat the positions are
# 0 = baseline Value, 1 = baseline Reference, 2 = grid-search Value,
# 3 = grid-search Reference; one Reference column is enough.
result = pd.concat([df_baseline, df_grid_search], axis=1)
result = result.iloc[:, [0, 2, 3]]
result.columns = ['Baseline', 'GridSearch', 'Reference']
result

Unnamed: 0_level_0,Baseline,GridSearch,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Disparate Impact Q90,0.016363,0.067269,1
Disparate Impact Q80,0.102807,0.127674,1
Disparate Impact Q50,0.40973,0.42884,1
Statistical Parity Q50,-0.709719,-0.63797,0
No Disparate Impact Level,-0.773619,-0.889079,-
Average Score Difference,-1.497405,-1.504871,0
Z Score Difference,-2.605345,-2.046602,0
Max Statistical Parity,0.747662,0.667011,0
Statistical Parity AUC,0.443752,0.401659,0
RMSE Ratio,0.595058,0.700248,1
