## Package Installation

First, install the `holisticai` package if you haven't already:
```bash
pip install "holisticai[all]"
```
Then, import the necessary libraries.

In [1]:
import warnings

import pandas as pd
from holisticai.bias.metrics import multiclass_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

## Data Loading

In [2]:
# Load the 'us_crime_multiclass' dataset and split it with test_size=0.2;
# random_state is fixed so the split can be reproduced.
dataset = load_dataset('us_crime_multiclass')
train_test = dataset.train_test_split(test_size=0.2, random_state=42)

# Unpack the two partitions used by every section below.
train, test = train_test['train'], train_test['test']

# Show the dataset object as the cell output.
dataset

# 1. Correlation Remover

### Traditional Implementation

In [3]:
from holisticai.bias.mitigation import CorrelationRemover

# Standardise the features: the scaler is fitted on the training split and
# then reused as-is on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_test = scaler.transform(test['X'])

# Fit the mitigator on the scaled training features and train the classifier
# on the mitigator's output.
preprocessing_mitigator = CorrelationRemover()
X_p_train = preprocessing_mitigator.fit_transform(
    X_train, group_a=train['group_a'], group_b=train['group_b']
)
model = LogisticRegression()
model.fit(X_p_train, train['y'])

# Apply the already-fitted mitigator to the scaled test features, then predict.
X_p_test = preprocessing_mitigator.transform(
    X_test, group_a=test['group_a'], group_b=test['group_b']
)
y_pred = model.predict(X_p_test)

# Bias metrics for the test predictions (metric_type='both'); `metrics` is
# reused by the comparison cell further down.
metrics = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics

2024-07-31 09:36:53.398348: W external/xla/xla/service/gpu/nvptx_compiler.cc:765] The NVIDIA driver's CUDA version is 12.3 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.28643,0
Mean Multiclass Statistical Parity,0.28643,0
Max Multiclass Equality of Opportunity,0.244263,0
Max Multiclass Average Odds,0.129637,0
Max Multiclass True Positive Difference,0.224615,0
Mean Multiclass Equality of Opportunity,0.244263,0
Mean Multiclass Average Odds,0.129637,0
Mean Multiclass True Positive Difference,0.224615,0


### Pipeline Implementation

In [4]:
mitigator = CorrelationRemover()

# Chain scaling, bias mitigation and the classifier into a single estimator.
# The scaling step is named "scaler" (previously misspelled "scalar") so it
# can be looked up under the expected name.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("model", LogisticRegression()),
    ]
)

# The group vectors are supplied through the `bm__`-prefixed keyword
# arguments; presumably the pipeline routes them to the "bm_preprocessing"
# step -- confirm against the holisticai Pipeline documentation.
pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Predict on the raw test features; scaling and mitigation happen inside the pipeline.
y_pred = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

# Bias metrics for the pipeline predictions; `metrics_pipeline` is reused by
# the comparison cell below.
metrics_pipeline = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.28643,0
Mean Multiclass Statistical Parity,0.28643,0
Max Multiclass Equality of Opportunity,0.244263,0
Max Multiclass Average Odds,0.129637,0
Max Multiclass True Positive Difference,0.224615,0
Mean Multiclass Equality of Opportunity,0.244263,0
Mean Multiclass Average Odds,0.129637,0
Mean Multiclass True Positive Difference,0.224615,0


### Comparison

In [5]:
# Side-by-side view of both runs: the traditional run contributes only its
# 'Value' column, while the pipeline metrics table is included in full.
comparison_parts = [metrics['Value'], metrics_pipeline]
comparison_labels = ['Traditional', 'Pipeline']
pd.concat(comparison_parts, axis=1, keys=comparison_labels)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Max Multiclass Statistical Parity,0.28643,0.28643,0
Mean Multiclass Statistical Parity,0.28643,0.28643,0
Max Multiclass Equality of Opportunity,0.244263,0.244263,0
Max Multiclass Average Odds,0.129637,0.129637,0
Max Multiclass True Positive Difference,0.224615,0.224615,0
Mean Multiclass Equality of Opportunity,0.244263,0.244263,0
Mean Multiclass Average Odds,0.129637,0.129637,0
Mean Multiclass True Positive Difference,0.224615,0.224615,0


# 2. Disparate Impact Remover

### Traditional Implementation

In [6]:
from holisticai.bias.mitigation import DisparateImpactRemover

# Standardise the features: the scaler is fitted on the training split and
# then reused as-is on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_test = scaler.transform(test['X'])

# Fit the mitigator on the scaled training features and train the classifier
# on the transformed features.
preprocessing_mitigator = DisparateImpactRemover()
X_p_train = preprocessing_mitigator.fit_transform(
    X_train, group_a=train['group_a'], group_b=train['group_b']
)
model = LogisticRegression()
model.fit(X_p_train, train['y'])

# Apply the already-fitted mitigator to the scaled test features, then predict.
X_p_test = preprocessing_mitigator.transform(
    X_test, group_a=test['group_a'], group_b=test['group_b']
)
y_pred = model.predict(X_p_test)

# Bias metrics for the test predictions (metric_type='both'); `metrics` is
# reused by the comparison cell further down.
metrics = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.475099,0
Mean Multiclass Statistical Parity,0.475099,0
Max Multiclass Equality of Opportunity,0.236351,0
Max Multiclass Average Odds,0.212576,0
Max Multiclass True Positive Difference,0.218725,0
Mean Multiclass Equality of Opportunity,0.236351,0
Mean Multiclass Average Odds,0.212576,0
Mean Multiclass True Positive Difference,0.218725,0


### Pipeline Implementation

In [7]:
mitigator = DisparateImpactRemover()

# Chain scaling, bias mitigation and the classifier into a single estimator.
# The scaling step is named "scaler" (previously misspelled "scalar") so it
# can be looked up under the expected name.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("model", LogisticRegression()),
    ]
)

# The group vectors are supplied through the `bm__`-prefixed keyword
# arguments; presumably the pipeline routes them to the "bm_preprocessing"
# step -- confirm against the holisticai Pipeline documentation.
pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Predict on the raw test features; scaling and mitigation happen inside the pipeline.
y_pred = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

# Bias metrics for the pipeline predictions; `metrics_pipeline` is reused by
# the comparison cell below.
metrics_pipeline = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.475099,0
Mean Multiclass Statistical Parity,0.475099,0
Max Multiclass Equality of Opportunity,0.236351,0
Max Multiclass Average Odds,0.212576,0
Max Multiclass True Positive Difference,0.218725,0
Mean Multiclass Equality of Opportunity,0.236351,0
Mean Multiclass Average Odds,0.212576,0
Mean Multiclass True Positive Difference,0.218725,0


### Comparison

In [8]:
# Side-by-side view of both runs: the traditional run contributes only its
# 'Value' column, while the pipeline metrics table is included in full.
comparison_parts = [metrics['Value'], metrics_pipeline]
comparison_labels = ['Traditional', 'Pipeline']
pd.concat(comparison_parts, axis=1, keys=comparison_labels)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Max Multiclass Statistical Parity,0.475099,0.475099,0
Mean Multiclass Statistical Parity,0.475099,0.475099,0
Max Multiclass Equality of Opportunity,0.236351,0.236351,0
Max Multiclass Average Odds,0.212576,0.212576,0
Max Multiclass True Positive Difference,0.218725,0.218725,0
Mean Multiclass Equality of Opportunity,0.236351,0.236351,0
Mean Multiclass Average Odds,0.212576,0.212576,0
Mean Multiclass True Positive Difference,0.218725,0.218725,0


# 3. Reweighing

### Traditional Implementation

In [9]:
from holisticai.bias.mitigation import Reweighing

# Standardise the features: the scaler is fitted on the training split and
# then reused as-is on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_test = scaler.transform(test['X'])

# The mitigator is fitted on the training labels and the group_a/group_b
# inputs only; the weights it stores under estimator_params["sample_weight"]
# are then handed to the classifier, which trains on the scaled features.
mitigator = Reweighing()
mitigator.fit(train['y'], group_a=train['group_a'], group_b=train['group_b'])
sample_weights = mitigator.estimator_params["sample_weight"]
model = LogisticRegression()
model.fit(X_train, train['y'], sample_weight=sample_weights)

# Run the scaled test features through the mitigator's transform, then predict.
X_p_test = mitigator.transform(X_test, group_a=test['group_a'], group_b=test['group_b'])
y_pred = model.predict(X_p_test)

# Bias metrics for the test predictions (metric_type='both'); `metrics` is
# reused by the comparison cell further down.
metrics = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.584321,0
Mean Multiclass Statistical Parity,0.584321,0
Max Multiclass Equality of Opportunity,0.318624,0
Max Multiclass Average Odds,0.267455,0
Max Multiclass True Positive Difference,0.241943,0
Mean Multiclass Equality of Opportunity,0.318624,0
Mean Multiclass Average Odds,0.267455,0
Mean Multiclass True Positive Difference,0.241943,0


### Pipeline Implementation

In [10]:
mitigator = Reweighing()

# Chain scaling, bias mitigation and the classifier into a single estimator.
# The scaling step is named "scaler" (previously misspelled "scalar") so it
# can be looked up under the expected name.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("model", LogisticRegression()),
    ]
)

# The group vectors are supplied through the `bm__`-prefixed keyword
# arguments; presumably the pipeline routes them to the "bm_preprocessing"
# step -- confirm against the holisticai Pipeline documentation.
pipeline.fit(train['X'], train['y'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Predict on the raw test features; scaling and mitigation happen inside the pipeline.
y_pred = pipeline.predict(test['X'], bm__group_a=test['group_a'], bm__group_b=test['group_b'])

# Bias metrics for the pipeline predictions; `metrics_pipeline` is reused by
# the comparison cell below.
metrics_pipeline = multiclass_bias_metrics(test['p_attr'], y_pred, test['y'], metric_type='both')
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.584321,0
Mean Multiclass Statistical Parity,0.584321,0
Max Multiclass Equality of Opportunity,0.318624,0
Max Multiclass Average Odds,0.267455,0
Max Multiclass True Positive Difference,0.241943,0
Mean Multiclass Equality of Opportunity,0.318624,0
Mean Multiclass Average Odds,0.267455,0
Mean Multiclass True Positive Difference,0.241943,0


### Comparison

In [11]:
# Side-by-side view of both runs: the traditional run contributes only its
# 'Value' column, while the pipeline metrics table is included in full.
comparison_parts = [metrics['Value'], metrics_pipeline]
comparison_labels = ['Traditional', 'Pipeline']
pd.concat(comparison_parts, axis=1, keys=comparison_labels)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Max Multiclass Statistical Parity,0.584321,0.584321,0
Mean Multiclass Statistical Parity,0.584321,0.584321,0
Max Multiclass Equality of Opportunity,0.318624,0.318624,0
Max Multiclass Average Odds,0.267455,0.267455,0
Max Multiclass True Positive Difference,0.241943,0.241943,0
Mean Multiclass Equality of Opportunity,0.318624,0.318624,0
Mean Multiclass Average Odds,0.267455,0.267455,0
Mean Multiclass True Positive Difference,0.241943,0.241943,0
