## Packages Installation

First, install the `holisticai` package if you haven't already:
```bash
pip install "holisticai[all]"
```
Then, import the necessary libraries.

In [1]:
import pandas as pd
from holisticai.bias.metrics import classification_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

## Dataset loading

In [2]:
# Load the 'adult' dataset with 'sex' as the protected attribute
dataset = load_dataset('adult', protected_attribute='sex')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = train_test['train'], train_test['test']

# Display the full dataset object as the cell output
dataset

# 1. Correlation Remover

### Traditional Implementation

In [3]:
# Define the preprocessing bias mitigator and the classifier
from holisticai.bias.mitigation import CorrelationRemover

mitigator = CorrelationRemover()
model = LogisticRegression()

# Standardize the training features, run them through the mitigator,
# and fit the classifier on the transformed features
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_train_pre = mitigator.fit_transform(
    X_train,
    group_a=train['group_a'],
    group_b=train['group_b'],
)
model.fit(X_train_pre, train['y'])

# Reuse the fitted scaler and mitigator on the test split, then predict
X_test = scaler.transform(test['X'])
X_test_pre = mitigator.transform(
    X_test,
    group_a=test['group_a'],
    group_b=test['group_b'],
)
y_pred = model.predict(X_test_pre)

# Evaluate bias metrics on the test predictions (metric_type='both')
metrics = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred,
    test['y'],
    metric_type='both',
)
metrics

2024-07-09 11:58:23.059034: W external/xla/xla/service/gpu/nvptx_compiler.cc:765] The NVIDIA driver's CUDA version is 12.3 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.092649,0
Disparate Impact,1.75646,1
Four Fifths Rule,0.569327,1
Cohen D,0.240006,0
2SD Rule,10.608753,0
Equality of Opportunity Difference,-0.098587,0
False Positive Rate Difference,0.013932,0
Average Odds Difference,-0.042328,0
Accuracy Difference,-0.097085,0


### Pipeline Implementation

In [4]:
# Define the preprocessing bias mitigator and the classifier
mitigator = CorrelationRemover()
model = LogisticRegression()

# Chain scaling, bias mitigation and the estimator into one pipeline;
# group membership is supplied through the bm__-prefixed keyword arguments
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split, again passing the test group membership
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Evaluate bias metrics for the pipeline predictions (metric_type='both')
metrics_pipeline = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.092649,0
Disparate Impact,1.75646,1
Four Fifths Rule,0.569327,1
Cohen D,0.240006,0
2SD Rule,10.608753,0
Equality of Opportunity Difference,-0.098587,0
False Positive Rate Difference,0.013932,0
Average Odds Difference,-0.042328,0
Accuracy Difference,-0.097085,0


### Comparison

In [5]:
# Side-by-side comparison: only the 'Value' column of the traditional run is kept,
# while the pipeline frame is passed whole, so the 'Reference' column appears
# once, under the 'Pipeline' key
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Statistical Parity,0.092649,0.092649,0
Disparate Impact,1.75646,1.75646,1
Four Fifths Rule,0.569327,0.569327,1
Cohen D,0.240006,0.240006,0
2SD Rule,10.608753,10.608753,0
Equality of Opportunity Difference,-0.098587,-0.098587,0
False Positive Rate Difference,0.013932,0.013932,0
Average Odds Difference,-0.042328,-0.042328,0
Accuracy Difference,-0.097085,-0.097085,0


# 2. Disparate Impact Remover

### Traditional Implementation

In [6]:
# Define the preprocessing bias mitigator and the classifier
from holisticai.bias.mitigation import DisparateImpactRemover

mitigator = DisparateImpactRemover()
model = LogisticRegression()

# Standardize the training features
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])

# Run the scaled features through the mitigator and fit the classifier
X_train_pre = mitigator.fit_transform(
    X_train,
    group_a=train['group_a'],
    group_b=train['group_b'],
)
model.fit(X_train_pre, train['y'])

# Reuse the fitted scaler and mitigator on the test split, then predict
X_test = scaler.transform(test['X'])
X_test_pre = mitigator.transform(
    X_test,
    group_a=test['group_a'],
    group_b=test['group_b'],
)
y_pred = model.predict(X_test_pre)

# Evaluate bias metrics on the test predictions (metric_type='both')
metrics = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred,
    test['y'],
    metric_type='both',
)
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.450658,0
Disparate Impact,7.521151,1
Four Fifths Rule,0.132958,1
Cohen D,1.034583,0
2SD Rule,41.423844,0
Equality of Opportunity Difference,0.41483,0
False Positive Rate Difference,0.340255,0
Average Odds Difference,0.377543,0
Accuracy Difference,-0.207011,0


### Pipeline Implementation

In [7]:
# Define the preprocessing bias mitigator and the classifier
mitigator = DisparateImpactRemover()
model = LogisticRegression()

# Chain scaling, bias mitigation and the estimator into one pipeline;
# group membership is supplied through the bm__-prefixed keyword arguments
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split, again passing the test group membership
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Evaluate bias metrics for the pipeline predictions (metric_type='both')
metrics_pipeline = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.450658,0
Disparate Impact,7.521151,1
Four Fifths Rule,0.132958,1
Cohen D,1.034583,0
2SD Rule,41.423844,0
Equality of Opportunity Difference,0.41483,0
False Positive Rate Difference,0.340255,0
Average Odds Difference,0.377543,0
Accuracy Difference,-0.207011,0


### Comparison

In [8]:
# Side-by-side comparison: only the 'Value' column of the traditional run is kept,
# while the pipeline frame is passed whole, so the 'Reference' column appears
# once, under the 'Pipeline' key
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Statistical Parity,0.450658,0.450658,0
Disparate Impact,7.521151,7.521151,1
Four Fifths Rule,0.132958,0.132958,1
Cohen D,1.034583,1.034583,0
2SD Rule,41.423844,41.423844,0
Equality of Opportunity Difference,0.41483,0.41483,0
False Positive Rate Difference,0.340255,0.340255,0
Average Odds Difference,0.377543,0.377543,0
Accuracy Difference,-0.207011,-0.207011,0


# 3. Learning Fair Representations

### Traditional Implementation

In [9]:
# Define the preprocessing bias mitigator and the classifier
from holisticai.bias.mitigation import LearningFairRepresentation

mitigator = LearningFairRepresentation(
    k=10,
    Ax=0.2,
    Ay=2.0,
    Az=4.0,
    verbose=1,
    maxiter=100,
    seed=100,
)
model = LogisticRegression()

# Standardize the training features; unlike the other mitigators in this
# notebook, this fit_transform call also receives the training labels
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])
X_train_pre = mitigator.fit_transform(
    X_train,
    train['y'],
    group_a=train['group_a'],
    group_b=train['group_b'],
)
model.fit(X_train_pre, train['y'])

# Reuse the fitted scaler and mitigator on the test split, then predict
X_test = scaler.transform(test['X'])
X_test_pre = mitigator.transform(
    X_test,
    group_a=test['group_a'],
    group_b=test['group_b'],
)
y_pred = model.predict(X_test_pre)

# Evaluate bias metrics on the test predictions (metric_type='both')
metrics = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred,
    test['y'],
    metric_type='both',
)
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.066076,0
Disparate Impact,2.177676,1
Four Fifths Rule,0.459205,1
Cohen D,0.220608,0
2SD Rule,9.760754,0
Equality of Opportunity Difference,0.07058,0
False Positive Rate Difference,0.020807,0
Average Odds Difference,0.045694,0
Accuracy Difference,-0.139717,0


### Pipeline Implementation

In [10]:
# Define the preprocessing bias mitigator and the classifier
mitigator = LearningFairRepresentation(
    k=10,
    Ax=0.2,
    Ay=2.0,
    Az=4.0,
    verbose=1,
    maxiter=100,
    seed=100,
)
model = LogisticRegression()

# Chain scaling, bias mitigation and the estimator into one pipeline;
# group membership is supplied through the bm__-prefixed keyword arguments
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split, again passing the test group membership
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Evaluate bias metrics for the pipeline predictions (metric_type='both')
metrics_pipeline = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.066076,0
Disparate Impact,2.177676,1
Four Fifths Rule,0.459205,1
Cohen D,0.220608,0
2SD Rule,9.760754,0
Equality of Opportunity Difference,0.07058,0
False Positive Rate Difference,0.020807,0
Average Odds Difference,0.045694,0
Accuracy Difference,-0.139717,0


### Comparison

In [11]:
# Side-by-side comparison: only the 'Value' column of the traditional run is kept,
# while the pipeline frame is passed whole, so the 'Reference' column appears
# once, under the 'Pipeline' key
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Statistical Parity,0.066076,0.066076,0
Disparate Impact,2.177676,2.177676,1
Four Fifths Rule,0.459205,0.459205,1
Cohen D,0.220608,0.220608,0
2SD Rule,9.760754,9.760754,0
Equality of Opportunity Difference,0.07058,0.07058,0
False Positive Rate Difference,0.020807,0.020807,0
Average Odds Difference,0.045694,0.045694,0
Accuracy Difference,-0.139717,-0.139717,0


# 4. Reweighing

### Traditional Implementation

In [12]:
# Define preprocessing model
from holisticai.bias.mitigation import Reweighing

mitigator = Reweighing()

# Standardize the training features
scaler = StandardScaler()
X_train = scaler.fit_transform(train['X'])

# Fit the mitigator on the labels and group membership, then read the
# per-sample weights it stores under estimator_params
mitigator.fit(train['y'], group_a=train['group_a'], group_b=train['group_b'])
sw = mitigator.estimator_params["sample_weight"]

# Create the classifier once and pass the weights by keyword so it is explicit
# that the third argument to fit() is sample_weight
model = LogisticRegression()
model.fit(X_train, train['y'], sample_weight=sw)

# Mitigator transform and model predict
X_test = scaler.transform(test['X'])
X_pre = mitigator.transform(X_test, group_a=test['group_a'], group_b=test['group_b'])
y_pred = model.predict(X_pre)

# Evaluate bias metrics
metrics = classification_bias_metrics(test['group_a'], test['group_b'], y_pred, test['y'], metric_type='both')
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.09611,0
Disparate Impact,1.789128,1
Four Fifths Rule,0.558932,1
Cohen D,0.248225,0
2SD Rule,10.967284,0
Equality of Opportunity Difference,-0.109324,0
False Positive Rate Difference,0.018751,0
Average Odds Difference,-0.045286,0
Accuracy Difference,-0.101354,0


### Pipeline Implementation

In [13]:
# Define the preprocessing bias mitigator and the classifier
mitigator = Reweighing()
model = LogisticRegression()

# Chain scaling, bias mitigation and the estimator into one pipeline;
# group membership is supplied through the bm__-prefixed keyword arguments
pipeline = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ("bm_preprocessing", mitigator),
        ("estimator", model),
    ]
)
pipeline.fit(
    train['X'],
    train['y'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predict on the test split, again passing the test group membership
y_pred_pipeline = pipeline.predict(
    test['X'],
    bm__group_a=test['group_a'],
    bm__group_b=test['group_b'],
)

# Evaluate bias metrics for the pipeline predictions (metric_type='both')
metrics_pipeline = classification_bias_metrics(
    test['group_a'],
    test['group_b'],
    y_pred_pipeline,
    test['y'],
    metric_type='both',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistical Parity,0.09611,0
Disparate Impact,1.789128,1
Four Fifths Rule,0.558932,1
Cohen D,0.248225,0
2SD Rule,10.967284,0
Equality of Opportunity Difference,-0.109324,0
False Positive Rate Difference,0.018751,0
Average Odds Difference,-0.045286,0
Accuracy Difference,-0.101354,0


### Comparison

In [14]:
# Side-by-side comparison: only the 'Value' column of the traditional run is kept,
# while the pipeline frame is passed whole, so the 'Reference' column appears
# once, under the 'Pipeline' key
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Statistical Parity,0.09611,0.09611,0
Disparate Impact,1.789128,1.789128,1
Four Fifths Rule,0.558932,0.558932,1
Cohen D,0.248225,0.248225,0
2SD Rule,10.967284,10.967284,0
Equality of Opportunity Difference,-0.109324,-0.109324,0
False Positive Rate Difference,0.018751,0.018751,0
Average Odds Difference,-0.045286,-0.045286,0
Accuracy Difference,-0.101354,-0.101354,0
