## Packages Installation

First, install the `holisticai` package if you haven't already:
```bash
pip install "holisticai[all]"
```
Then, import the necessary libraries.

In [2]:
import warnings

import pandas as pd
from holisticai.bias.metrics import clustering_bias_metrics
from holisticai.bias.mitigation import FairletClusteringPreprocessing
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.cluster import KMeans

# Suppress every Python warning for the rest of the session so cell outputs
# stay uncluttered. NOTE(review): this also hides deprecation and convergence
# warnings — consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

## Data Loading

In [5]:
# Load the 'clinical_records' dataset, using "sex" as the protected attribute.
dataset = load_dataset("clinical_records", protected_attribute="sex")

# Split with test_size=0.2; the fixed random_state keeps the split reproducible.
train_test = dataset.train_test_split(test_size=0.2, random_state=42)
train, test = train_test["train"], train_test["test"]

# Show the full dataset object as the cell output.
dataset

In [10]:
# Baseline: cluster the raw training features without any bias mitigation,
# giving a reference point for the mitigated runs later in the notebook.
model = KMeans(n_clusters=3, random_state=42)
model.fit(train['X'])

# Cluster assignments for the training data and the fitted centroids.
y_pred = model.predict(train['X'])
centroids = model.cluster_centers_

# Clustering bias metrics between the two protected groups.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.711297,1
Minimum Cluster Ratio,0.5,1
Cluster Distribution Total Variation,0.057983,0
Cluster Distribution KL Div,0.021202,0
Social Fairness Ratio,1.217368,1
Silhouette Difference,-0.007195,0


# 1. Fairlet 

#### Traditional Implementation

In [11]:
# Fresh KMeans estimator; it is fitted on the preprocessed features below.
model = KMeans(n_clusters=3, random_state=42)

# Fit the fairlet preprocessing mitigator on the training features and the
# two group-membership vectors, then cluster its transformed output.
mitigator = FairletClusteringPreprocessing(seed=42)
Xpre = mitigator.fit_transform(train['X'], train['group_a'], train['group_b'])
model.fit(Xpre)

# Cluster assignments for the preprocessed data and the fitted centroids.
y_pred = model.predict(Xpre)
centroids = model.cluster_centers_

# Clustering bias metrics, evaluated against the original training features.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.946014,1
Minimum Cluster Ratio,0.507042,1
Cluster Distribution Total Variation,0.03751,0
Cluster Distribution KL Div,0.002869,0
Social Fairness Ratio,1.337623,1
Silhouette Difference,-0.064977,0


#### Pipeline Implementation

In [7]:
mitigator = FairletClusteringPreprocessing(seed=42)

# Keep a handle on the pipeline's own estimator so its centroids can be read
# after fitting, instead of relying on a `model` variable left over from
# another cell (hidden kernel state).
pipeline_model = KMeans(n_clusters=3, random_state=42)

# Chain the bias-mitigation preprocessing step with the clustering model.
pipeline = Pipeline(steps=[('bm_preprocessing', mitigator), ('model', pipeline_model)])

# The `bm__` prefix passes the group vectors through to the mitigation step.
pipeline.fit(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Predict the clusters and read the centroids from the estimator that the
# pipeline actually fitted.
# NOTE(review): assumes the pipeline fits its steps in place (as scikit-learn
# pipelines do) rather than cloning them — confirm for holisticai's Pipeline.
y_pred = pipeline.predict(train['X'])
centroids = pipeline_model.cluster_centers_

# Compute the bias metrics.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.946014,1
Minimum Cluster Ratio,0.507042,1
Cluster Distribution Total Variation,0.03751,0
Cluster Distribution KL Div,0.002869,0
Social Fairness Ratio,1.337623,1
Silhouette Difference,-0.064977,0


### Comparison

In [12]:
# Side-by-side comparison: one 'Value' column per implementation plus a single
# shared 'Reference' column, so both implementations are presented the same way.
pd.concat(
    [metrics['Value'], metrics_pipeline['Value'], metrics_pipeline['Reference']],
    axis=1,
    keys=['Traditional', 'Pipeline', 'Reference'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cluster Balance,0.946014,0.946014,1
Minimum Cluster Ratio,0.507042,0.507042,1
Cluster Distribution Total Variation,0.03751,0.03751,0
Cluster Distribution KL Div,0.002869,0.002869,0
Social Fairness Ratio,1.337623,1.337623,1
Silhouette Difference,-0.064977,-0.064977,0
