## Packages Installation

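First, install the `holisticai` package if you haven't already. Run the command below in a terminal (inside a notebook cell, use `%pip install "holisticai[all]"` instead):
```bash
pip install "holisticai[all]"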
```
Then, import the necessary libraries.

In [1]:
import warnings

import pandas as pd
from holisticai.bias.metrics import clustering_bias_metrics
from holisticai.datasets import load_dataset
from holisticai.pipeline import Pipeline
from sklearn.cluster import KMeans

warnings.filterwarnings("ignore")

## Data Loading

In [2]:
# Load the 'clinical_records' dataset and split it 80/20 with a fixed seed.
dataset = load_dataset('clinical_records')
train_test = dataset.train_test_split(test_size=0.2, random_state=42)

# Pull the two partitions out of the split result.
train, test = train_test['train'], train_test['test']

# Show the full dataset object as the cell output.
dataset

# 1. Variational Fair Clustering

### Traditional Implementation

In [3]:
from holisticai.bias.mitigation import VariationalFairClustering

# Build the clusterer, then fit it on the training features together with
# the group_a / group_b membership vectors.
mitigator = VariationalFairClustering(
    n_clusters=3,
    lmbda=7,
    method='kmeans',
    verbose=True,
    seed=42,
)
mitigator.fit(train['X'], group_a=train['group_a'], group_b=train['group_b'])

# Predicted labels for the training rows and the fitted cluster centers.
y_pred = mitigator.predict(train['X'], train['group_a'], train['group_b'])
centroids = mitigator.cluster_centers_

# Bias metrics for this run; kept in `metrics` for the comparison cell below.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)


[elapsed time: 00:00:00 | iter:8/100 | fairness_error:0.0318 | fair_cluster_energy:204.2278 | cluster_energy:190.2894]

### Pipeline Implementation

In [4]:
# Same estimator configuration as the traditional cell, this time wrapped
# in a holisticai Pipeline as its only step.
inprocessing_model = VariationalFairClustering(
    n_clusters=3,
    lmbda=7,
    method='kmeans',
    verbose=True,
    seed=42,
)
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])

# Group vectors are passed through the pipeline with the `bm__` prefix.
pipeline.fit(
    train['X'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predicted labels via the pipeline; centers are read from the named step.
y_pred = pipeline.predict(
    train['X'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)
centroids = pipeline['bm_inprocessing'].cluster_centers_

# Bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline


[elapsed time: 00:00:00 | iter:8/100 | fairness_error:0.0318 | fair_cluster_energy:204.2278 | cluster_energy:190.2894]

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.738088,1
Minimum Cluster Ratio,0.355932,1
Cluster Distribution Total Variation,0.136058,0
Cluster Distribution KL Div,0.043728,0
Social Fairness Ratio,1.118376,1
Silhouette Difference,0.006792,0


### Comparison

In [5]:
# Side-by-side table: the 'Value' column of the traditional run next to the
# full pipeline metrics frame, labelled by the top-level column keys.
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cluster Balance,0.738088,0.738088,1
Minimum Cluster Ratio,0.355932,0.355932,1
Cluster Distribution Total Variation,0.136058,0.136058,0
Cluster Distribution KL Div,0.043728,0.043728,0
Social Fairness Ratio,1.118376,1.118376,1
Silhouette Difference,0.006792,0.006792,0


# 2. Fair K-Center

### Traditional Implementation

In [6]:
from holisticai.bias.mitigation import FairKCenterClustering

# Build the k-center clusterer, then fit it on the training features
# together with the group_a / group_b membership vectors.
mitigator = FairKCenterClustering(
    req_nr_per_group=(1, 1),
    nr_initially_given=0,
    seed=42,
)
mitigator.fit(train['X'], group_a=train['group_a'], group_b=train['group_b'])

# Predicted labels for the training rows; this estimator exposes its
# centers through `all_centroids`.
y_pred = mitigator.predict(train['X'], train['group_a'], train['group_b'])
centroids = mitigator.all_centroids

# Bias metrics for this run; kept in `metrics` for the comparison cell below.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)

### Pipeline Implementation

In [7]:
# Same estimator configuration as the traditional cell, wrapped in a
# holisticai Pipeline as its only step.
inprocessing_model = FairKCenterClustering(
    req_nr_per_group=(1, 1),
    nr_initially_given=0,
    seed=42,
)
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])

# Group vectors are passed through the pipeline with the `bm__` prefix.
pipeline.fit(
    train['X'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)

# Predicted labels via the pipeline; centers are read from the named step.
y_pred = pipeline.predict(
    train['X'],
    bm__group_a=train['group_a'],
    bm__group_b=train['group_b'],
)
centroids = pipeline['bm_inprocessing'].all_centroids

# Bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.819967,1
Minimum Cluster Ratio,0.457944,1
Cluster Distribution Total Variation,0.118335,0
Cluster Distribution KL Div,0.031148,0
Social Fairness Ratio,1.008144,1
Silhouette Difference,-0.010421,0


### Comparison

In [8]:
# Traditional 'Value' column beside the pipeline metrics frame, with the
# two sources labelled by the top-level column keys.
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cluster Balance,0.819967,0.819967,1
Minimum Cluster Ratio,0.457944,0.457944,1
Cluster Distribution Total Variation,0.118335,0.118335,0
Cluster Distribution KL Div,0.031148,0.031148,0
Social Fairness Ratio,1.008144,1.008144,1
Silhouette Difference,-0.010421,-0.010421,0


# 3. Fair K-Median

### Traditional Implementation

In [9]:
from holisticai.bias.mitigation import FairKMedianClustering

# Build the k-median clusterer, then fit it on the training features
# together with the group_a / group_b membership vectors.
mitigator = FairKMedianClustering(
    n_clusters=2,
    strategy='GA',
    seed=42,
)
mitigator.fit(train['X'], group_a=train['group_a'], group_b=train['group_b'])

# No predict call here: labels and centers are read straight from the
# fitted estimator's attributes.
y_pred = mitigator.labels_
centroids = mitigator.cluster_centers_

# Bias metrics for this run; kept in `metrics` for the comparison cell below.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)

### Pipeline Implementation

In [10]:
# Same estimator configuration as the traditional cell. It is bound to
# `inprocessing_model` (the name used by the other pipeline cells) so the
# fitted traditional `mitigator` from the previous cell is not overwritten.
inprocessing_model = FairKMedianClustering(n_clusters=2, strategy='GA', seed=42)

# Wrap the estimator in a holisticai Pipeline as its only step; group
# vectors are passed through the pipeline with the `bm__` prefix.
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])
pipeline.fit(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# No predict call here: labels and centers are read from the fitted step.
y_pred = pipeline['bm_inprocessing'].labels_
centroids = pipeline['bm_inprocessing'].cluster_centers_

# Bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.823072,1
Minimum Cluster Ratio,0.462963,1
Cluster Distribution Total Variation,0.113063,0
Cluster Distribution KL Div,0.028764,0
Social Fairness Ratio,1.134138,1
Silhouette Difference,-0.009503,0


### Comparison

In [11]:
# Column-wise join of the traditional 'Value' column and the pipeline
# metrics frame, labelled by the top-level column keys.
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cluster Balance,0.823072,0.823072,1
Minimum Cluster Ratio,0.462963,0.462963,1
Cluster Distribution Total Variation,0.113063,0.113063,0
Cluster Distribution KL Div,0.028764,0.028764,0
Social Fairness Ratio,1.134138,1.134138,1
Silhouette Difference,-0.009503,-0.009503,0


# 4. Fairlet

### Traditional Implementation

In [12]:
from holisticai.bias.mitigation import FairletClustering

# Build the fairlet clusterer, then fit it on the training features
# together with the group_a / group_b membership vectors.
mitigator = FairletClustering(
    decomposition='Scalable',
    clustering_model='KMedoids',
    p=10,
    q=20,
    n_clusters=2,
    seed=42,
)
mitigator.fit(train['X'], group_a=train['group_a'], group_b=train['group_b'])

# Predict is called with the features only in this cell.
y_pred = mitigator.predict(train['X'])
centroids = mitigator.cluster_centers_

# Bias metrics for this run; kept in `metrics` for the comparison cell below.
metrics = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)

### Pipeline Implementation

In [13]:
# Same estimator configuration as the traditional cell. It is bound to
# `inprocessing_model` (the name used by the other pipeline cells) so the
# fitted traditional `mitigator` from the previous cell is not overwritten.
inprocessing_model = FairletClustering(
    decomposition='Scalable',
    clustering_model='KMedoids',
    p=10,
    q=20,
    n_clusters=2,
    seed=42,
)

# Wrap the estimator in a holisticai Pipeline as its only step; group
# vectors are passed through the pipeline with the `bm__` prefix.
pipeline = Pipeline(steps=[('bm_inprocessing', inprocessing_model)])
pipeline.fit(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])

# Predicted labels via the pipeline; centers are read from the named step.
y_pred = pipeline.predict(train['X'], bm__group_a=train['group_a'], bm__group_b=train['group_b'])
centroids = pipeline['bm_inprocessing'].cluster_centers_

# Bias metrics for the pipeline run, displayed as the cell output.
metrics_pipeline = clustering_bias_metrics(
    train['group_a'],
    train['group_b'],
    y_pred,
    data=train['X'],
    centroids=centroids,
    metric_type='equal_outcome',
)
metrics_pipeline

Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster Balance,0.937255,1
Minimum Cluster Ratio,0.5,1
Cluster Distribution Total Variation,0.024446,0
Cluster Distribution KL Div,0.001588,0
Social Fairness Ratio,1.23574,1
Silhouette Difference,0.006768,0


### Comparison

In [14]:
# Final comparison table: traditional 'Value' column alongside the pipeline
# metrics frame, labelled by the top-level column keys.
pd.concat(
    [metrics['Value'], metrics_pipeline],
    axis=1,
    keys=['Traditional', 'Pipeline'],
)

Unnamed: 0_level_0,Traditional,Pipeline,Pipeline
Unnamed: 0_level_1,Value,Value,Reference
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cluster Balance,0.937255,0.937255,1
Minimum Cluster Ratio,0.5,0.5,1
Cluster Distribution Total Variation,0.024446,0.024446,0
Cluster Distribution KL Div,0.001588,0.001588,0
Social Fairness Ratio,1.23574,1.23574,1
Silhouette Difference,0.006768,0.006768,0
