# **Mitigating Bias in Multiclass Classification**

In [1]:
# Put the directory two levels up at the front of the import search path
# (presumably the repository root, so the in-repo `holisticai` and `tests`
# packages imported below resolve -- confirm against the repo layout).
import sys

REPO_ROOT = '../../'

# Mutate sys.path in place and only when the entry is not already first, so
# re-running this cell does not keep stacking duplicate entries.
if sys.path[:1] != [REPO_ROOT]:
    sys.path.insert(0, REPO_ROOT)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from holisticai.bias.metrics import multiclass_bias_metrics
from holisticai.pipeline import Pipeline
from holisticai.utils.transformers.bias import SensitiveGroups
from tests.testing_utils._tests_data_utils import load_preprocessed_us_crime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Data Preprocessing

In [3]:
# Load the train/test splits returned by the US-crime test helper, requested
# with nb_classes=5. Each split is unpacked by the cells below as
# (X, y, group_a, group_b).
train_data, test_data = load_preprocessed_us_crime(nb_classes=5)

# Shared encoder that the later cells use to turn the stacked
# group_a/group_b columns into the `p_attr` argument of the bias metrics.
# No group vectors are unpacked here: every later cell unpacks the split it
# needs itself, and binding `_` would shadow IPython's last-output variable.
sensgroup = SensitiveGroups()

## Baseline

In [4]:
# Baseline: feature scaling followed by a plain logistic regression, with no
# bias-mitigation step in the pipeline.
# NOTE(review): the recorded run of this cell emits sklearn's
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning for this fit. Raising
# `max_iter` would address it but also change the recorded metrics, so the
# estimator is deliberately left at its defaults here.
baseline_steps = [
    ("scalar", StandardScaler()),
    ("model", LogisticRegression()),
]
pipeline = Pipeline(steps=baseline_steps)

# Fit on the training split only.
X_train, y_train, group_a_train, group_b_train = train_data
pipeline.fit(X_train, y_train)

# Predict on the held-out split; X / y / group_a / group_b refer to test data
# from here on.
X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)
y_baseline = y_pred.copy()

# Stack the two group vectors column-wise and encode them with the shared
# SensitiveGroups instance. This is the only cell that *fits* the encoder.
group_matrix = np.stack([group_a, group_b], axis=1)
p_attr = sensgroup.fit_transform(group_matrix, convert_numeric=True)

# Positional arguments: protected attribute, predictions, ground truth.
df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")
df_baseline = df.copy()
df_baseline

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.612385,0
Mean Multiclass Statistical Parity,0.612385,0
Max Multiclass Equality of Opportunity,0.385094,0
Max Multiclass Average Odds,0.32664,0
Max Multiclass True Positive Difference,0.235136,0
Mean Multiclass Equality of Opportunity,0.385094,0
Mean Multiclass Average Odds,0.32664,0
Mean Multiclass True Positive Difference,0.235136,0


## ML Debiaser

In [5]:
from holisticai.bias.mitigation import MLDebiaser

# Seed NumPy's global RNG before building and fitting the pipeline.
np.random.seed(10)

# Post-processing mitigation: MLDebiaser is appended after the classifier.
# NOTE(review): the recorded run still shows sklearn's iteration-limit warning
# for the logistic regression; the estimator is kept at its defaults so the
# recorded metrics stay reproducible.
mldebiaser_steps = [
    ("scalar", StandardScaler()),
    ("model", LogisticRegression()),
]
mldebiaser_steps.append(
    (
        "bm_postprocessing",
        MLDebiaser(sgd_steps=10_000, full_gradient_epochs=500, max_iter=5),
    )
)
pipeline = Pipeline(steps=mldebiaser_steps)

# Fit on the training split. The group vectors are passed under `bm__`-prefixed
# keys (presumably routed by holisticai's Pipeline to the `bm_*` step --
# confirm against the library documentation).
X_train, y_train, group_a_train, group_b_train = train_data
fit_params = {"bm__group_a": group_a_train, "bm__group_b": group_b_train}
pipeline.fit(X_train, y_train, **fit_params)

# Predict on the held-out split, again supplying the group vectors.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
y_mldebiaser = y_pred.copy()

# Transform only: the encoder is fitted in the baseline cell.
group_matrix = np.stack([group_a, group_b], axis=1)
p_attr = sensgroup.transform(group_matrix, convert_numeric=True)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")
df_mldebiaser = df.copy()
df_mldebiaser

  from .autonotebook import tqdm as notebook_tqdm
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[elapsed time: 00:00:14 | iter:5/5 | primal_residual::10.8485 | dual_residual::0.0471]]


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.333202,0
Mean Multiclass Statistical Parity,0.333202,0
Max Multiclass Equality of Opportunity,0.19194,0
Max Multiclass Average Odds,0.061833,0
Max Multiclass True Positive Difference,0.109597,0
Mean Multiclass Equality of Opportunity,0.19194,0
Mean Multiclass Average Odds,0.061833,0
Mean Multiclass True Positive Difference,0.109597,0


## Reweighing

In [6]:
from holisticai.bias.mitigation import Reweighing

# Seed NumPy's global RNG before building and fitting the pipeline.
np.random.seed(10)

# Pre-processing mitigation: Reweighing sits between the scaler and the
# classifier.
# NOTE(review): the recorded run still shows sklearn's iteration-limit warning
# for the logistic regression; the estimator is kept at its defaults so the
# recorded metrics stay reproducible.
reweighing_steps = [
    ("scalar", StandardScaler()),
    ("bm_preprocessing", Reweighing()),
    ("model", LogisticRegression()),
]
pipeline = Pipeline(steps=reweighing_steps)

# Fit on the training split, passing the group vectors under `bm__`-prefixed
# keys (presumably routed to the `bm_*` step -- confirm against holisticai).
X_train, y_train, group_a_train, group_b_train = train_data
fit_params = {"bm__group_a": group_a_train, "bm__group_b": group_b_train}
pipeline.fit(X_train, y_train, **fit_params)

# Predict on the held-out split with the same keyword convention.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
y_rw = y_pred.copy()

# Transform only: the encoder is fitted in the baseline cell.
group_matrix = np.stack([group_a, group_b], axis=1)
p_attr = sensgroup.transform(group_matrix, convert_numeric=True)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")
df_rw = df.copy()
df_rw

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.588142,0
Mean Multiclass Statistical Parity,0.588142,0
Max Multiclass Equality of Opportunity,0.339266,0
Max Multiclass Average Odds,0.302325,0
Max Multiclass True Positive Difference,0.240514,0
Mean Multiclass Equality of Opportunity,0.339266,0
Mean Multiclass Average Odds,0.302325,0
Mean Multiclass True Positive Difference,0.240514,0


In [7]:
# --- Correlation Remover ---
from holisticai.bias.mitigation import CorrelationRemover

# Pre-processing mitigation: CorrelationRemover sits between the scaler and
# the classifier.
# NOTE(review): the recorded run still shows sklearn's iteration-limit warning
# for the logistic regression; the estimator is kept at its defaults so the
# recorded metrics stay reproducible.
correlation_remover_steps = [
    ("scalar", StandardScaler()),
    ("bm_preprocessing", CorrelationRemover()),
    ("model", LogisticRegression()),
]
pipeline = Pipeline(steps=correlation_remover_steps)

# Fit on the training split, passing the group vectors under `bm__`-prefixed
# keys (presumably routed to the `bm_*` step -- confirm against holisticai).
X_train, y_train, group_a_train, group_b_train = train_data
fit_params = {"bm__group_a": group_a_train, "bm__group_b": group_b_train}
pipeline.fit(X_train, y_train, **fit_params)

# Predict on the held-out split with the same keyword convention.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)
y_cr = y_pred.copy()

# Transform only: the encoder is fitted in the baseline cell.
group_matrix = np.stack([group_a, group_b], axis=1)
p_attr = sensgroup.transform(group_matrix, convert_numeric=True)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")
df_cr = df.copy()
df_cr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Value,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
Max Multiclass Statistical Parity,0.474045,0
Mean Multiclass Statistical Parity,0.474045,0
Max Multiclass Equality of Opportunity,0.287799,0
Max Multiclass Average Odds,0.156795,0
Max Multiclass True Positive Difference,0.180078,0
Mean Multiclass Equality of Opportunity,0.287799,0
Mean Multiclass Average Odds,0.156795,0
Mean Multiclass True Positive Difference,0.180078,0


In [8]:
# Side-by-side comparison: one 'Value' column per strategy plus a single
# 'Reference' column. Columns are selected by label instead of by position, so
# the table stays correct if the metric frames ever gain or reorder columns.
metric_frames = {
    'Baseline': df_baseline,
    'Reweighing': df_rw,
    'Correlation Remover': df_cr,
    'ML Debiaser': df_mldebiaser,
}
result = pd.concat(
    {label: frame['Value'] for label, frame in metric_frames.items()},
    axis=1,
)
# The reference column is taken from the ML Debiaser frame, matching the
# column the positional selection used to pick.
result['Reference'] = df_mldebiaser['Reference']
result

Unnamed: 0_level_0,Baseline,Reweighing,Correlation Remover,ML Debiaser,Reference
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Max Multiclass Statistical Parity,0.612385,0.588142,0.474045,0.333202,0
Mean Multiclass Statistical Parity,0.612385,0.588142,0.474045,0.333202,0
Max Multiclass Equality of Opportunity,0.385094,0.339266,0.287799,0.19194,0
Max Multiclass Average Odds,0.32664,0.302325,0.156795,0.061833,0
Max Multiclass True Positive Difference,0.235136,0.240514,0.180078,0.109597,0
Mean Multiclass Equality of Opportunity,0.385094,0.339266,0.287799,0.19194,0
Mean Multiclass Average Odds,0.32664,0.302325,0.156795,0.061833,0
Mean Multiclass True Positive Difference,0.235136,0.240514,0.180078,0.109597,0
