# **Mitigating Bias in Multiclass Classification**


In [None]:
# sys path
import sys

# Prepend '../../' (resolved against the current working directory) so that
# it is searched before every other entry on the import path.
sys.path = ['../../', *sys.path]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from holisticai.bias.metrics import multiclass_bias_metrics
from holisticai.pipeline import Pipeline
from holisticai.utils.transformers.bias import SensitiveGroups
from tests.testing_utils._tests_data_utils import load_preprocessed_us_crime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Data Preprocessing

In [None]:
# Load the US crime data as a (train, test) pair, calling the loader with
# nb_classes=5.
train_data , test_data = load_preprocessed_us_crime(nb_classes=5)
# Each split unpacks into four items; the first two are ignored here and the
# last two are bound as the group vectors (later cells rebind them).
_, _, group_a, group_b = train_data
# Encoder for the sensitive-group columns; it is fitted in the baseline cell.
sensgroup = SensitiveGroups()

## Baseline

In [None]:
# Baseline: feature scaling followed by a plain logistic regression, with no
# bias-mitigation step. Its metrics are the reference for the later cells.
pipeline = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("model", LogisticRegression()),
    ],
)

# Fit on the training split; the group vectors are unpacked but not used by
# this pipeline.
X, y, group_a, group_b = train_data
pipeline.fit(X, y)

# Predict on the test split.
X, y, group_a, group_b = test_data
y_pred = pipeline.predict(X)

# Stack the two test group vectors side by side and fit the sensitive-group
# encoder on them; the later cells reuse it through `transform`.
p_attr = sensgroup.fit_transform(
    np.stack((group_a, group_b), axis=1),
    convert_numeric=True,
)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")

# Keep copies, because `y_pred` and `df` are overwritten by the next cells.
y_baseline, df_baseline = y_pred.copy(), df.copy()
df_baseline

## ML Debiaser

In [None]:
from holisticai.bias.mitigation import MLDebiaser

# Seed NumPy's global RNG before building and fitting the pipeline.
np.random.seed(10)

# Same scaler + classifier as the baseline, with MLDebiaser appended as the
# final step.
pipeline = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("model", LogisticRegression()),
        (
            "bm_postprocessing",
            MLDebiaser(sgd_steps=10_000, full_gradient_epochs=500, max_iter=5),
        ),
    ],
)

# The group vectors are passed as extra keyword arguments under the "bm__"
# prefix (presumably routed by the pipeline to the "bm_*" step).
X, y, group_a, group_b = train_data
fit_params = dict(bm__group_a=group_a, bm__group_b=group_b)
pipeline.fit(X, y, **fit_params)

X, y, group_a, group_b = test_data
predict_params = dict(bm__group_a=group_a, bm__group_b=group_b)
y_pred = pipeline.predict(X, **predict_params)

# Only `transform` is called here: the encoder was fitted in the baseline cell.
p_attr = sensgroup.transform(
    np.stack((group_a, group_b), axis=1),
    convert_numeric=True,
)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")

# Keep copies for the comparison table built further down.
y_mldebiaser, df_mldebiaser = y_pred.copy(), df.copy()
df_mldebiaser

## Reweighing

In [None]:
from holisticai.bias.mitigation import Reweighing

# Seed NumPy's global RNG before building and fitting the pipeline.
np.random.seed(10)

# Reweighing sits between the scaler and the classifier.
pipeline = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("bm_preprocessing", Reweighing()),
        ("model", LogisticRegression()),
    ],
)

# Train with the training-split group vectors.
X, y, group_a, group_b = train_data
fit_params = {"bm__group_a": group_a, "bm__group_b": group_b}
pipeline.fit(X, y, **fit_params)

# Predict with the test-split group vectors.
X, y, group_a, group_b = test_data
predict_params = {"bm__group_a": group_a, "bm__group_b": group_b}
y_pred = pipeline.predict(X, **predict_params)

# Only `transform` is called here: the encoder was fitted in the baseline cell.
p_attr = sensgroup.transform(
    np.stack((group_a, group_b), axis=1),
    convert_numeric=True,
)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")

# Keep copies for the comparison table built further down.
y_rw, df_rw = y_pred.copy(), df.copy()
df_rw

In [None]:
# Correlation Remover
from holisticai.bias.mitigation import CorrelationRemover

# CorrelationRemover sits between the scaler and the classifier.
pipeline = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        ("bm_preprocessing", CorrelationRemover()),
        ("model", LogisticRegression()),
    ],
)

# Train with the training-split group vectors.
X, y, group_a, group_b = train_data
fit_params = dict(bm__group_a=group_a, bm__group_b=group_b)
pipeline.fit(X, y, **fit_params)

# Predict with the test-split group vectors.
X, y, group_a, group_b = test_data
predict_params = dict(bm__group_a=group_a, bm__group_b=group_b)
y_pred = pipeline.predict(X, **predict_params)

# Only `transform` is called here: the encoder was fitted in the baseline cell.
p_attr = sensgroup.transform(
    np.stack((group_a, group_b), axis=1),
    convert_numeric=True,
)

df = multiclass_bias_metrics(p_attr, y_pred, y, metric_type="both")

# Keep copies for the comparison table built in the next cell.
y_cr, df_cr = y_pred.copy(), df.copy()
df_cr

In [None]:
# Put the four metric frames side by side, then keep columns 0, 2, 4 and 6
# plus column 7. Assuming each metrics frame has two columns, that is the
# first column of every frame and the last column of the final frame.
result = pd.concat((df_baseline, df_rw, df_cr, df_mldebiaser), axis=1)
result = result.iloc[:, [0, 2, 4, 6, 7]]
result.columns = [
    'Baseline',
    'Reweighing',
    'Correlation Remover',
    'ML Debiaser',
    'Reference',
]
result

### Fair Score Classifier

In [None]:
from holisticai.datasets import load_student
from holisticai.bias.mitigation import FairScoreClassifier
from sklearn.model_selection import train_test_split

In [None]:
def ohot_encoding(df):
    """Encode the low-cardinality columns of ``df`` and drop the others.

    Columns are visited in order and handled according to ``nunique()``,
    which does not count missing values:

    * more than 5 distinct values: the column is dropped;
    * 3 to 5 distinct values: the column is replaced by the indicator
      columns produced by ``pd.get_dummies(..., prefix=<column name>)``;
    * exactly 2 distinct values: ``object`` columns are mapped to 0/1 in
      order of first appearance (missing entries stay missing), columns of
      any other dtype are kept unchanged;
    * fewer than 2 distinct values: ``object`` columns are dropped, columns
      of any other dtype are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded columns in their original order, or an empty frame when
        no column is kept.
    """
    max_categories = 5
    pieces = []
    for col in df.columns:
        column = df[col]
        n_unique = column.nunique()

        if n_unique > max_categories:
            # Too many categories to one-hot encode: drop the column.
            continue

        if n_unique > 2:
            pieces.append(pd.get_dummies(column, prefix=col))
        elif column.dtype != object:
            # Binary or constant non-object columns are kept unchanged.
            pieces.append(column)
        elif n_unique == 2:
            # unique() keeps NaN while nunique() ignores it, so drop missing
            # values first to be sure the two real categories are mapped.
            first, second = column.dropna().unique()
            pieces.append(column.map({first: 0, second: 1}))
        # Otherwise this is a constant or all-missing object column. It is
        # skipped; previously it matched no branch and the previous column's
        # encoding was appended a second time.

    if not pieces:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame per column.
    return pd.concat(pieces, axis=1)

In [None]:
# load data
df = load_student()['frame']

# Make data multiclass by slicing into 4 buckets: each grade is replaced by
# the number of thresholds it strictly exceeds, i.e.
#   G3 <= 8 -> 0, 8 < G3 <= 11 -> 1, 11 < G3 <= 14 -> 2, G3 > 14 -> 3
y = df['G3'].to_numpy()
buckets = np.array([8, 11, 14])
y_cat = (y.reshape(-1, 1) > buckets.reshape(1, -1)).sum(axis=1)
df['target'] = y_cat

# map dictionary: bucket index -> readable class name
grade_dict = {0:'very-low', 1:'low', 2:'high',3:'very-high'}
df['target'] = df['target'].map(grade_dict)

# drop the other grade columns
df = df.drop(columns=['G1','G2','G3'])

df = ohot_encoding(df)
# drop_duplicates() returns a new frame, so the result has to be assigned;
# calling it without assignment left the duplicated rows in place.
df = df.drop_duplicates()
df.head()

In [None]:
# Select the one-hot target columns by their "target_" prefix instead of by
# position, so the result does not depend on the target being the last
# column encoded or on there being exactly four classes.
labels_name = [col for col in df.columns if str(col).startswith("target_")]
labels_name

In [None]:
# Hold out 40% of the rows for testing, with a fixed seed for repeatability.
train, test = train_test_split(df, test_size=0.4, random_state=42)

# Features are every column except the one-hot label columns.
X_train, y_train = train.drop(columns=labels_name), train[labels_name]
X_test, y_test = test.drop(columns=labels_name), test[labels_name]

In [None]:
# Set the objectives and the constraints
# NOTE(review): "ba" presumably stands for balanced accuracy — confirm
# against the FairScoreClassifier documentation.
objectives = "ba"
# An empty dict is passed as the constraints.
constraints = {}
# Names passed to `model.fit` as the protected attributes and labels; the
# label name matches one of the one-hot target columns.
protected_attributes = ["sex"]
protected_labels = ["target_very-high"]

In [None]:
# Create the model
# The objective string and the constraints dict are passed positionally.
model = FairScoreClassifier(objectives, constraints)

In [None]:
# Train the model
# y_train is the frame of one-hot label columns; the protected attributes and
# protected labels are given as lists of column names.
model.fit(X_train, y_train, protected_attributes, protected_labels)

In [None]:
# Predict
ypred = model.predict(X_test)

# Label the prediction columns with the one-hot target names, then reduce
# each row to the name of its highest-scoring column.
# NOTE(review): this frame gets a default index, while y_test and p_attr keep
# the row index from train_test_split. Confirm that the metric function
# compares them positionally.
y_pred = pd.DataFrame(ypred, columns=labels_name).idxmax(axis=1)

In [None]:
# Protected-attribute column(s) of the test features, kept as a DataFrame.
p_attr = X_test.loc[:, protected_attributes]

In [None]:
# Collapse the one-hot test labels to a single column holding, for each row,
# the name of the column with the largest value.
# NOTE(review): this rebinds y_test, so running the cell twice applies
# idxmax(axis=1) to the already-collapsed result.
y_test = y_test.idxmax(axis=1)

In [None]:
# Bias metrics for the FairScoreClassifier predictions. Unlike the earlier
# cells, no metric_type argument is passed here.
multiclass_bias_metrics(p_attr, y_pred, y_test)