In [86]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import sys


sys.path.append('../scripts')
import data
import models
import utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
# Optuna search space for the fairness-aware model. Each entry is the keyword
# set forwarded to the matching `trial.suggest_*` call, plus a "type" tag that
# selects which suggest method is used.
param_space_fair = dict(
    min_child_weight=dict(type="float", low=0.01, high=100, log=True),
    n_estimators=dict(type="int", low=20, high=50),
    eta=dict(type="float", low=0.01, high=0.5),
    max_depth=dict(type="int", low=2, high=10),
    l2_weight=dict(type="float", low=0.01, high=100, log=True),
    fair_weight=dict(type="float", low=0.01, high=10, log=True),
)

# Baseline search space: identical to the one above except that `fair_weight`
# is pinned to 0 (low == high). The inner dicts are copied so the two spaces
# never share mutable state.
param_space_xgb = {name: dict(spec) for name, spec in param_space_fair.items()}
param_space_xgb["fair_weight"] = dict(type="float", low=0, high=0)

In [69]:
X_train, Y_train, X_val, Y_val, X_test, Y_test = data.get_fold("german2", 0, 0)
cat_features = [
    "CheckingAccount",
    "CreditHistory",
    "Purpose",
    "SavingsAccount",
    "EmploymentSince",
    "Gender",
    "OtherDebtors",
    "Property",
    "OtherInstallmentPlans",
    "Housing",
    "Job",
    "Telephone",
    "ForeignWorker",
]
# Every column that is not listed as categorical is treated as numeric.
num_features = X_train.columns.difference(cat_features).tolist()


def _sensitive_group(X):
    """Return one label per row of the form ``<Gender>_<Age > 50>``."""
    return X.Gender + "_" + (X.Age > 50).astype(str)


# Define sensitive attribute from gender and age
groups_train = _sensitive_group(X_train)
groups_val = _sensitive_group(X_val)
groups_test = _sensitive_group(X_test)

# Integer code for each group label, in order of first appearance in the
# training split.
sensitive_map = {attr: i for i, attr in enumerate(groups_train.unique())}

A_train = groups_train.map(sensitive_map)
A_val = groups_val.map(sensitive_map)
A_test = groups_test.map(sensitive_map)

# `Series.map` yields NaN for labels missing from the dict. Because the codes
# come from the training split only, a group that appears solely in the
# validation/test split would otherwise be turned into NaN without any error.
for split_name, groups, codes in (
    ("validation", groups_val, A_val),
    ("test", groups_test, A_test),
):
    unseen = groups[codes.isna()].unique().tolist()
    if unseen:
        raise ValueError(
            f"Sensitive groups {unseen} occur in the {split_name} split "
            "but not in the training split"
        )

In [70]:
# Standardise the numeric columns and one-hot encode the categorical ones;
# the transformer is configured to return pandas DataFrames.
numeric_step = ("numeric", StandardScaler(), num_features)
categorical_step = (
    "categorical",
    OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"),
    cat_features,
)
col_trans = ColumnTransformer(
    [numeric_step, categorical_step],
    verbose_feature_names_out=False,
)
col_trans.set_output(transform="pandas")

preprocess = Pipeline([("preprocess", col_trans)])

# Fit on the training split only, then apply the fitted transform to all splits.
preprocess.fit(X_train)
X_train_preprocessed, X_val_preprocessed, X_test_preprocessed = (
    preprocess.transform(split) for split in (X_train, X_val, X_test)
)

# Put the encoded sensitive attribute in the first column of every split.
for frame, sensitive in (
    (X_train_preprocessed, A_train),
    (X_val_preprocessed, A_val),
    (X_test_preprocessed, A_test),
):
    frame.insert(0, "Sensitive", sensitive)

In [84]:
def run_trial(
        trial,
        X_train,
        Y_train,
        X_val,
        Y_val,
        model_class,
        param_space,
        random_state = None
):
    """Sample hyperparameters from ``trial``, fit a model and score it.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int``, ``suggest_float`` and
        ``suggest_categorical`` (e.g. an Optuna trial).
    X_train, Y_train
        Data passed to ``model.fit``.
    X_val, Y_val
        Data passed to ``model.score``.
    model_class
        Callable invoked as ``model_class(**params)``; the result must provide
        ``fit`` and ``score``.
    param_space : dict
        Maps each hyperparameter name to a spec dict. The spec's ``"type"``
        entry (``"int"``, ``"float"`` or ``"categorical"``) selects the suggest
        method; all remaining entries are forwarded to it as keyword arguments.
    random_state
        Value passed to the model as its ``seed`` argument.

    Returns
    -------
    The value of ``model.score(X_val, Y_val)``.

    Raises
    ------
    ValueError
        If a spec declares a ``"type"`` other than the three supported ones.
        Such entries used to be skipped silently, which dropped the
        hyperparameter from the search without any warning.
    """
    suggest_by_type = {
        "int": trial.suggest_int,
        "categorical": trial.suggest_categorical,
        "float": trial.suggest_float,
    }

    params = {}
    for name, spec in param_space.items():
        kind = spec["type"]
        if kind not in suggest_by_type:
            raise ValueError(
                f"Unsupported parameter type {kind!r} for {name!r}; "
                f"expected one of {sorted(suggest_by_type)}"
            )
        # Forward everything except the "type" tag; the spec itself is left untouched.
        suggest_kwargs = {key: value for key, value in spec.items() if key != "type"}
        params[name] = suggest_by_type[kind](name, **suggest_kwargs)

    params["seed"] = random_state
    model = model_class(**params)
    model.fit(X_train, Y_train)
    return model.score(X_val, Y_val)

In [87]:
# Seed for the Optuna sampler. Without it every run proposes a different
# sequence of trials, so the search cannot be reproduced after a kernel restart.
SEED = 0


def objective(trial):
    """Validation score of one XtremeFair configuration drawn from `param_space_fair`."""
    return run_trial(
        trial,
        X_train_preprocessed,
        Y_train,
        X_val_preprocessed,
        Y_val,
        models.XtremeFair,
        param_space_fair,
    )


# TPE is the sampler Optuna uses by default for single-objective studies, so
# passing it explicitly only adds the seed.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [89]:
# Refit on the training split with the best hyperparameters found by the study,
# then report the metrics on the test split.
best_params = study.best_params
model = models.XtremeFair(**best_params)
model.fit(X_train_preprocessed, Y_train)

test_score = model.score(X_test_preprocessed, Y_test)
print(f"score: {test_score:.3f}")

# Column 1 of `predict_proba` is what gets passed to the equalized-loss metric.
test_proba = model.predict_proba(X_test_preprocessed)[:, 1]
test_equalized_loss = utils.equalized_loss_score(Y_test, test_proba, A_test)
print(f"equalized loss: {test_equalized_loss:.3f}")

score: 0.760
equalized loss: 0.505


In [91]:
# Seed for the Optuna sampler. Without it every run proposes a different
# sequence of trials, so the search cannot be reproduced after a kernel restart.
SEED = 0


def objective(trial):
    """Validation score of one XtremeFair configuration drawn from `param_space_xgb`.

    `param_space_xgb` pins `fair_weight` to 0 (presumably switching the
    fairness term off to obtain a baseline; confirm against `models.XtremeFair`).
    """
    return run_trial(
        trial,
        X_train_preprocessed,
        Y_train,
        X_val_preprocessed,
        Y_val,
        models.XtremeFair,
        param_space_xgb,
    )


# TPE is the sampler Optuna uses by default for single-objective studies, so
# passing it explicitly only adds the seed.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [93]:
# Refit on the training split with the best hyperparameters found by the study,
# then report the metrics on the test split.
best_params = study.best_params
model = models.XtremeFair(**best_params)
model.fit(X_train_preprocessed, Y_train)

test_score = model.score(X_test_preprocessed, Y_test)
print(f"score: {test_score:.3f}")

# Column 1 of `predict_proba` is what gets passed to the equalized-loss metric.
test_proba = model.predict_proba(X_test_preprocessed)[:, 1]
test_equalized_loss = utils.equalized_loss_score(Y_test, test_proba, A_test)
print(f"equalized loss: {test_equalized_loss:.3f}")

score: 0.710
equalized loss: 0.632
