In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
from tqdm import tqdm
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import sys
import os


sys.path.append('../scripts')
import data
import models
import utils

%load_ext autoreload
%autoreload 2

In [2]:
def run_trial(
        trial,
        scorer,
        X_train,
        Y_train,
        A_train,
        X_val,
        Y_val,
        A_val,
        model_class,
        param_space,
        random_state = None
):
    """Objective for one tuning trial: sample parameters, fit, score on validation.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int``, ``suggest_categorical`` and
        ``suggest_float`` (an Optuna trial in this notebook).
    scorer
        Callable invoked as ``scorer(Y_val, Y_val_pred, A_val)``; its return
        value is the result of this function.
    X_train, Y_train, A_train
        Passed unchanged to ``model.fit``.
    X_val, Y_val, A_val
        ``X_val`` is passed to ``model.predict``; ``Y_val`` and ``A_val`` go
        to ``scorer``.
    model_class
        Callable accepting the sampled parameters (plus ``seed``) as keyword
        arguments and returning an object with ``fit`` and ``predict``.
    param_space
        Mapping ``name -> spec``. Each spec is a dict with a ``"type"`` key
        (``"int"``, ``"categorical"`` or ``"float"``); every other key is
        forwarded as a keyword argument to the matching ``suggest_*`` method.
    random_state
        Stored under the ``"seed"`` key of the model parameters.

    Returns
    -------
    Whatever ``scorer`` returns.
    """
    supported_kinds = ("int", "categorical", "float")

    sampled = {}
    for param_name, spec in param_space.items():
        kind = spec["type"]
        if kind not in supported_kinds:
            # Specs with any other "type" are skipped: nothing is sampled
            # and the parameter is left out of the model arguments.
            continue
        suggest = getattr(trial, "suggest_" + kind)
        suggest_kwargs = {key: val for key, val in spec.items() if key != "type"}
        sampled[param_name] = suggest(param_name, **suggest_kwargs)

    # The seed is set after sampling, so it wins over a sampled "seed" entry.
    sampled["seed"] = random_state

    candidate = model_class(**sampled)
    candidate.fit(X_train, Y_train, A_train)
    val_predictions = candidate.predict(X_val)
    return scorer(Y_val, val_predictions, A_val)

In [4]:
def get_subgroup_feature(dataset, X_train, X_val, X_test):
    """Build an integer-coded subgroup attribute for the train/val/test splits.

    For ``dataset == "german2"`` each row is labelled
    ``"<Gender>_<Age > 50>"`` (e.g. ``"M_True"``) and the labels are then
    mapped to integer codes in order of first appearance in the training
    split.

    Parameters
    ----------
    dataset : str
        Dataset identifier. Only ``"german2"`` is supported.
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames exposing ``Gender`` and ``Age`` columns.

    Returns
    -------
    tuple of pandas.Series
        ``(A_train, A_val, A_test)`` holding the subgroup codes. The codes
        come from the training split only, so a subgroup that appears only
        in validation or test is mapped to ``NaN``.

    Raises
    ------
    ValueError
        If ``dataset`` has no subgroup definition (previously this surfaced
        as an ``UnboundLocalError`` further down).
    """
    if dataset != "german2":
        raise ValueError(
            f"No subgroup definition for dataset {dataset!r}; supported: 'german2'"
        )

    def _subgroup_labels(X):
        # Combine gender with an "older than 50" flag into one string label.
        return X.Gender.astype(str) + "_" + (X.Age > 50).astype(str)

    A_train, A_val, A_test = (
        _subgroup_labels(X) for X in (X_train, X_val, X_test)
    )

    # Codes follow the order in which labels first appear in the training data.
    sensitive_map = {label: code for code, label in enumerate(A_train.unique())}
    return tuple(A.map(sensitive_map) for A in (A_train, A_val, A_test))

In [5]:
def eval_model(y_ground, y_prob, y_pred, A):
    """Collect performance and fairness scores for one set of predictions.

    Parameters
    ----------
    y_ground
        Ground-truth labels.
    y_prob
        Predicted scores, used for ``roc`` and ``eq_loss``.
    y_pred
        Hard predictions, used for ``acc``, ``eod`` and ``spd``.
    A
        Sensitive / subgroup attribute passed to the ``utils`` fairness scores.

    Returns
    -------
    dict
        Keys ``"acc"``, ``"roc"``, ``"eq_loss"``, ``"eod"``, ``"spd"``.
    """
    return {
        "acc": accuracy_score(y_ground, y_pred),
        "roc": roc_auc_score(y_ground, y_prob),
        "eq_loss": utils.equalized_loss_score(y_ground, y_prob, A),
        "eod": utils.equal_opportunity_score(y_ground, y_pred, A),
        "spd": utils.statistical_parity_score(y_ground, y_pred, A),
    }

In [6]:
def get_model(model_name):
    """Return a factory ``model(**params)`` for the requested model variant.

    All variants build a ``models.XtremeFair`` instance:

    * ``"XtremeFair"``      - parameters are passed through unchanged.
    * ``"XtremeFair_grad"`` - additionally sets ``dual_learning="gradient"``.
    * ``"XGBClassifier"``   - asserts ``params["fair_weight"] == 0`` before
      building the model.

    Parameters
    ----------
    model_name : str
        One of the names listed above.

    Returns
    -------
    callable
        Factory accepting keyword parameters and returning a model instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` on ``return model``).
    """
    if model_name == "XtremeFair":
        def model(**params):
            return models.XtremeFair(**params)
    elif model_name == "XtremeFair_grad":
        def model(**params):
            return models.XtremeFair(dual_learning="gradient", **params)
    elif model_name == "XGBClassifier":
        def model(**params):
            # This variant is only valid with fair_weight fixed at 0.
            assert params["fair_weight"] == 0
            return models.XtremeFair(**params)
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of "
            "'XtremeFair', 'XtremeFair_grad', 'XGBClassifier'"
        )
    return model
    
def get_param_spaces(model_name):
    """Return the hyper-parameter search space for ``model_name``.

    ``"XtremeFair"`` and ``"XtremeFair_grad"`` share the ``"XtremeFair"``
    entry of ``models.PARAM_SPACES``; ``"XGBClassifier"`` uses its own entry.

    Parameters
    ----------
    model_name : str
        One of ``"XtremeFair"``, ``"XtremeFair_grad"``, ``"XGBClassifier"``.

    Returns
    -------
    The matching entry of ``models.PARAM_SPACES``.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported. Previously the function returned
        ``None`` here, which only failed later when the space was iterated.
    """
    space_keys = {
        "XtremeFair": "XtremeFair",
        "XtremeFair_grad": "XtremeFair",
        "XGBClassifier": "XGBClassifier",
    }
    if model_name not in space_keys:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(space_keys)}"
        )
    return models.PARAM_SPACES[space_keys[model_name]]

In [7]:
def subgroup_experiment(args):
    """Tune, refit and evaluate a model on every fold, writing results to disk.

    For each fold the function:

    1. loads the split via ``data.get_fold(dataset, fold, 0)``
       (NOTE(review): meaning of the trailing ``0`` is not visible here);
    2. derives the subgroup attribute with ``get_subgroup_feature``;
    3. scales numeric features and one-hot encodes categorical ones,
       fitting the transformer on the training split only;
    4. runs an Optuna study maximising the combined ``acc`` / ``eod`` scorer
       on the validation split;
    5. refits the model with the best parameters, picks a decision threshold
       on the training split and evaluates on the test split.

    Parameters
    ----------
    args : dict
        Required keys: ``"dataset"``, ``"alpha"``, ``"output_dir"``,
        ``"model_name"``, ``"n_trials"``. Optional key ``"n_folds"``
        (default 10) sets how many folds are run.

    Side effects
    ------------
    Creates ``args["output_dir"]`` if needed and writes:

    * ``best_params_fold<i>.txt`` - best parameters of fold ``i``;
    * ``best_params.txt`` - best parameters of the most recent fold
      (kept for backward compatibility);
    * ``results.csv`` - one row of metrics per fold;
    * ``results_mean.csv`` - the column means of ``results.csv``.
    """
    # One seed for the sampler, the tuning trials and the final refit, so the
    # evaluated model is the same one the study selected.
    seed = 0
    n_folds = args.get("n_folds", 10)
    output_dir = args["output_dir"]

    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the factory and search space once instead of on every trial.
    model_class = get_model(args["model_name"])
    param_space = get_param_spaces(args["model_name"])

    results = []

    cat_features = data.CAT_FEATURES[args["dataset"]]
    num_features = data.NUM_FEATURES[args["dataset"]]
    col_trans = ColumnTransformer(
        [
            ("numeric", StandardScaler(), num_features),
            (
                "categorical",
                OneHotEncoder(
                    drop="if_binary", sparse_output=False, handle_unknown="ignore"
                ),
                cat_features,
            ),
        ],
        verbose_feature_names_out=False,
    )
    col_trans.set_output(transform="pandas")
    scorer = utils.get_combined_metrics_scorer(
        alpha=args["alpha"], performance_metric="acc", fairness_metric="eod"
    )

    for i in tqdm(range(n_folds)):
        # Load and prepare data
        X_train, Y_train, X_val, Y_val, X_test, Y_test = data.get_fold(
            args["dataset"], i, 0
        )

        # Define sensitive attribute from gender and age (needs the raw
        # columns, so this happens before preprocessing).
        A_train, A_val, A_test = get_subgroup_feature(
            args["dataset"], X_train, X_val, X_test
        )

        # Fit the preprocessing on the training split only.
        preprocess = Pipeline([("preprocess", col_trans)])
        preprocess.fit(X_train)
        X_train = preprocess.transform(X_train)
        X_val = preprocess.transform(X_val)
        X_test = preprocess.transform(X_test)

        # Seeded sampler so a rerun proposes the same parameter sequence.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=seed),
        )

        def objective(trial):
            return run_trial(
                trial,
                scorer,
                X_train,
                Y_train,
                A_train,
                X_val,
                Y_val,
                A_val,
                model_class,
                param_space,
                seed,
            )

        study.optimize(objective, n_trials=args["n_trials"])

        # Save best params: one file per fold so earlier folds are not
        # overwritten, plus the original file name holding the latest fold.
        best_params_text = str(study.best_params)
        for file_name in (f"best_params_fold{i}.txt", "best_params.txt"):
            with open(os.path.join(output_dir, file_name), "w") as f:
                f.write(best_params_text)

        # Refit with the same factory and seed used during tuning.
        # study.best_params holds only the sampled values, so the seed and
        # any variant-specific arguments must be added back here.
        model = model_class(**{**study.best_params, "seed": seed})
        model.fit(X_train, Y_train, A_train)
        y_prob = model.predict_proba(X_train)[:, 1]
        thresh = utils.get_best_threshold(Y_train, y_prob)
        y_prob_test = model.predict_proba(X_test)[:, 1]
        y_pred_test = y_prob_test > thresh

        metrics = eval_model(Y_test, y_prob_test, y_pred_test, A_test)
        results.append(metrics)

    results = pd.DataFrame(results)
    results.to_csv(os.path.join(output_dir, "results.csv"))
    results.mean().to_csv(os.path.join(output_dir, "results_mean.csv"))



In [8]:
# Subgroup experiment on german2 with the "XtremeFair" model variant.
args = dict(
    dataset="german2",
    alpha=1,
    output_dir="../results/subgroup_experiment/german/XtremeFair_1",
    model_name="XtremeFair",
    n_trials=100,
)
subgroup_experiment(args)

  0%|          | 0/10 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 10/10 [00:47<00:00,  4.79s/it]


In [9]:
# Subgroup experiment on german2 with the "XtremeFair_grad" model variant.
args = dict(
    dataset="german2",
    alpha=1,
    output_dir="../results/subgroup_experiment/german/XtremeFair_grad",
    model_name="XtremeFair_grad",
    n_trials=100,
)
subgroup_experiment(args)

  0%|          | 0/10 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 10/10 [00:35<00:00,  3.56s/it]


In [10]:
# Subgroup experiment on german2 with the "XGBClassifier" baseline variant.
args = dict(
    dataset="german2",
    alpha=1,
    output_dir="../results/subgroup_experiment/german/XGBClassifier",
    model_name="XGBClassifier",
    n_trials=100,
)
subgroup_experiment(args)

  0%|          | 0/10 [00:00<?, ?it/s]

  loss = -(y_ground * np.log(y_prob) + (1 - y_ground) * np.log(1 - y_prob))
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 10/10 [00:25<00:00,  2.50s/it]
