In [1]:
import os
import sys
from glob import glob

import joblib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import ticker
from matplotlib.colors import to_hex

from credit_pipeline import training, evaluate

sys.path.append("../scripts")
from experiments import load_split, PROTECTED_ATTRIBUTES

# small fix to be able to load models
from credit_pipeline.training import EBE

%load_ext autoreload
%autoreload 2


2024-09-12 13:40:43.257307: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-12 13:40:43.292885: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-12 13:40:43.292905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-12 13:40:43.293920: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-12 13:40:43.299556: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-12 13:40:43.300975: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [46]:
def transform_to_table(metrics):
    """Summarize per-fold metrics as ``" & mean ± std"`` strings, one row per model.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Must contain a ``model`` column; every other column is treated as a
        numeric metric with one row per (model, fold).

    Returns
    -------
    pandas.DataFrame
        One row per model (sorted by model name) with ``model`` as the first
        column, followed by the metric columns in their original order. Each
        metric cell is the string ``" & <mean> ± <std>"`` rounded to 3 decimals
        (the leading ``" & "`` makes the cells pasteable into a LaTeX table row).

    Notes
    -----
    Columns whose name contains ``"diff"`` are aggregated with their sign;
    all other columns are aggregated on their absolute value.
    The input frame is not modified.
    """
    value_cols = [col for col in metrics.columns if col != "model"]

    # Aggregate only the value columns: ``.abs()`` is not defined for the string
    # ``model`` column. Raw arrays are used so that the duplicated index labels
    # produced by ``pd.concat`` over folds cannot trigger any realignment.
    prepared = {"model": metrics["model"].to_numpy()}
    for col in value_cols:
        values = metrics[col]
        prepared[col] = (values if "diff" in col else values.abs()).to_numpy()
    grouped = pd.DataFrame(prepared).groupby("model")

    metrics_mean = grouped.mean()
    metrics_std = grouped.std()

    table = pd.DataFrame(index=metrics_mean.index)
    for col in value_cols:
        table[col] = (
            " & "
            + metrics_mean[col].round(3).astype(str)
            + " ± "
            + metrics_std[col].round(3).astype(str)
        )
    return table.reset_index()

In [None]:
# small config to print the tables
# Raise the per-cell character limit (pandas `display.max_colwidth`) to 200.
pd.set_option('display.max_colwidth', 200)
# NOTE(review): pandas documents 0 as "auto-detect the terminal width" for
# `display.max_columns`; confirm this is the intended setting inside a notebook.
pd.set_option('display.max_columns', 0)
# Ask pandas not to wrap wide frames across several printed blocks.
pd.set_option('display.expand_frame_repr', False)

## Credit Models

In [3]:
def summarize_credit_experiment_perf(
    dataset_name, seed = 0, path = "../results/credit_models_unaware", n_folds = 10
):
    """Collect performance metrics of the saved credit models over all folds.

    For every fold, each pickled model found in ``{path}/{dataset_name}/{fold}``
    is loaded, a decision threshold is computed with ``training.ks_threshold``
    on the training-set scores, and ``evaluate.get_metrics`` is run on the
    validation and test splits.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder (also passed to ``load_split``).
    seed : int, default 0
        Seed forwarded to ``load_split``.
    path : str, default "../results/credit_models_unaware"
        Root folder holding the fitted models.
    n_folds : int, default 10
        Number of folds to iterate over (``0 .. n_folds - 1``).

    Returns
    -------
    (metrics_val, metrics_test)
        Concatenation over folds of what ``evaluate.get_metrics`` returns for
        the validation and the test split. Nothing is printed; use
        ``transform_to_table`` to format the result.
    """
    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        X_train, _, Y_train, X_val, _, Y_val, X_test, _, Y_test = load_split(
            dataset_name, fold, seed, unaware = True
        )

        # glob order depends on the filesystem; sort for reproducible output.
        models_files = sorted(glob(f"{path}/{dataset_name}/{fold}/*.pkl"))

        model_dict = {}
        for file in models_files:
            file_name = os.path.basename(file)
            # skip pickles that are not models; test the file name only so a
            # "study" substring in a parent directory cannot hide every model
            if "study" in file_name:
                continue

            model = joblib.load(file)
            model_name = file_name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            model_dict[model_name] = [model, threshold]

        metrics_folds_val.append(evaluate.get_metrics(model_dict, X_val, Y_val))
        metrics_folds_test.append(evaluate.get_metrics(model_dict, X_test, Y_test))

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [None]:
# Gather the test-split performance metrics of the credit models per dataset;
# the validation metrics (first returned value) are discarded here.
test_metrics = {}
for dataset in ["german", "taiwan", "homecredit"]:
    _, t = summarize_credit_experiment_perf(dataset, 0)
    test_metrics[dataset] = t

In [8]:
# Print one formatted "mean ± std" table per dataset.
for dataset in test_metrics:
    print(dataset)
    print(transform_to_table(test_metrics[dataset]))
    print("\n\n")

german
                    model            AUC    Brier Score Balanced Accuracy  \
0          LGBMClassifier    0.71 ± 0.02  0.225 ± 0.027     0.656 ± 0.033   
1      LogisticRegression  0.761 ± 0.008  0.195 ± 0.004     0.701 ± 0.014   
2           MLPClassifier   0.755 ± 0.03    0.2 ± 0.015     0.696 ± 0.033   
3  RandomForestClassifier  0.748 ± 0.017  0.208 ± 0.011     0.688 ± 0.025   

        Accuracy      Precision         Recall             F1  
0  0.694 ± 0.022  0.596 ± 0.061  0.521 ± 0.157  0.536 ± 0.079  
1   0.71 ± 0.032   0.59 ± 0.053  0.669 ± 0.089   0.621 ± 0.02  
2  0.715 ± 0.026   0.595 ± 0.04  0.634 ± 0.099  0.609 ± 0.048  
3  0.704 ± 0.028  0.582 ± 0.055  0.634 ± 0.079  0.602 ± 0.032  



taiwan
                    model            AUC    Brier Score Balanced Accuracy  \
0          LGBMClassifier  0.793 ± 0.001   0.15 ± 0.021     0.721 ± 0.004   
1      LogisticRegression   0.77 ± 0.001  0.161 ± 0.025     0.709 ± 0.001   
2           MLPClassifier  0.782 ± 0.006  0.13

## Fairness of Credit Models

In [48]:
def summarize_fair_baselines(dataset_name, seed = 0):
    """Compute fairness metrics of the aware and unaware credit models over 10 folds.

    Models are read from ``../results/credit_models_unaware`` and
    ``../results/credit_models``; their names get the suffix ``_unaware`` or
    ``_aware`` according to the folder they were loaded from. Each model is
    thresholded with ``training.ks_threshold`` (computed on training-set
    scores) and its hard predictions are passed to
    ``evaluate.get_fairness_metrics``.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder (also passed to ``load_split``).
    seed : int, default 0
        Seed forwarded to ``load_split``.

    Returns
    -------
    (metrics_val, metrics_test)
        Concatenation over folds of the fairness metrics on the validation and
        test splits. Both frames carry a ``fold`` column.
    """
    # (name suffix, results folder); unaware models are listed first.
    model_sources = [
        ("unaware", "../results/credit_models_unaware"),
        ("aware", "../results/credit_models"),
    ]

    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(10):
        model_dict_val = {}
        model_dict_test = {}
        # NOTE(review): the split is loaded without ``unaware = True`` and is fed
        # to both model families; confirm the unaware models accept these features.
        X_train, _, Y_train, X_val, A_val, Y_val, X_test, A_test, Y_test = load_split(
            dataset_name, fold, seed
        )

        for suffix, path in model_sources:
            # glob order depends on the filesystem; sort for reproducible output.
            for file in sorted(glob(f"{path}/{dataset_name}/{fold}/*.pkl")):
                file_name = os.path.basename(file)
                if "study" in file_name:
                    continue  # not a model

                model = joblib.load(file)
                model_name = file_name.split(".")[0] + "_" + suffix

                Y_train_pred = model.predict_proba(X_train)[:, 1]
                threshold = training.ks_threshold(Y_train, Y_train_pred)
                Y_val_pred = model.predict_proba(X_val)[:, 1]
                model_dict_val[model_name] = (Y_val_pred > threshold).astype(int)
                Y_test_pred = model.predict_proba(X_test)[:, 1]
                model_dict_test[model_name] = (Y_test_pred > threshold).astype(int)

        metrics_folds_val.append(
            evaluate.get_fairness_metrics(model_dict_val, X_val, Y_val, A_val, benefit_class = 0)
        )
        # tag validation rows with the fold as well, consistently with test rows
        metrics_folds_val[-1]["fold"] = fold
        metrics_folds_test.append(
            evaluate.get_fairness_metrics(model_dict_test, X_test, Y_test, A_test, benefit_class = 0)
        )
        metrics_folds_test[-1]["fold"] = fold

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [5]:
class Thr_helper:
    """Adapter that binds a fixed ``sensitive_features`` argument to a model.

    The wrapped model's ``predict`` must accept a ``sensitive_features``
    keyword; this helper exposes a plain ``predict(X)`` instead.
    """

    def __init__(self, model, sensitive_features):
        # keep both pieces so ``predict`` can forward them later
        self.model, self.sensitive_features = model, sensitive_features

    def predict(self, X):
        """Return ``model.predict(X)`` called with the stored sensitive features."""
        groups = self.sensitive_features
        predictions = self.model.predict(X, sensitive_features=groups)
        return predictions

In [53]:
# Gather the test-split fairness metrics of the aware/unaware baselines per
# dataset; the validation metrics (first returned value) are discarded here.
test_metrics = {}
for dataset in ["german", "taiwan", "homecredit"]:
    _, t = summarize_fair_baselines(dataset, 0)
    test_metrics[dataset] = t

In [54]:
# Print, per dataset, the formatted table restricted to the selected metric columns.
for dataset in test_metrics:
    print(dataset)
    print(transform_to_table(test_metrics[dataset][["model", "balanced_accuracy", "EOD", "DPD", "APVD"]]))
    print("\n\n")

german
                            model balanced_accuracy               EOD               DPD              APVD
0            LGBMClassifier_aware   & 0.632 ± 0.041     & 0.09 ± 0.06   & 0.066 ± 0.037    & 0.13 ± 0.031
1          LGBMClassifier_unaware   & 0.656 ± 0.033   & 0.041 ± 0.035   & 0.048 ± 0.025   & 0.103 ± 0.041
2        LogisticRegression_aware   & 0.697 ± 0.024    & 0.193 ± 0.06   & 0.124 ± 0.055   & 0.137 ± 0.026
3      LogisticRegression_unaware   & 0.701 ± 0.014   & 0.073 ± 0.037    & 0.041 ± 0.03   & 0.094 ± 0.028
4             MLPClassifier_aware   & 0.672 ± 0.049   & 0.177 ± 0.169   & 0.145 ± 0.147    & 0.132 ± 0.04
5           MLPClassifier_unaware   & 0.696 ± 0.033   & 0.066 ± 0.058   & 0.086 ± 0.047   & 0.088 ± 0.025
6    RandomForestClassifier_aware   & 0.672 ± 0.027    & 0.181 ± 0.07   & 0.103 ± 0.065   & 0.132 ± 0.024
7  RandomForestClassifier_unaware   & 0.688 ± 0.025    & 0.12 ± 0.056   & 0.067 ± 0.044   & 0.112 ± 0.018



taiwan
                            m

## Fair Models

In [29]:
def _add_baseline_diffs(metrics, baselines, baselines_comparison):
    """Add ``diff_bal_acc`` and ``diff_eod`` columns to ``metrics`` and return it.

    Each row of ``metrics`` is compared with the baseline row that has the same
    ``fold`` and whose ``model`` is ``baselines_comparison[row model]``.
    ``diff_bal_acc`` is the signed balanced-accuracy difference and ``diff_eod``
    is ``|EOD| - |baseline EOD|``. Raises ``KeyError`` when a model has no entry
    in ``baselines_comparison`` or no matching baseline row exists.
    """
    # Keep the first baseline row per (model, fold).
    baseline_lookup = {}
    for base_model, base_fold, base_acc, base_eod in zip(
        baselines["model"], baselines["fold"], baselines["balanced_accuracy"], baselines["EOD"]
    ):
        baseline_lookup.setdefault((base_model, base_fold), (base_acc, base_eod))

    diff_bal_acc = []
    diff_eod = []
    for model, fold, bal_acc, eod in zip(
        metrics["model"], metrics["fold"], metrics["balanced_accuracy"], metrics["EOD"]
    ):
        baseline_acc, baseline_eod = baseline_lookup[(baselines_comparison[model], fold)]
        diff_bal_acc.append(bal_acc - baseline_acc)
        diff_eod.append(abs(eod) - abs(baseline_eod))

    # list assignment is positional, so duplicated index labels are harmless
    metrics["diff_bal_acc"] = diff_bal_acc
    metrics["diff_eod"] = diff_eod
    return metrics


def summarize_fair_methods(dataset_name, goal = 0.01, seed = 0):
    """Compute fairness metrics of the fair models and their gap to the baselines.

    For each of the 10 folds:

    * the unaware baselines in ``../results/credit_models_unaware`` are
      thresholded with ``training.ks_threshold`` and evaluated;
    * the fair models in ``../results/fair_models`` that expose
      ``predict_proba`` are evaluated the same way (the
      ``DemographicParityClassifier`` / ``EqualOpportunityClassifier`` models
      receive the sensitive attribute as an extra ``Z`` column);
    * models whose file name contains ``thr`` only output hard predictions, so
      they are evaluated through ``Thr_helper`` on features transformed by a
      pipeline from ``training.create_pipeline`` fitted on the training split.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset sub-folder (also passed to ``load_split``).
    goal : float, default 0.01
        Currently not used by this function; kept so existing calls keep working.
    seed : int, default 0
        Seed forwarded to ``load_split``.

    Returns
    -------
    (metrics_val, metrics_test)
        Fairness metrics per (model, fold) with the extra columns ``fold``,
        ``diff_bal_acc`` and ``diff_eod`` (difference to the baseline named in
        ``baselines_comparison``). Columns are returned in reversed order.
    """
    baselines_comparison = {
        "DemographicParityClassifier" : "LogisticRegression",
        "EqualOpportunityClassifier" : "LogisticRegression",
        "FairGBMClassifier" : "LGBMClassifier",
        "rw_LogisticRegression" : "LogisticRegression",
        "rw_MLPClassifier" : "MLPClassifier",
        "rw_RandomForestClassifier" : "RandomForestClassifier",
        "rw_LGBMClassifier" : "LGBMClassifier",
        "thr_LogisticRegression" : "LogisticRegression",
        "thr_MLPClassifier" : "MLPClassifier",
        "thr_RandomForestClassifier" : "RandomForestClassifier",
        "thr_LGBMClassifier" : "LGBMClassifier",
    }
    # models that expect the sensitive attribute as a feature column named "Z"
    needs_sensitive_column = ("DemographicParityClassifier", "EqualOpportunityClassifier")

    path_baselines = "../results/credit_models_unaware"
    path_fair = "../results/fair_models"

    metrics_folds_val = []
    metrics_folds_test = []
    baseline_acc_val = []
    baseline_acc_test = []
    for fold in range(10):
        model_dict_val = {}
        model_dict_test = {}
        X_train, A_train, Y_train, X_val, A_val, Y_val, X_test, A_test, Y_test = load_split(
            dataset_name, fold, seed, unaware = True
        )

        # get baselines balanced accuracy and eod
        # (glob order depends on the filesystem; sort for reproducible output)
        for file in sorted(glob(f"{path_baselines}/{dataset_name}/{fold}/*.pkl")):
            file_name = os.path.basename(file)
            if "study" in file_name:
                continue  # not a model

            model = joblib.load(file)
            model_name = file_name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)

            Y_val_pred = model.predict_proba(X_val)[:, 1]
            Y_val_pred = (Y_val_pred > threshold).astype(int)
            baseline_acc_val.append(
                evaluate.get_fairness_metrics({model_name : Y_val_pred}, X_val, Y_val, A_val, benefit_class = 0)
            )
            baseline_acc_val[-1]["fold"] = fold

            Y_test_pred = model.predict_proba(X_test)[:, 1]
            Y_test_pred = (Y_test_pred > threshold).astype(int)
            baseline_acc_test.append(
                evaluate.get_fairness_metrics({model_name : Y_test_pred}, X_test, Y_test, A_test, benefit_class = 0)
            )
            baseline_acc_test[-1]["fold"] = fold

        # split the fair-model files into probability-based and threshold-optimizer models
        fair_files = [
            file
            for file in sorted(glob(f"{path_fair}/{dataset_name}/{fold}/*.pkl"))
            if "study" not in os.path.basename(file)
        ]
        thr_files = [file for file in fair_files if "thr" in os.path.basename(file)]
        proba_files = [file for file in fair_files if "thr" not in os.path.basename(file)]

        # evaluate fair models
        for file in proba_files:
            file_name = os.path.basename(file)
            if any(tag in file_name for tag in needs_sensitive_column):
                # work on copies so the split frames are never left mutated
                X_train_in = X_train.assign(Z = A_train)
                X_val_in = X_val.assign(Z = A_val)
                X_test_in = X_test.assign(Z = A_test)
            else:
                X_train_in, X_val_in, X_test_in = X_train, X_val, X_test

            model = joblib.load(file)
            model_name = file_name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train_in)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            Y_val_pred = model.predict_proba(X_val_in)[:, 1]
            model_dict_val[model_name] = (Y_val_pred > threshold).astype(int)
            Y_test_pred = model.predict_proba(X_test_in)[:, 1]
            model_dict_test[model_name] = (Y_test_pred > threshold).astype(int)

        metrics_folds_val.append(
            evaluate.get_fairness_metrics(model_dict_val, X_val, Y_val, A_val, benefit_class = 0)
        )
        metrics_folds_val[-1]["fold"] = fold
        metrics_folds_test.append(
            evaluate.get_fairness_metrics(model_dict_test, X_test, Y_test, A_test, benefit_class = 0)
        )
        metrics_folds_test[-1]["fold"] = fold

        # Threshold Optimizer needs a different procedure as it does not predict probabilities
        if thr_files:
            # The preprocessing does not depend on the model, so fit it once per fold.
            # ``dataset_name`` (the parameter) selects ``crit``, not a notebook global.
            pipeline_preprocess = training.create_pipeline(
                X_train, Y_train, crit = 4 if dataset_name == "homecredit" else 3
            )
            pipeline_preprocess.fit(X_train, Y_train)
            X_val_preprocessed = pipeline_preprocess.transform(X_val)
            X_test_preprocessed = pipeline_preprocess.transform(X_test)

        for file in thr_files:
            model = joblib.load(file)
            model_name = os.path.basename(file).split(".")[0]
            Y_val_pred = Thr_helper(model, A_val).predict(X_val_preprocessed)
            Y_test_pred = Thr_helper(model, A_test).predict(X_test_preprocessed)

            metrics_folds_val.append(
                evaluate.get_fairness_metrics(
                    {model_name : Y_val_pred}, X_val_preprocessed, Y_val, A_val, benefit_class = 0
                )
            )
            metrics_folds_val[-1]["fold"] = fold
            metrics_folds_test.append(
                evaluate.get_fairness_metrics(
                    {model_name : Y_test_pred}, X_test_preprocessed, Y_test, A_test, benefit_class = 0
                )
            )
            metrics_folds_test[-1]["fold"] = fold

    metrics_val = _add_baseline_diffs(
        pd.concat(metrics_folds_val), pd.concat(baseline_acc_val), baselines_comparison
    )
    metrics_test = _add_baseline_diffs(
        pd.concat(metrics_folds_test), pd.concat(baseline_acc_test), baselines_comparison
    )

    # invert the order of columns
    columns = metrics_val.columns.tolist()[::-1]

    return metrics_val[columns], metrics_test[columns]
        

In [26]:
# Per-dataset value passed as the `goal` argument of summarize_fair_methods.
fairness_goals = {"german" : 0.05, "taiwan" : 0.01, "homecredit" : 0.05}

In [None]:
# Gather the test-split metrics of the fair models per dataset; the validation
# metrics (first returned value) are discarded here.
test_metrics = {}
for dataset in ["german", "taiwan", "homecredit"]:
    _, t = summarize_fair_methods(dataset, fairness_goals[dataset], 0)
    test_metrics[dataset] = t

In [47]:
# Print, per dataset, the formatted table with the baseline differences first
# followed by the raw fairness/performance columns.
for dataset in test_metrics:
    print(dataset)
    print(transform_to_table(test_metrics[dataset][["model", "diff_bal_acc", "diff_eod", "balanced_accuracy", "EOD", "DPD", "APVD"]]))
    print("\n\n")

german
                          model       diff_bal_acc           diff_eod balanced_accuracy               EOD               DPD              APVD
0   DemographicParityClassifier     & 0.009 ± 0.02    & -0.023 ± 0.05    & 0.71 ± 0.014    & 0.05 ± 0.032    & 0.061 ± 0.05   & 0.065 ± 0.024
1    EqualOpportunityClassifier    & 0.007 ± 0.018    & -0.02 ± 0.058    & 0.708 ± 0.02   & 0.053 ± 0.042   & 0.043 ± 0.041    & 0.074 ± 0.02
2             FairGBMClassifier   & -0.037 ± 0.079    & 0.016 ± 0.072   & 0.618 ± 0.075   & 0.057 ± 0.067   & 0.052 ± 0.038   & 0.129 ± 0.078
3             rw_LGBMClassifier   & -0.035 ± 0.047   & -0.001 ± 0.059    & 0.62 ± 0.069    & 0.04 ± 0.041    & 0.038 ± 0.04   & 0.123 ± 0.041
4         rw_LogisticRegression   & -0.012 ± 0.038    & 0.023 ± 0.064   & 0.689 ± 0.033   & 0.097 ± 0.049   & 0.057 ± 0.039   & 0.092 ± 0.018
5              rw_MLPClassifier    & -0.036 ± 0.07    & 0.006 ± 0.083   & 0.661 ± 0.068   & 0.071 ± 0.044   & 0.079 ± 0.055    & 0.08 ± 0.029