In [1]:
import os
import sys
from glob import glob

import joblib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import ticker
from matplotlib.colors import to_hex

from credit_pipeline import training, evaluate

sys.path.append("../scripts")
from experiments import load_split, PROTECTED_ATTRIBUTES

# small fix to be able to load models
from credit_pipeline.training import EBE

%load_ext autoreload
%autoreload 2


2024-09-04 09:21:28.296509: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-04 09:21:28.330551: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-04 09:21:28.330568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-04 09:21:28.331508: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 09:21:28.336949: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-04 09:21:28.338368: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [3]:
def transform_to_table(metrics):
    """Summarize per-fold metrics as "mean ± std" strings, one row per model.

    The absolute value of every metric is taken first, then the mean and the
    sample standard deviation are computed per model and rounded to 3 decimals.

    Parameters
    ----------
    metrics : pandas.DataFrame
        One row per (model, fold). The model name is read from a "model"
        column or, if there is no such column, from an index level named
        "model". Every other column must support ``abs()``.

    Returns
    -------
    pandas.DataFrame
        A "model" column followed by one string column per metric, formatted
        as ``"<mean> ± <std>"``. Rows are sorted by model name. The input
        frame is not modified.
    """
    # Move the grouping key into the index so that abs()/mean()/std() only ever
    # see the metric columns. The previous groupby(...).apply(lambda x: x.abs())
    # depended on how the installed pandas version treats the (string) grouping
    # column inside apply, which has changed across releases.
    if "model" in metrics.columns:
        values = metrics.set_index("model")
    else:
        values = metrics

    grouped = values.abs().groupby(level="model")
    metrics_mean = grouped.mean()
    metrics_std = grouped.std()

    table = pd.DataFrame(
        {
            col: metrics_mean[col].round(3).astype(str)
            + " ± "
            + metrics_std[col].round(3).astype(str)
            for col in metrics_mean.columns
        },
        index=metrics_mean.index,
    )
    return table.reset_index()

In [4]:
def summarize_credit_experiment_perf(dataset_name, seed = 0, n_folds = 10):
    """Collect the performance metrics of the "unaware" credit models.

    For each fold, every ``*.pkl`` file found in
    ``../results/credit_models_unaware/{dataset_name}/{fold}/`` whose file name
    does not contain "study" is loaded as a model. Its decision threshold is
    obtained from ``training.ks_threshold`` on the training-split scores, and
    ``evaluate.get_metrics`` is then called on the validation and test splits.

    Nothing is printed here; pass a returned frame to ``transform_to_table``
    to obtain the mean ± std summary.

    Parameters
    ----------
    dataset_name : str
        Dataset sub-folder name, also forwarded to ``load_split``.
    seed : int, default 0
        Forwarded to ``load_split``.
    n_folds : int, default 10
        Number of folds to read (folds ``0 .. n_folds - 1``).

    Returns
    -------
    (metrics_val, metrics_test) : tuple of pandas.DataFrame
        Concatenation over folds of the frames returned by
        ``evaluate.get_metrics`` for the validation and the test split.
    """
    path = "../results/credit_models_unaware"
    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        # Only features and labels are used; the protected-attribute outputs
        # of load_split are discarded.
        X_train, _, Y_train, X_val, _, Y_val, X_test, _, Y_test = load_split(
            dataset_name, fold, seed, unaware = True
        )

        model_dict = {}
        # glob returns files in arbitrary order; sorting keeps the row order of
        # the resulting frames reproducible across machines.
        for file in sorted(glob(f"{path}/{dataset_name}/{fold}/*.pkl")):
            # Work on the file name only: splitting the path on "/" is not
            # portable, and a "study" substring elsewhere in the path must not
            # exclude every model.
            file_name = os.path.basename(file)
            if "study" in file_name:
                # not a model
                continue

            # NOTE(review): joblib.load unpickles arbitrary objects; only run
            # this on result files you trust.
            model = joblib.load(file)
            model_name = file_name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            model_dict[model_name] = [model, threshold]

        metrics_folds_val.append(evaluate.get_metrics(model_dict, X_val, Y_val))
        metrics_folds_test.append(evaluate.get_metrics(model_dict, X_test, Y_test))

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [7]:
# Gather, per dataset, the second frame returned by the performance summary
# (the test-split metrics); the validation frame is not needed here.
test_metrics = {}
for dataset in ("german", "taiwan", "homecredit"):
    test_metrics[dataset] = summarize_credit_experiment_perf(dataset, 0)[1]

In [8]:
# Print the dataset name, its summary table and a few blank lines as a separator.
for dataset in test_metrics:
    print(dataset, transform_to_table(test_metrics[dataset]), "\n\n", sep="\n")

german
                    model            AUC    Brier Score Balanced Accuracy  \
0          LGBMClassifier    0.71 ± 0.02  0.225 ± 0.027     0.656 ± 0.033   
1      LogisticRegression  0.761 ± 0.008  0.195 ± 0.004     0.701 ± 0.014   
2           MLPClassifier   0.755 ± 0.03    0.2 ± 0.015     0.696 ± 0.033   
3  RandomForestClassifier  0.748 ± 0.017  0.208 ± 0.011     0.688 ± 0.025   

        Accuracy      Precision         Recall             F1  
0  0.694 ± 0.022  0.596 ± 0.061  0.521 ± 0.157  0.536 ± 0.079  
1   0.71 ± 0.032   0.59 ± 0.053  0.669 ± 0.089   0.621 ± 0.02  
2  0.715 ± 0.026   0.595 ± 0.04  0.634 ± 0.099  0.609 ± 0.048  
3  0.704 ± 0.028  0.582 ± 0.055  0.634 ± 0.079  0.602 ± 0.032  



taiwan
                    model            AUC    Brier Score Balanced Accuracy  \
0          LGBMClassifier  0.793 ± 0.001   0.15 ± 0.021     0.721 ± 0.004   
1      LogisticRegression   0.77 ± 0.001  0.161 ± 0.025     0.709 ± 0.001   
2           MLPClassifier  0.782 ± 0.006  0.13

In [19]:
def _fair_model_files(results_dir, dataset_name, fold):
    """Return the sorted ``*.pkl`` paths of one fold, skipping "study" files."""
    # glob returns files in arbitrary order; sort for a reproducible row order.
    files = sorted(glob(f"{results_dir}/{dataset_name}/{fold}/*.pkl"))
    return [file for file in files if "study" not in os.path.basename(file)]


def _fair_model_name(file):
    """Return the file name of ``file`` up to its first dot."""
    return os.path.basename(file).split(".")[0]


def _fair_fit_pipeline(dataset_name, X_train, Y_train):
    """Re-create the preprocessing pipeline and fit it on the training split."""
    # It was necessary to compute the pipeline again due to pickle error
    if dataset_name == "homecredit":
        pipeline_preprocess = training.create_pipeline(X_train, Y_train, crit=4)
    else:
        pipeline_preprocess = training.create_pipeline(X_train, Y_train)
    pipeline_preprocess.fit(X_train, Y_train)
    return pipeline_preprocess


def _fair_ks_predictions(model, X_train, Y_train, X_val, X_test):
    """Binarize val/test scores with the KS threshold of the training scores."""
    Y_train_pred = model.predict_proba(X_train)[:, 1]
    threshold = training.ks_threshold(Y_train, Y_train_pred)
    Y_val_pred = (model.predict_proba(X_val)[:, 1] > threshold).astype(int)
    Y_test_pred = (model.predict_proba(X_test)[:, 1] > threshold).astype(int)
    return Y_val_pred, Y_test_pred


def _fair_fold_metrics(preds_val, preds_test, val_split, test_split, fold):
    """Compute val/test fairness metrics of one fold; tag the test frame with ``fold``.

    ``val_split`` and ``test_split`` are ``(X, Y, A)`` tuples.
    """
    metrics_val = evaluate.get_fairness_metrics(preds_val, *val_split, benefit_class = 0)
    metrics_test = evaluate.get_fairness_metrics(preds_test, *test_split, benefit_class = 0)
    # NOTE(review): only the test frame gets a "fold" column, as before.
    metrics_test["fold"] = fold
    return metrics_val, metrics_test


def summarize_credit_experiment_fair(dataset_name, seed = 0, n_folds = 10):
    """Collect fairness metrics of the fair, aware and unaware credit models.

    Three groups of pickled models are evaluated on each fold:

    * models in ``../results/fair_models`` and ``../results/credit_models``
      whose file name does not contain "thr": predictions are the
      ``predict_proba`` scores binarized with ``training.ks_threshold``
      computed on the training split;
    * models from the same folders whose file name contains "thr": they are
      called as ``predict(X_preprocessed, sensitive_features=A)`` because the
      Threshold Optimizer does not predict probabilities;
    * models in ``../results/credit_models_unaware``: evaluated on the
      ``unaware = True`` split and reported with an ``"_unaware"`` suffix.

    The protected attribute is the column
    ``PROTECTED_ATTRIBUTES[dataset_name] + "_0"`` of the features transformed
    by a preprocessing pipeline that is re-fitted on the training split.

    Parameters
    ----------
    dataset_name : str
        Dataset sub-folder name, also forwarded to ``load_split``.
    seed : int, default 0
        Forwarded to ``load_split``.
    n_folds : int, default 10
        Number of folds to read (folds ``0 .. n_folds - 1``).

    Returns
    -------
    (metrics_val, metrics_test) : tuple of pandas.DataFrame
        Concatenation of the frames returned by
        ``evaluate.get_fairness_metrics``. The test frames additionally carry
        a "fold" column.
    """
    path_1 = "../results/fair_models"
    path_2 = "../results/credit_models"
    path_unaware = "../results/credit_models_unaware"
    protected_column = PROTECTED_ATTRIBUTES[dataset_name] + "_0"

    metrics_folds_val = []
    metrics_folds_test = []

    for fold in range(n_folds):
        X_train, _, Y_train, X_val, _, Y_val, X_test, _, Y_test = load_split(
            dataset_name, fold, seed
        )

        pipeline_preprocess = _fair_fit_pipeline(dataset_name, X_train, Y_train)
        X_val_preprocessed = pipeline_preprocess.transform(X_val)
        A_val = X_val_preprocessed[protected_column]
        X_test_preprocessed = pipeline_preprocess.transform(X_test)
        A_test = X_test_preprocessed[protected_column]

        models_files = (
            _fair_model_files(path_1, dataset_name, fold)
            + _fair_model_files(path_2, dataset_name, fold)
        )
        # "thr" is looked up in the file name only, so that a folder name can
        # never change which procedure a model goes through.
        thr_files = [file for file in models_files if "thr" in os.path.basename(file)]
        proba_files = [file for file in models_files if "thr" not in os.path.basename(file)]

        # Models that output probabilities: all of them share one metrics call.
        model_dict_val = {}
        model_dict_test = {}
        for file in proba_files:
            # NOTE(review): joblib.load unpickles arbitrary objects; only run
            # this on result files you trust.
            model = joblib.load(file)
            model_name = _fair_model_name(file)
            model_dict_val[model_name], model_dict_test[model_name] = _fair_ks_predictions(
                model, X_train, Y_train, X_val, X_test
            )

        fold_val, fold_test = _fair_fold_metrics(
            model_dict_val,
            model_dict_test,
            (X_val, Y_val, A_val),
            (X_test, Y_test, A_test),
            fold,
        )
        metrics_folds_val.append(fold_val)
        metrics_folds_test.append(fold_test)

        # Threshold Optimizer needs a different procedure as it does not predict
        # probabilities: it is fed the preprocessed features together with the
        # protected attribute, and each model gets its own metrics frame.
        for file in thr_files:
            model = joblib.load(file)
            model_name = _fair_model_name(file)
            Y_val_pred = model.predict(X_val_preprocessed, sensitive_features=A_val)
            Y_test_pred = model.predict(X_test_preprocessed, sensitive_features=A_test)

            fold_val, fold_test = _fair_fold_metrics(
                {model_name : Y_val_pred},
                {model_name : Y_test_pred},
                (X_val_preprocessed, Y_val, A_val),
                (X_test_preprocessed, Y_test, A_test),
                fold,
            )
            metrics_folds_val.append(fold_val)
            metrics_folds_test.append(fold_test)

    # The unaware models need to be evaluated separately because they use a
    # different set of features.
    for fold in range(n_folds):
        # The aware split is loaded only to rebuild the pipeline and read the
        # protected attribute from the preprocessed features.
        X_train, _, Y_train, X_val, _, _, X_test, _, _ = load_split(
            dataset_name, fold, seed, unaware = False
        )
        pipeline_preprocess = _fair_fit_pipeline(dataset_name, X_train, Y_train)
        A_val = pipeline_preprocess.transform(X_val)[protected_column]
        A_test = pipeline_preprocess.transform(X_test)[protected_column]

        # The models themselves are scored on the unaware split.
        X_train, _, Y_train, X_val, _, Y_val, X_test, _, Y_test = load_split(
            dataset_name, fold, seed, unaware = True
        )

        model_dict_val = {}
        model_dict_test = {}
        for file in _fair_model_files(path_unaware, dataset_name, fold):
            model = joblib.load(file)
            model_name = _fair_model_name(file) + "_unaware"
            model_dict_val[model_name], model_dict_test[model_name] = _fair_ks_predictions(
                model, X_train, Y_train, X_val, X_test
            )

        fold_val, fold_test = _fair_fold_metrics(
            model_dict_val,
            model_dict_test,
            (X_val, Y_val, A_val),
            (X_test, Y_test, A_test),
            fold,
        )
        metrics_folds_val.append(fold_val)
        metrics_folds_test.append(fold_test)

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [20]:
# Gather, per dataset, the second frame returned by the fairness summary
# (the test-split metrics); the validation frame is not needed here.
test_metrics = {}
for dataset in ("german", "taiwan", "homecredit"):
    test_metrics[dataset] = summarize_credit_experiment_fair(dataset, seed=0)[1]

In [22]:
# Print, per dataset, the summary table restricted to the model name and the
# four reported metrics, followed by a few blank lines as a separator.
fairness_columns = ["model", "balanced_accuracy", "EOD", "DPD", "APVD"]
for dataset in test_metrics:
    print(
        dataset,
        transform_to_table(test_metrics[dataset][fairness_columns]),
        "\n\n",
        sep="\n",
    )

german
                             model balanced_accuracy            EOD  \
0      DemographicParityClassifier     0.707 ± 0.015  0.076 ± 0.083   
1       EqualOpportunityClassifier      0.71 ± 0.014  0.092 ± 0.059   
2                FairGBMClassifier     0.638 ± 0.058   0.06 ± 0.046   
3                   LGBMClassifier     0.632 ± 0.041    0.09 ± 0.06   
4           LGBMClassifier_unaware     0.656 ± 0.033  0.041 ± 0.035   
5               LogisticRegression     0.697 ± 0.024   0.193 ± 0.06   
6       LogisticRegression_unaware     0.701 ± 0.014  0.073 ± 0.037   
7                    MLPClassifier     0.672 ± 0.049  0.177 ± 0.169   
8            MLPClassifier_unaware     0.696 ± 0.033  0.066 ± 0.058   
9           RandomForestClassifier     0.672 ± 0.027   0.181 ± 0.07   
10  RandomForestClassifier_unaware     0.688 ± 0.025   0.12 ± 0.056   
11               rw_LGBMClassifier     0.652 ± 0.059  0.055 ± 0.042   
12           rw_LogisticRegression      0.697 ± 0.03  0.055 ± 0.046   