In [1]:
import sys
from glob import glob
from pathlib import Path

import joblib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import ticker
from matplotlib.colors import to_hex

from credit_pipeline import training, evaluate

sys.path.append("../scripts")
from experiments import load_split, PROTECTED_ATTRIBUTES

# small fix to be able to load models
from credit_pipeline.training import EBE

%load_ext autoreload
%autoreload 2


2024-04-19 08:50:45.863468: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-19 08:50:45.897811: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 08:50:45.897831: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-19 08:50:45.898879: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-19 08:50:45.904480: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-19 08:50:45.905380: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
def transform_to_table(metrics):
    """Summarise per-fold metrics as one "mean ± std" string per model.

    Parameters
    ----------
    metrics : pandas.DataFrame
        One row per (model, fold) with a ``"model"`` column; every other
        column must be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per model (sorted by model name) with a ``"model"`` column
        followed by the metric columns. Each metric cell is the string
        ``"<mean> ± <std>"`` of the absolute metric values, both rounded to
        3 decimals. The input frame is not modified.
    """
    # Move the string "model" column into the index before taking abs(), so
    # the numeric operation never touches the grouping key. This avoids
    # relying on groupby.apply's TypeError fallback / grouping-column handling.
    abs_metrics = metrics.set_index("model").abs()
    grouped = abs_metrics.groupby(level="model")
    metrics_mean = grouped.mean()
    metrics_std = grouped.std()  # sample std; NaN for single-row groups

    table = pd.DataFrame(
        {
            col: metrics_mean[col].round(3).astype(str)
            + " ± "
            + metrics_std[col].round(3).astype(str)
            for col in metrics_mean.columns
        },
        index=metrics_mean.index,
    )
    return table.reset_index()

In [3]:
def summarize_credit_experiment_perf(
    dataset_name, seed=0, n_folds=10, path="../results/credit_models"
):
    """Collect validation and test performance metrics of the saved credit models.

    For every fold, each ``*.pkl`` file under ``{path}/{dataset_name}/{fold}``
    whose file name does not contain ``"study"`` is loaded as a model. Its
    decision threshold is computed with ``training.ks_threshold`` from its
    training-set scores, and ``evaluate.get_metrics`` is then evaluated on the
    validation and test splits.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier passed to ``load_split`` and used as sub-folder name.
    seed : int, default 0
        Seed passed to ``load_split``.
    n_folds : int, default 10
        Number of folds to iterate over (``0 .. n_folds - 1``).
    path : str, default "../results/credit_models"
        Root folder holding the persisted models.

    Returns
    -------
    tuple
        ``(metrics_val, metrics_test)``: the per-fold results of
        ``evaluate.get_metrics`` concatenated with ``pandas.concat``.
    """
    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split(
            dataset_name, fold, seed
        )

        # No preprocessing pipeline is fitted here: the saved models are called
        # on the raw splits, and performance metrics need no sensitive attribute.

        # sorted() makes the model order reproducible (glob order is arbitrary).
        models_files = sorted(glob(f"{path}/{dataset_name}/{fold}/*.pkl"))

        model_dict = {}
        for file in models_files:
            file_name = Path(file).name  # separator-agnostic, unlike split("/")
            # remove the ones that are not models
            if "study" in file_name:
                continue
            # NOTE(review): joblib.load unpickles arbitrary objects; only point
            # `path` at trusted result folders.
            model = joblib.load(file)
            model_name = file_name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            model_dict[model_name] = [model, threshold]

        metrics_folds_val.append(evaluate.get_metrics(model_dict, X_val, Y_val))
        metrics_folds_test.append(evaluate.get_metrics(model_dict, X_test, Y_test))

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [5]:
# Performance metrics on the homecredit test splits (seed 0); the validation
# metrics returned first are discarded.
_, homecredit_test_metrics = summarize_credit_experiment_perf("homecredit", 0)

In [6]:
# Display one "mean ± std" row per model, aggregated over the folds.
transform_to_table(homecredit_test_metrics)

Unnamed: 0,model,AUC,Brier Score,Balanced Accuracy,Accuracy,Precision,Recall,F1
0,LGBMClassifier,0.756 ± 0.001,0.117 ± 0.064,0.688 ± 0.002,0.696 ± 0.009,0.164 ± 0.003,0.679 ± 0.012,0.264 ± 0.003
1,LogisticRegression,0.737 ± 0.0,0.207 ± 0.0,0.675 ± 0.0,0.682 ± 0.006,0.156 ± 0.002,0.667 ± 0.008,0.252 ± 0.002
2,MLPClassifier,0.741 ± 0.0,0.069 ± 0.0,0.679 ± 0.001,0.69 ± 0.015,0.159 ± 0.004,0.666 ± 0.019,0.257 ± 0.004
3,RandomForestClassifier,0.744 ± 0.001,0.103 ± 0.055,0.679 ± 0.002,0.682 ± 0.011,0.157 ± 0.003,0.675 ± 0.013,0.255 ± 0.003


In [4]:
# Rebuild one split (homecredit, fold 0, seed 0) together with a freshly
# fitted preprocessing pipeline, so a single model can be inspected by hand.
dataset_name = "homecredit"
fold = 0
seed = 0
X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split(
    dataset_name, fold, seed
)
# homecredit is built with crit=4, every other dataset with the defaults
# (same rule as in the summarize_* functions). The else branch keeps this cell
# valid when dataset_name is changed; without it pipeline_preprocess would be
# undefined or left over from a previous run.
if dataset_name == "homecredit":
    pipeline_preprocess = training.create_pipeline(X_train, Y_train, crit=4)
else:
    pipeline_preprocess = training.create_pipeline(X_train, Y_train)
pipeline_preprocess.fit(X_train, Y_train)
X_train_preprocessed = pipeline_preprocess.transform(X_train)
X_val_preprocessed = pipeline_preprocess.transform(X_val)
# Sensitive attribute = preprocessed column "<protected attribute>_0"
# (presumably an encoded indicator column; confirm against the pipeline).
A_val = X_val_preprocessed[PROTECTED_ATTRIBUTES[dataset_name] + "_0"]
X_test_preprocessed = pipeline_preprocess.transform(X_test)
A_test = X_test_preprocessed[PROTECTED_ATTRIBUTES[dataset_name] + "_0"]




In [5]:

# Load the persisted fold-0 homecredit MLP and score the training split.
file = "../results/credit_models/homecredit/0/MLPClassifier.pkl"
model_name = file.rpartition("/")[2].partition(".")[0]
model = joblib.load(file)
train_proba = model.predict_proba(X_train)
Y_train_pred = train_proba[:, 1]  # second predict_proba column

print(Y_train_pred)

[0.03814841 0.01438359 0.0849669  ... 0.1868208  0.03891562 0.11964767]


In [7]:
# Threshold chosen on the training scores, then applied to the test scores
# to obtain hard 0/1 predictions for the fairness metrics.
threshold = training.ks_threshold(Y_train, Y_train_pred)
test_scores = model.predict_proba(X_test)[:, 1]
Y_test_pred = (test_scores > threshold).astype(int)
model_dict = {"MLPClassifier": Y_test_pred}

evaluate.get_fairness_metrics(model_dict, X_test, Y_test, A_test)

Unnamed: 0,model,DPD,EOD,AOD,APVD,GMA,balanced_accuracy
0,MLPClassifier,-0.166772,-0.16628,-0.161041,-0.011135,0.690298,0.676642


In [15]:
def summarize_credit_experiment_fair(dataset_name, seed=0, n_folds=10):
    """Collect validation and test fairness metrics of the saved models.

    For every fold, ``*.pkl`` models are read from
    ``../results/fair_models/{dataset_name}/{fold}`` and
    ``../results/credit_models/{dataset_name}/{fold}`` (files whose name
    contains ``"study"`` are skipped) and handled in two groups:

    * models without ``"thr"`` in their file name are scored with
      ``predict_proba`` on the raw splits and binarised with the threshold
      returned by ``training.ks_threshold`` on the training scores; all of
      them are evaluated together in a single ``get_fairness_metrics`` call;
    * models with ``"thr"`` in their file name are predicted with
      ``predict(X, sensitive_features=...)`` on the preprocessed splits and
      evaluated one at a time.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier; also a key of ``PROTECTED_ATTRIBUTES``.
    seed : int, default 0
        Seed passed to ``load_split``.
    n_folds : int, default 10
        Number of folds to iterate over (``0 .. n_folds - 1``).

    Returns
    -------
    tuple
        ``(metrics_val, metrics_test)``: the per-fold results of
        ``evaluate.get_fairness_metrics`` (called with ``benefit_class=0``)
        concatenated with ``pandas.concat``. Only the test frames receive a
        ``"fold"`` column.
    """
    path_1 = "../results/fair_models"
    path_2 = "../results/credit_models"
    sensitive_col = PROTECTED_ATTRIBUTES[dataset_name] + "_0"

    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split(
            dataset_name, fold, seed
        )

        # It was necessary to compute the pipeline again due to pickle error
        if dataset_name == "homecredit":
            pipeline_preprocess = training.create_pipeline(X_train, Y_train, crit=4)
        else:
            pipeline_preprocess = training.create_pipeline(X_train, Y_train)
        pipeline_preprocess.fit(X_train, Y_train)
        # Only the val/test splits are transformed; the preprocessed training
        # split is not needed anywhere below.
        X_val_preprocessed = pipeline_preprocess.transform(X_val)
        A_val = X_val_preprocessed[sensitive_col]
        X_test_preprocessed = pipeline_preprocess.transform(X_test)
        A_test = X_test_preprocessed[sensitive_col]

        # Fair models first, then credit models; sorted() makes the order
        # reproducible (glob order is arbitrary). Filters look at the file name
        # only, so folder names cannot trigger them by accident.
        models_files = [
            file
            for root in (path_1, path_2)
            for file in sorted(glob(f"{root}/{dataset_name}/{fold}/*.pkl"))
            if "study" not in Path(file).name
        ]
        score_files = [f for f in models_files if "thr" not in Path(f).name]
        thr_files = [f for f in models_files if "thr" in Path(f).name]

        # Models exposing predict_proba: threshold from the training scores.
        model_dict_val = {}
        model_dict_test = {}
        for file in score_files:
            # NOTE(review): joblib.load unpickles arbitrary objects; only read
            # trusted result folders.
            model = joblib.load(file)
            model_name = Path(file).name.split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            Y_val_pred = model.predict_proba(X_val)[:, 1]
            model_dict_val[model_name] = (Y_val_pred > threshold).astype(int)
            Y_test_pred = model.predict_proba(X_test)[:, 1]
            model_dict_test[model_name] = (Y_test_pred > threshold).astype(int)

        metrics_folds_val.append(
            evaluate.get_fairness_metrics(
                model_dict_val, X_val, Y_val, A_val, benefit_class=0
            )
        )
        fold_test_metrics = evaluate.get_fairness_metrics(
            model_dict_test, X_test, Y_test, A_test, benefit_class=0
        )
        fold_test_metrics["fold"] = fold
        metrics_folds_test.append(fold_test_metrics)

        # Threshold Optimizer needs a different procedure as it does not predict
        # probabilities: it predicts labels from the preprocessed features plus
        # the sensitive attribute.
        for file in thr_files:
            model = joblib.load(file)
            model_name = Path(file).name.split(".")[0]
            Y_val_pred = model.predict(X_val_preprocessed, sensitive_features=A_val)
            Y_test_pred = model.predict(
                X_test_preprocessed, sensitive_features=A_test
            )

            metrics_folds_val.append(
                evaluate.get_fairness_metrics(
                    {model_name: Y_val_pred},
                    X_val_preprocessed,
                    Y_val,
                    A_val,
                    benefit_class=0,
                )
            )
            thr_test_metrics = evaluate.get_fairness_metrics(
                {model_name: Y_test_pred},
                X_test_preprocessed,
                Y_test,
                A_test,
                benefit_class=0,
            )
            thr_test_metrics["fold"] = fold
            metrics_folds_test.append(thr_test_metrics)

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [16]:
# Test-split fairness metrics for each dataset (seed 0); the validation
# metrics returned first are discarded.
# Note: this rebinds homecredit_test_metrics, which previously held the
# performance metrics.
_, german_test_metrics = summarize_credit_experiment_fair("german", seed=0)
_, taiwan_test_metrics = summarize_credit_experiment_fair("taiwan", seed=0)
_, homecredit_test_metrics = summarize_credit_experiment_fair("homecredit", seed=0)

In [21]:
# "GMA" and "fold" are dropped so only the remaining metrics are aggregated;
# the table is shown as a list of per-model records.
transform_to_table(german_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'DemographicParityClassifier',
  'DPD': '0.059 ± 0.058',
  'EOD': '0.076 ± 0.083',
  'AOD': '0.058 ± 0.06',
  'APVD': '0.084 ± 0.034',
  'balanced_accuracy': '0.707 ± 0.015'},
 {'model': 'EqualOpportunityClassifier',
  'DPD': '0.05 ± 0.045',
  'EOD': '0.092 ± 0.059',
  'AOD': '0.047 ± 0.051',
  'APVD': '0.086 ± 0.025',
  'balanced_accuracy': '0.71 ± 0.014'},
 {'model': 'FairGBMClassifier',
  'DPD': '0.028 ± 0.019',
  'EOD': '0.06 ± 0.046',
  'AOD': '0.028 ± 0.018',
  'APVD': '0.096 ± 0.019',
  'balanced_accuracy': '0.638 ± 0.058'},
 {'model': 'LGBMClassifier',
  'DPD': '0.066 ± 0.037',
  'EOD': '0.09 ± 0.06',
  'AOD': '0.078 ± 0.051',
  'APVD': '0.13 ± 0.031',
  'balanced_accuracy': '0.632 ± 0.041'},
 {'model': 'LogisticRegression',
  'DPD': '0.124 ± 0.055',
  'EOD': '0.193 ± 0.06',
  'AOD': '0.136 ± 0.06',
  'APVD': '0.137 ± 0.026',
  'balanced_accuracy': '0.697 ± 0.024'},
 {'model': 'MLPClassifier',
  'DPD': '0.145 ± 0.147',
  'EOD': '0.177 ± 0.169',
  'AOD': '0.149 ± 0.14

In [22]:
# "GMA" and "fold" are dropped so only the remaining metrics are aggregated;
# the table is shown as a list of per-model records.
transform_to_table(taiwan_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'DemographicParityClassifier',
  'DPD': '0.063 ± 0.021',
  'EOD': '0.036 ± 0.021',
  'AOD': '0.052 ± 0.02',
  'APVD': '0.031 ± 0.01',
  'balanced_accuracy': '0.708 ± 0.003'},
 {'model': 'EqualOpportunityClassifier',
  'DPD': '0.062 ± 0.005',
  'EOD': '0.036 ± 0.006',
  'AOD': '0.052 ± 0.006',
  'APVD': '0.03 ± 0.003',
  'balanced_accuracy': '0.707 ± 0.002'},
 {'model': 'FairGBMClassifier',
  'DPD': '0.053 ± 0.01',
  'EOD': '0.029 ± 0.011',
  'AOD': '0.036 ± 0.011',
  'APVD': '0.033 ± 0.004',
  'balanced_accuracy': '0.72 ± 0.004'},
 {'model': 'LGBMClassifier',
  'DPD': '0.072 ± 0.007',
  'EOD': '0.049 ± 0.007',
  'AOD': '0.054 ± 0.008',
  'APVD': '0.025 ± 0.003',
  'balanced_accuracy': '0.721 ± 0.002'},
 {'model': 'LogisticRegression',
  'DPD': '0.078 ± 0.019',
  'EOD': '0.051 ± 0.02',
  'AOD': '0.068 ± 0.018',
  'APVD': '0.022 ± 0.009',
  'balanced_accuracy': '0.709 ± 0.001'},
 {'model': 'MLPClassifier',
  'DPD': '0.099 ± 0.015',
  'EOD': '0.074 ± 0.016',
  'AOD': '0.086 ± 0

In [23]:
# "GMA" and "fold" are dropped so only the remaining metrics are aggregated;
# the table is shown as a list of per-model records.
transform_to_table(homecredit_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'FairGBMClassifier',
  'DPD': '0.097 ± 0.009',
  'EOD': '0.087 ± 0.009',
  'AOD': '0.088 ± 0.009',
  'APVD': '0.021 ± 0.002',
  'balanced_accuracy': '0.683 ± 0.003'},
 {'model': 'LGBMClassifier',
  'DPD': '0.162 ± 0.003',
  'EOD': '0.153 ± 0.003',
  'AOD': '0.147 ± 0.003',
  'APVD': '0.011 ± 0.0',
  'balanced_accuracy': '0.688 ± 0.002'},
 {'model': 'LogisticRegression',
  'DPD': '0.182 ± 0.002',
  'EOD': '0.174 ± 0.003',
  'AOD': '0.166 ± 0.003',
  'APVD': '0.01 ± 0.0',
  'balanced_accuracy': '0.675 ± 0.0'},
 {'model': 'MLPClassifier',
  'DPD': '0.178 ± 0.007',
  'EOD': '0.168 ± 0.007',
  'AOD': '0.165 ± 0.005',
  'APVD': '0.01 ± 0.001',
  'balanced_accuracy': '0.679 ± 0.001'},
 {'model': 'RandomForestClassifier',
  'DPD': '0.162 ± 0.012',
  'EOD': '0.153 ± 0.011',
  'AOD': '0.147 ± 0.014',
  'APVD': '0.012 ± 0.002',
  'balanced_accuracy': '0.679 ± 0.002'},
 {'model': 'rw_LGBMClassifier',
  'DPD': '0.026 ± 0.006',
  'EOD': '0.015 ± 0.006',
  'AOD': '0.014 ± 0.008',
  'APVD':