In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
import matplotlib.patches as mpatches
from matplotlib import ticker
import joblib
from glob import glob
import sys
from credit_pipeline import training, evaluate

sys.path.append("../scripts")
from experiments import load_split, PROTECTED_ATTRIBUTES

# small fix to be able to load models
from credit_pipeline.training import EBE


2024-04-17 10:05:07.725370: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-17 10:05:07.763511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-17 10:05:07.763533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-17 10:05:07.764706: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-17 10:05:07.770990: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-17 10:05:07.772306: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [24]:
def transform_to_table(metrics):
    """Build a per-model summary table formatted as ``"mean ± std"`` strings.

    The rows of ``metrics`` are grouped by ``"model"``. Within each group the
    absolute value of every entry is taken, and the mean and standard
    deviation are computed per column. Both statistics are rounded to three
    decimals and joined into a single string per cell.

    Returns the formatted frame with ``"model"`` moved back to a regular
    column via ``reset_index``.
    """
    by_model = metrics.groupby("model")
    center = by_model.apply(lambda group: group.abs().mean())
    spread = by_model.apply(lambda group: group.abs().std())

    formatted = center.copy()
    for name in center.columns:
        mean_text = center[name].round(3).astype(str)
        std_text = spread[name].round(3).astype(str)
        formatted[name] = mean_text + " ± " + std_text

    return formatted.reset_index()

In [4]:
def summarize_credit_experiment_perf(
    dataset_name, seed = 0, n_folds = 10, path = "../results/credit_models"
):
    """Collect per-fold performance metrics of the credit models experiment.

    For every fold, each ``*.pkl`` file under ``{path}/{dataset_name}/{fold}/``
    whose path does not contain ``"study"`` is loaded as a model. Its decision
    threshold is computed with ``training.ks_threshold`` from the model's
    scores on the training split, and ``evaluate.get_metrics`` is then run on
    the validation and test splits.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``load_split`` and used as the results sub-folder.
    seed : int, default 0
        Seed passed to ``load_split``.
    n_folds : int, default 10
        Number of folds to iterate over (``range(n_folds)``).
    path : str, default "../results/credit_models"
        Root folder containing the pickled models.

    Returns
    -------
    (metrics_val, metrics_test)
        ``pd.concat`` of the per-fold outputs of ``evaluate.get_metrics`` for
        the validation and the test split, respectively. Nothing is printed.
    """
    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split(
            dataset_name, fold, seed
        )

        # The models are scored directly on the raw splits, and no protected
        # attribute is needed for performance metrics, so no preprocessing
        # pipeline is fitted here.
        models_files = [
            file
            for file in glob(f"{path}/{dataset_name}/{fold}/*.pkl")
            # remove the ones that are not models
            if "study" not in file
        ]

        model_dict = {}
        for file in models_files:
            model = joblib.load(file)
            # file name without directory and without extension(s)
            model_name = file.split("/")[-1].split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            model_dict[model_name] = [model, threshold]

        metrics_folds_val.append(evaluate.get_metrics(model_dict, X_val, Y_val))
        metrics_folds_test.append(evaluate.get_metrics(model_dict, X_test, Y_test))

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [7]:
def summarize_credit_experiment_fair(dataset_name, seed = 0, n_folds = 10):
    """Collect per-fold fairness metrics for the fair and the credit models.

    For every fold, pickled models are gathered from both
    ``../results/fair_models`` and ``../results/credit_models`` (files whose
    path contains ``"study"`` are skipped). The protected attribute is read
    from the ``PROTECTED_ATTRIBUTES[dataset_name] + "_0"`` column of the
    preprocessed validation / test data.

    Models are handled in two groups, decided by whether ``"thr"`` appears in
    the file path:

    * without ``"thr"``: the threshold comes from ``training.ks_threshold`` on
      the training-split scores, and all such models of the fold are evaluated
      together on the raw splits;
    * with ``"thr"``: the model is wrapped so that ``predict`` receives the
      sensitive features, and is evaluated on its own on the preprocessed
      splits with no threshold.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``load_split``; also the results sub-folder and the key
        into ``PROTECTED_ATTRIBUTES``.
    seed : int, default 0
        Seed passed to ``load_split``.
    n_folds : int, default 10
        Number of folds to iterate over (``range(n_folds)``).

    Returns
    -------
    (metrics_val, metrics_test)
        ``pd.concat`` of the per-fold outputs of
        ``evaluate.get_fairness_metrics``. Only the test results carry an
        extra ``"fold"`` column.
    """
    # computing fairness metrics
    path_1 = "../results/fair_models"
    path_2 = "../results/credit_models"

    class Thr_helper:
        """Expose ``predict(X)`` for a model whose ``predict`` also takes ``sensitive_features``."""

        def __init__(self, model, sensitive_features):
            self.model = model
            self.sensitive_features = sensitive_features

        def predict(self, X):
            return self.model.predict(
                X, sensitive_features=self.sensitive_features
            )

    metrics_folds_val = []
    metrics_folds_test = []
    for fold in range(n_folds):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split(
            dataset_name, fold, seed
        )

        # It was necessary to compute the pipeline again due to pickle error
        if dataset_name == "homecredit":
            pipeline_preprocess = training.create_pipeline(X_train, Y_train, crit=4)
        else:
            pipeline_preprocess = training.create_pipeline(X_train, Y_train)
        pipeline_preprocess.fit(X_train, Y_train)
        X_val_preprocessed = pipeline_preprocess.transform(X_val)
        X_test_preprocessed = pipeline_preprocess.transform(X_test)
        protected_column = PROTECTED_ATTRIBUTES[dataset_name] + "_0"
        A_val = X_val_preprocessed[protected_column]
        A_test = X_test_preprocessed[protected_column]

        # fair-model files first, then credit-model files; "study" files are not models
        models_files = [
            file
            for results_path in (path_1, path_2)
            for file in glob(f"{results_path}/{dataset_name}/{fold}/*.pkl")
            if "study" not in file
        ]
        proba_files = [file for file in models_files if "thr" not in file]
        thr_files = [file for file in models_files if "thr" in file]

        model_dict = {}
        for file in proba_files:
            model = joblib.load(file)
            model_name = file.split("/")[-1].split(".")[0]
            Y_train_pred = model.predict_proba(X_train)[:, 1]
            threshold = training.ks_threshold(Y_train, Y_train_pred)
            model_dict[model_name] = [model, threshold]

        metrics_folds_val.append(
            evaluate.get_fairness_metrics(model_dict, X_val, Y_val, A_val, benefit_class = 0)
        )
        fold_test_metrics = evaluate.get_fairness_metrics(
            model_dict, X_test, Y_test, A_test, benefit_class = 0
        )
        # NOTE(review): only the test results are tagged with the fold index.
        fold_test_metrics["fold"] = fold
        metrics_folds_test.append(fold_test_metrics)

        # Threshold Optimizer needs a different procedure as it does not predict probabilities
        for file in thr_files:
            model = joblib.load(file)
            model_name = file.split("/")[-1].split(".")[0]

            metrics_folds_val.append(
                evaluate.get_fairness_metrics(
                    {model_name: [Thr_helper(model, A_val), None]},
                    X_val_preprocessed,
                    Y_val,
                    A_val,
                    benefit_class = 0,
                )
            )
            thr_test_metrics = evaluate.get_fairness_metrics(
                {model_name: [Thr_helper(model, A_test), None]},
                X_test_preprocessed,
                Y_test,
                A_test,
                benefit_class = 0,
            )
            thr_test_metrics["fold"] = fold
            metrics_folds_test.append(thr_test_metrics)

    metrics_val = pd.concat(metrics_folds_val)
    metrics_test = pd.concat(metrics_folds_test)

    return metrics_val, metrics_test

In [13]:
# Validation and test performance metrics for the "german" dataset (default seed).
metrics_val, metrics_test = summarize_credit_experiment_perf("german")

In [19]:
# Fairness metrics per dataset; only the test-split results are kept,
# the validation results are discarded into `_`.
_, german_test_metrics = summarize_credit_experiment_fair("german", seed=0)
_, taiwan_test_metrics = summarize_credit_experiment_fair("taiwan", seed=0)
_, homecredit_test_metrics = summarize_credit_experiment_fair("homecredit", seed=0)

In [31]:
# "mean ± std" per model for the german test metrics, without the GMA and fold columns.
transform_to_table(german_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'DemographicParityClassifier',
  'DPD': '0.056 ± 0.038',
  'EOD': '0.043 ± 0.026',
  'AOD': '0.064 ± 0.045',
  'APVD': '0.134 ± 0.045',
  'balanced_accuracy': '0.609 ± 0.025'},
 {'model': 'EqualOpportunityClassifier',
  'DPD': '0.056 ± 0.031',
  'EOD': '0.048 ± 0.038',
  'AOD': '0.061 ± 0.037',
  'APVD': '0.126 ± 0.039',
  'balanced_accuracy': '0.604 ± 0.021'},
 {'model': 'FairGBMClassifier',
  'DPD': '0.04 ± 0.026',
  'EOD': '0.029 ± 0.029',
  'AOD': '0.049 ± 0.035',
  'APVD': '0.181 ± 0.114',
  'balanced_accuracy': '0.557 ± 0.058'},
 {'model': 'LGBMClassifier',
  'DPD': '0.049 ± 0.033',
  'EOD': '0.077 ± 0.045',
  'AOD': '0.064 ± 0.044',
  'APVD': '0.136 ± 0.026',
  'balanced_accuracy': '0.639 ± 0.042'},
 {'model': 'LogisticRegression',
  'DPD': '0.092 ± 0.053',
  'EOD': '0.133 ± 0.072',
  'AOD': '0.105 ± 0.061',
  'APVD': '0.154 ± 0.04',
  'balanced_accuracy': '0.637 ± 0.08'},
 {'model': 'MLPClassifier',
  'DPD': '0.109 ± 0.137',
  'EOD': '0.099 ± 0.142',
  'AOD': '0.101 

In [32]:
# "mean ± std" per model for the taiwan test metrics, without the GMA and fold columns.
transform_to_table(taiwan_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'DemographicParityClassifier',
  'DPD': '0.037 ± 0.002',
  'EOD': '0.01 ± 0.002',
  'AOD': '0.037 ± 0.003',
  'APVD': '0.037 ± 0.004',
  'balanced_accuracy': '0.653 ± 0.002'},
 {'model': 'EqualOpportunityClassifier',
  'DPD': '0.036 ± 0.001',
  'EOD': '0.01 ± 0.001',
  'AOD': '0.037 ± 0.002',
  'APVD': '0.037 ± 0.001',
  'balanced_accuracy': '0.654 ± 0.001'},
 {'model': 'FairGBMClassifier',
  'DPD': '0.028 ± 0.008',
  'EOD': '0.007 ± 0.002',
  'AOD': '0.025 ± 0.009',
  'APVD': '0.036 ± 0.011',
  'balanced_accuracy': '0.644 ± 0.039'},
 {'model': 'LGBMClassifier',
  'DPD': '0.049 ± 0.015',
  'EOD': '0.024 ± 0.016',
  'AOD': '0.042 ± 0.008',
  'APVD': '0.03 ± 0.004',
  'balanced_accuracy': '0.684 ± 0.033'},
 {'model': 'LogisticRegression',
  'DPD': '0.052 ± 0.012',
  'EOD': '0.025 ± 0.012',
  'AOD': '0.047 ± 0.007',
  'APVD': '0.026 ± 0.003',
  'balanced_accuracy': '0.677 ± 0.029'},
 {'model': 'MLPClassifier',
  'DPD': '0.034 ± 0.003',
  'EOD': '0.009 ± 0.002',
  'AOD': '0.021 

In [33]:
# "mean ± std" per model for the homecredit test metrics, without the GMA and fold columns.
transform_to_table(homecredit_test_metrics.drop(columns = ["GMA", "fold"])).to_dict(orient="records")

[{'model': 'FairGBMClassifier',
  'DPD': '0.002 ± 0.001',
  'EOD': '0.001 ± 0.001',
  'AOD': '0.001 ± 0.001',
  'APVD': '0.026 ± 0.019',
  'balanced_accuracy': '0.512 ± 0.002'},
 {'model': 'LGBMClassifier',
  'DPD': '0.066 ± 0.081',
  'EOD': '0.061 ± 0.076',
  'AOD': '0.062 ± 0.073',
  'APVD': '0.024 ± 0.016',
  'balanced_accuracy': '0.582 ± 0.092'},
 {'model': 'LogisticRegression',
  'DPD': '0.182 ± 0.002',
  'EOD': '0.173 ± 0.002',
  'AOD': '0.165 ± 0.003',
  'APVD': '0.01 ± 0.0',
  'balanced_accuracy': '0.675 ± 0.0'},
 {'model': 'MLPClassifier',
  'DPD': '0.0 ± 0.0',
  'EOD': '0.0 ± 0.0',
  'AOD': '0.0 ± 0.0',
  'APVD': '0.03 ± 0.0',
  'balanced_accuracy': '0.501 ± 0.001'},
 {'model': 'RandomForestClassifier',
  'DPD': '0.041 ± 0.066',
  'EOD': '0.038 ± 0.061',
  'AOD': '0.038 ± 0.06',
  'APVD': '0.014 ± 0.0',
  'balanced_accuracy': '0.552 ± 0.084'},
 {'model': 'rw_LGBMClassifier',
  'DPD': '0.01 ± 0.015',
  'EOD': '0.006 ± 0.01',
  'AOD': '0.012 ± 0.007',
  'APVD': '0.025 ± 0.007',