In [115]:
import joblib 
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from experiments_helpers import ExperimentResults

# When the working directory path contains 'src', move one level up
# (presumably so the relative 'results/...' paths below resolve -- confirm).
if 'src' in os.getcwd():
    os.chdir('..')
    print(os.getcwd())

# Folds excluded from the analysis; every other fold in 0..4 is kept, in order.
omit_idx = [1]
fold_indices = [fold for fold in range(5) if fold not in omit_idx]

print(fold_indices)

# Number of retained folds; later cells index the result lists with range(N).
N = len(fold_indices)
print(N)

# One ExperimentResults object per retained fold, for each model variant.
res_base_list = [
    ExperimentResults.load_results_from_file(f'results/mlp_base_{fold}.joblib')
    for fold in fold_indices
]

res_calib_iso_list = [
    ExperimentResults.load_results_from_file(f'results/mlp_isotonic-cv_{fold}.joblib')
    for fold in fold_indices
]

res_calib_sig_list = [
    ExperimentResults.load_results_from_file(f'results/mlp_sigmoid-cv_{fold}.joblib')
    for fold in fold_indices
]

[0, 2, 3, 4]
4


In [116]:
# Check unique counts for each fold.
# NOTE: the number printed after "Fold" is the position in the result lists,
# not the original fold id -- the two differ once folds are omitted.
for i in range(N):
    print(f'Fold {i}')

    base_counts = np.unique(res_base_list[i].get_results_for_metric("validity_2"), return_counts=True)
    print(f'Base: {base_counts}')

    iso_counts = np.unique(res_calib_iso_list[i].get_results_for_metric("validity_2"), return_counts=True)
    print(f'Isotonic: {iso_counts}')

    sig_counts = np.unique(res_calib_sig_list[i].get_results_for_metric("validity_2"), return_counts=True)
    print(f'Sigmoid: {sig_counts}')

Fold 0
Base: (array([False,  True]), array([77, 24]))
Isotonic: (array([ 0.,  1., nan]), array([82, 14,  5]))
Sigmoid: (array([ 0.,  1., nan]), array([14, 25, 62]))
Fold 1
Base: (array([False,  True]), array([63, 39]))
Isotonic: (array([ 0.,  1., nan]), array([41, 60,  1]))
Sigmoid: (array([ 0.,  1., nan]), array([41, 57,  4]))
Fold 2
Base: (array([False,  True]), array([72, 32]))
Isotonic: (array([False,  True]), array([74, 30]))
Sigmoid: (array([ 0.,  1., nan]), array([77, 24,  3]))
Fold 3
Base: (array([ 0., nan]), array([83, 15]))
Isotonic: (array([False,  True]), array([89,  9]))
Sigmoid: (array([False,  True]), array([93,  5]))


In [117]:
def average_results(results_list: list,  name: str) -> tuple:
    """Pool each metric over all result objects and summarize it.

    For every metric in the fixed list below, the values returned by
    ``get_results_for_metric`` are concatenated over ``results_list``, NaN
    entries are dropped, and the mean and standard deviation (``np.std``
    default, i.e. ``ddof=0``) of the remaining values are recorded.

    Args:
        results_list: Objects exposing ``get_results_for_metric(metric)`` that
            returns an iterable of numeric/boolean values.
        name: Row label used as the single index entry of both output frames.

    Returns:
        ``(avg, std)``: two one-row DataFrames indexed by ``name`` with one
        column per metric. A metric with no non-NaN values yields NaN in both.
    """
    to_average = ['validity', 'validity_2', 'cf_counterfactual_stability', 'cf_counterfactual_stability_2']
    avg = pd.DataFrame(index=[name], columns=to_average)
    std = pd.DataFrame(index=[name], columns=to_average)

    for metric in to_average:
        pooled = []
        for res in results_list:
            pooled.extend(res.get_results_for_metric(metric))

        # Convert once, then mask NaNs numerically instead of comparing the
        # string form of every element.
        values = np.asarray(pooled, dtype=np.float32)
        values = values[~np.isnan(values)]

        if values.size == 0:
            # Nothing to summarize: record NaN explicitly rather than letting
            # np.mean/np.std warn about an empty slice.
            avg.at[name, metric] = np.nan
            std.at[name, metric] = np.nan
            continue

        avg.at[name, metric] = np.mean(values)
        std.at[name, metric] = np.std(values)

    return avg, std
    

# Pooled mean/std tables for each model variant, computed in one pass and
# unpacked into the per-variant names.
(avg_base, std_base), (avg_iso, std_iso), (avg_sig, std_sig) = (
    average_results(results, label)
    for results, label in (
        (res_base_list, 'Base'),
        (res_calib_iso_list, 'Isotonic'),
        (res_calib_sig_list, 'Sigmoid'),
    )
)

# Stack the one-row tables so each variant is a row.
stds = pd.concat([std_base, std_iso, std_sig])
avgs = pd.concat([avg_base, avg_iso, avg_sig])

avgs

Unnamed: 0,validity,validity_2,cf_counterfactual_stability,cf_counterfactual_stability_2
Base,1.0,0.24359,0.457762,0.617155
Isotonic,1.0,0.283208,0.447096,0.607564
Sigmoid,1.0,0.330357,0.474963,0.607388


In [118]:
stds

Unnamed: 0,validity,validity_2,cf_counterfactual_stability,cf_counterfactual_stability_2
Base,0.0,0.429248,0.070476,0.104552
Isotonic,0.0,0.450557,0.048115,0.091725
Sigmoid,0.0,0.470342,0.030989,0.092348


In [119]:
def get_averages_for_each_fold(results_list: list) -> tuple:
    """Summarize each metric separately for every entry of ``results_list``.

    For each result object (one row per position in the list) and each metric
    in the fixed list below, NaN values are dropped and the mean and standard
    deviation (``np.std`` default, i.e. ``ddof=0``) of the rest are recorded.

    Args:
        results_list: Objects exposing ``get_results_for_metric(metric)`` that
            returns an iterable of numeric/boolean values.

    Returns:
        ``(avg, std)``: two DataFrames indexed ``0..len(results_list)-1`` with
        one column per metric. A cell is NaN when the metric has no non-NaN
        values for that entry.
    """
    to_average = ['validity', 'validity_2', 'cf_counterfactual_stability', 'cf_counterfactual_stability_2']
    # Size the tables from the argument itself rather than a notebook global,
    # so the function works for any list it is handed.
    n_folds = len(results_list)
    avg = pd.DataFrame(index=range(n_folds), columns=to_average)
    std = pd.DataFrame(index=range(n_folds), columns=to_average)

    for i, res in enumerate(results_list):
        for metric in to_average:
            values = np.asarray(list(res.get_results_for_metric(metric)), dtype=np.float32)
            values = values[~np.isnan(values)]

            if values.size == 0:
                # Skip the mean/std calls entirely: on an empty array they
                # only emit a RuntimeWarning and produce NaN anyway.
                avg.at[i, metric] = np.nan
                std.at[i, metric] = np.nan
                continue

            avg.at[i, metric] = np.mean(values)
            std.at[i, metric] = np.std(values)

    return avg, std

# Per-fold mean/std tables for each model variant, computed in one pass and
# unpacked into the per-variant names.
(
    (avg_base_fold, std_base_fold),
    (avg_iso_fold, std_iso_fold),
    (avg_sig_fold, std_sig_fold),
) = (
    get_averages_for_each_fold(results)
    for results in (res_base_list, res_calib_iso_list, res_calib_sig_list)
)

# Mean validity_2 of every fold, one list per variant.
print(f'Averages for each fold')
print(f'Base {avg_base_fold["validity_2"].to_list()}')
print(f'Isotonic {avg_iso_fold["validity_2"].to_list()}')
print(f'Sigmoid {avg_sig_fold["validity_2"].to_list()}')

# NOTE(review): only the header is printed here; no std values follow it.
print(f'Standard deviations for each fold')


Averages for each fold
Base [0.23762377, 0.38235295, 0.30769232, 0.0]
Isotonic [0.14583333, 0.5940594, 0.28846154, 0.091836736]
Sigmoid [0.64102566, 0.5816327, 0.23762377, 0.05102041]
Standard deviations for each fold


In [122]:
from scipy.stats import ttest_ind

def t_student_unpaired(scores1, scores2, hypothesis='greater'):
    """Independent two-sample t-test with an optional one-sided alternative.

    Args:
        scores1: First sample of scores.
        scores2: Second sample of scores.
        hypothesis: ``'greater'`` tests the one-sided alternative that the mean
            of ``scores1`` exceeds the mean of ``scores2``. Any other value
            returns the two-sided p-value from ``ttest_ind`` unchanged.

    Returns:
        ``(t_stat, p_value)`` as produced by ``ttest_ind``, with ``p_value``
        converted to the one-sided value when ``hypothesis == 'greater'``.
    """
    t_stat, p_value = ttest_ind(scores1, scores2)

    if hypothesis == 'greater':
        # Half of the two-sided p-value is the upper-tail probability only when
        # the statistic is positive. For a non-positive statistic the upper
        # tail is the complement, so the evidence is not overstated when
        # scores1 is actually lower than scores2.
        half = p_value / 2
        # np.where keeps this elementwise; [()] unwraps the 0-d result back to
        # a scalar for the usual 1-D inputs.
        p_value = np.where(np.asarray(t_stat) > 0, half, 1 - half)[()]
    return t_stat, p_value

# Per-fold mean validity_2 for each variant; these lists are the samples
# handed to the unpaired t-test.
scores_base = avg_base_fold['validity_2'].to_list()
scores_sig = avg_sig_fold['validity_2'].to_list()
scores_iso = avg_iso_fold['validity_2'].to_list()

# One-sided tests (default hypothesis): is each calibrated variant above Base?
iso_vs_base = t_student_unpaired(scores_iso, scores_base)
print(f'iso > base: {iso_vs_base}')
sig_vs_base = t_student_unpaired(scores_sig, scores_base)
print(f'sig > base: {sig_vs_base}')

iso > base: (0.34444738815077836, 0.3711308642293241)
sig > base: (0.8942565960495553, 0.2028125370690714)
