In [2]:
import joblib 
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from experiments_helpers import ExperimentResults

if 'src' in os.getcwd():
    os.chdir('..')
    print(os.getcwd())

omit_idx = []
fold_indices = [0]
folder = 'results2'
name = 'torch-fico-gs_'


for i in omit_idx:
    fold_indices.remove(i)
    
print(fold_indices)

N = len(fold_indices)
print(N)

res_base_list = []
for fold in fold_indices:
    res_base: ExperimentResults = ExperimentResults.load_results_from_file(f'results2/mlp_base_{fold}.joblib')
    res_base_list.append(res_base)

res_calib_iso_list = []
for fold in fold_indices:
    res_calib_iso: ExperimentResults = ExperimentResults.load_results_from_file(f'results2/mlp_isotonic-cv_{fold}.joblib')
    res_calib_iso_list.append(res_calib_iso)
    
res_calib_sig_list = []
for fold in fold_indices:
    res_calib_sig: ExperimentResults = ExperimentResults.load_results_from_file(f'results2/mlp_sigmoid-cv_{fold}.joblib')
    res_calib_sig_list.append(res_calib_sig)

[0, 1, 2, 3, 4, 5, 6, 7, 8]
9


In [3]:
# Check unique counts for each fold
for i in range(N):
    print(f'Fold {i}')
    print(f'Base: {np.unique(res_base_list[i].get_results_for_metric("validity_2"), return_counts=True)}')
    print(f'Isotonic: {np.unique(res_calib_iso_list[i].get_results_for_metric("validity_2"), return_counts=True)}')
    print(f'Sigmoid: {np.unique(res_calib_sig_list[i].get_results_for_metric("validity_2"), return_counts=True)}')

Fold 0
Base: (array([False,  True]), array([54, 22]))
Isotonic: (array([False,  True]), array([51, 25]))
Sigmoid: (array([False,  True]), array([52, 24]))
Fold 1
Base: (array([False,  True]), array([17, 53]))
Isotonic: (array([False,  True]), array([28, 42]))
Sigmoid: (array([False,  True]), array([28, 42]))
Fold 2
Base: (array([False,  True]), array([37, 38]))
Isotonic: (array([False,  True]), array([38, 37]))
Sigmoid: (array([False,  True]), array([19, 56]))
Fold 3
Base: (array([False,  True]), array([50, 28]))
Isotonic: (array([False,  True]), array([55, 23]))
Sigmoid: (array([False,  True]), array([54, 24]))
Fold 4
Base: (array([False,  True]), array([48, 27]))
Isotonic: (array([False,  True]), array([37, 38]))
Sigmoid: (array([ 0.,  1., nan]), array([30, 44,  1]))
Fold 5
Base: (array([False,  True]), array([37, 35]))
Isotonic: (array([False,  True]), array([33, 39]))
Sigmoid: (array([False,  True]), array([35, 37]))
Fold 6
Base: (array([False,  True]), array([38, 38]))
Isotonic: (

In [4]:
def average_results(results_list: list,  name: str) -> pd.DataFrame:
    to_average = ['validity', 'validity_2', 'cf_counterfactual_stability', 'cf_counterfactual_stability_2']
    avg = pd.DataFrame(index=[name], columns=to_average)
    std = pd.DataFrame(index=[name], columns=to_average)
    
    for metric in to_average:
        clean = []
        for res in results_list:
            clean += res.get_results_for_metric(metric)
        clean = np.array(list(filter(lambda x: str(x) != 'nan', clean)), dtype=np.float32)
        
        avg.at[name, metric] = np.mean(clean)
        std.at[name, metric] = np.std(clean)
        
    return avg, std
    

avg_base, std_base = average_results(res_base_list, 'Base')
avg_iso, std_iso = average_results(res_calib_iso_list, 'Isotonic')
avg_sig, std_sig = average_results(res_calib_sig_list, 'Sigmoid')

avgs = pd.concat([avg_base, avg_iso, avg_sig])
stds = pd.concat([std_base, std_iso, std_sig])

avgs

Unnamed: 0,validity,validity_2,cf_counterfactual_stability,cf_counterfactual_stability_2
Base,1.0,0.442136,0.449393,0.660532
Isotonic,1.0,0.464392,0.449713,0.64374
Sigmoid,1.0,0.502985,0.469781,0.630857


In [5]:
stds

Unnamed: 0,validity,validity_2,cf_counterfactual_stability,cf_counterfactual_stability_2
Base,0.0,0.49664,0.090636,0.177602
Isotonic,0.0,0.49873,0.060041,0.181744
Sigmoid,0.0,0.499991,0.044074,0.168609


In [6]:
def get_averages_for_each_fold(results_list: list) -> pd.DataFrame:
    to_average = ['validity', 'validity_2', 'cf_counterfactual_stability', 'cf_counterfactual_stability_2']
    avg = pd.DataFrame(index=range(N), columns=to_average)
    std = pd.DataFrame(index=range(N), columns=to_average)
    
    for i in range(N):
        for metric in to_average:
            clean = results_list[i].get_results_for_metric(metric)
            clean = np.array(list(filter(lambda x: str(x) != 'nan', clean)), dtype=np.float32)
            
            if len(clean) == 0:
                avg.at[i, metric] = np.nan
                std.at[i, metric] = np.nan
            avg.at[i, metric] = np.mean(clean)
            std.at[i, metric] = np.std(clean)
            
    return avg, std

avg_base_fold, std_base_fold = get_averages_for_each_fold(res_base_list)
avg_iso_fold, std_iso_fold = get_averages_for_each_fold(res_calib_iso_list)
avg_sig_fold, std_sig_fold = get_averages_for_each_fold(res_calib_sig_list)

print(f'Averages for each fold')
print(f'Base {avg_base_fold["validity_2"].to_list()}')
print(f'Isotonic {avg_iso_fold["validity_2"].to_list()}')
print(f'Sigmoid {avg_sig_fold["validity_2"].to_list()}')

print(f'Standard deviations for each fold')


Averages for each fold
Base [0.28947368, 0.75714284, 0.50666666, 0.35897437, 0.36, 0.4861111, 0.5, 0.47368422, 0.27631578]
Isotonic [0.32894737, 0.6, 0.49333334, 0.2948718, 0.50666666, 0.5416667, 0.6184211, 0.46052632, 0.35526314]
Sigmoid [0.31578946, 0.6, 0.74666667, 0.30769232, 0.5945946, 0.5138889, 0.6438356, 0.4473684, 0.38157895]
Standard deviations for each fold


In [7]:
from scipy.stats import ttest_ind

def t_student_unpaired(scores1, scores2, hypothesis='greater'):
    
    t_stat, p_value = ttest_ind(scores1, scores2)
    
    if hypothesis == 'greater':
        p_value = p_value / 2
    return t_stat, p_value

scores_base = avg_base_fold.loc[:, 'validity_2'].to_list()
scores_sig = avg_sig_fold.loc[:, 'validity_2'].to_list()
scores_iso = avg_iso_fold.loc[:, 'validity_2'].to_list()

print(f'iso > base: {t_student_unpaired(scores_iso, scores_base)}')
print(f'sig > base: {t_student_unpaired(scores_sig, scores_base)}')

iso > base: (0.33865950037263237, 0.3696349076878197)
sig > base: (0.8508007964562788, 0.2037164720357379)
