In [1]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import tensorflow_datasets as tfds

import os
import pickle
import numpy as np
from tqdm import tqdm

from src.pmi_estimators import train_critic_model, neural_pmi
from src.psi_estimators import psi_gaussian_train, psi_gaussian_val_class
from src.pvi_estimators import train_pvi_null_model, neural_pvi_class, neural_pvi_ensemble_class
import src.utils as utils
import src.metrics as metrics
import src.methods as methods
import src.temp_scaling as temp_scaling

2025-04-09 10:54:29.837059: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-09 10:54:29.837115: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-09 10:54:29.838474: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1534] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 10:54:29.845780: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_name = 'vgg16'
dataset_name = 'stl10'

(ds_train, ds_val, ds_test), ds_info = tfds.load(
    'stl10',
    split=['train', 'test[:15%]', 'test[15%:]'],
    data_dir = '../tensorflow_datasets/',
    shuffle_files=False,
    as_supervised=True,
    with_info=True
)

IMG_SIZE = 224
num_classes = 10
def preprocess(image, label):
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.one_hot(label, depth=num_classes)
    return image, label

ds_train = ds_train.map(preprocess)
ds_val = ds_val.map(preprocess)
ds_test = ds_test.map(preprocess)

# batch_size = 128
# ds_train = ds_train.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
# ds_val = ds_val.batch(batch_size).prefetch(tf.data.AUTOTUNE)
# ds_test = ds_test.batch(batch_size).prefetch(tf.data.AUTOTUNE)

true_y_train = np.argmax([y for x,y in ds_train], axis=1)
true_y_val = np.argmax([y for x,y in ds_val], axis=1)
true_y_test = np.argmax([y for x,y in ds_test], axis=1)

2025-04-09 10:54:33.112202: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1926] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 74331 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:4e:00.0, compute capability: 8.0


In [3]:
def create_model():
    base_model = VGG16(include_top=False, weights='imagenet', input_tensor=Input(shape=(IMG_SIZE, IMG_SIZE, 3)))
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)
    outputs = Dense(10, activation='linear')(x)
    model = Model(inputs=base_model.input, outputs=outputs)
    for layer in base_model.layers:
        layer.trainable = True
    return model

### Train Model

In [None]:
for run in range(10):
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)

    model = create_model()
    
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=AdamW(learning_rate=1e-4, weight_decay=1e-4), loss=loss_fn, metrics=['accuracy'])

    lr_scheduler = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=5, verbose=1)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True, verbose=1)
    history = model.fit(ds_train, validation_data=ds_val, epochs=100, callbacks=[lr_scheduler, early_stop])
    
    if not os.path.exists(exp_name+'/saved_models'):
        print("Making directory", exp_name+'/saved_models')
        os.makedirs(exp_name+'/saved_models')

    model.save_weights(f'{exp_name}/saved_models/trained_weights.h5')
    with open(f'{exp_name}/history.pickle', 'wb') as f:
        pickle.dump(history, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
train_acc = []
val_acc = []
test_acc = []
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=AdamW(learning_rate=1e-4, weight_decay=1e-4), loss=loss_fn, metrics=['accuracy'])
    train_acc.append(model.evaluate(ds_train.batch(256), verbose=0)[1])
    val_acc.append(model.evaluate(ds_val.batch(256), verbose=0)[1])
    test_acc.append(model.evaluate(ds_test.batch(256), verbose=0)[1])
    print(f'Test accuracy: {test_acc[-1]*100:.2f}')
print(f'Average train error: {(100-np.mean(train_acc)*100):.2f}, ({(np.std(train_acc)*100):.2f})')
print(f'Average validation error: {(100-np.mean(val_acc)*100):.2f} ({(np.std(val_acc)*100):.2f})')
print(f'Average test error: {(100-np.mean(test_acc)*100):.2f} ({(np.std(test_acc)*100):.2f})')

### PMI

In [None]:
for run in range(1,10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pmi/separable_variational_f_js'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)

    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    int_model = tf.keras.Model(inputs=model.inputs, outputs=model.layers[-1].output)

    ##############################################################
    #
    # Train PMI Model
    #
    # #############################################################

    print(f'Training PMI model...')
    ds_activity_trn = ds_train.batch(128).map(lambda x, y: (int_model(x), y)).cache().prefetch(tf.data.AUTOTUNE)
    ds_activity_val = ds_val.batch(128).map(lambda x, y: (int_model(x), y)).cache().prefetch(tf.data.AUTOTUNE)
    train_critic_model(ds_activity_trn, ds_activity_val, critic='separable', estimator='variational_f_js', epochs=200, save_path=f'{exp_name}/pmi_output_model')

    ##############################################################
    #
    # Compute PMI for all validation and test samples
    #
    # #############################################################

    pmi_model = tf.keras.models.load_model(f'{exp_name}/pmi_output_model')
    n_classes = 10

    print(f'Computing PMI for all validation samples and for all classes...')
    encoded_x = []
    for x, _ in ds_val.batch(128):
        encoded_x.append(int_model(x).numpy())
    encoded_x = np.concatenate(encoded_x)
    num_samples = encoded_x.shape[0]
    
    pmi_class = []
    batch_size = 1024
    for k in range(n_classes):
        num_samples = encoded_x.shape[0]
        y_k = tf.one_hot(tf.fill([num_samples], k), depth=n_classes)
        pmi_list = []
        for i in tqdm(range(0, len(encoded_x), batch_size), desc=f"Computing PMI for class {k+1}"):
            x_batch = encoded_x[i:i+batch_size]
            y_batch = y_k[i:i+batch_size]
            pmi = neural_pmi(x_batch, y_batch, pmi_model, estimator='variational_f_js')
            pmi_list += np.array(pmi).tolist()
        pmi_class.append(pmi_list)
    np.save(f'{exp_name}/pmi_output_class_val.npy', np.array(pmi_class).T)
    
    print(f'Computing PMI for all test samples and for all classes...')
    encoded_x = []
    for x, _ in ds_test.batch(128):
        encoded_x.append(int_model(x).numpy())
    encoded_x = np.concatenate(encoded_x)
    num_samples = encoded_x.shape[0]
    
    pmi_class = []
    batch_size = 1024
    for k in range(n_classes):
        num_samples = encoded_x.shape[0]
        y_k = tf.one_hot(tf.fill([num_samples], k), depth=n_classes)
        pmi_list = []
        for i in tqdm(range(0, len(encoded_x), batch_size), desc=f"Computing PMI for class {k+1}"):
            x_batch = encoded_x[i:i+batch_size]
            y_batch = y_k[i:i+batch_size]
            pmi = neural_pmi(x_batch, y_batch, pmi_model, estimator='variational_f_js')
            pmi_list += np.array(pmi).tolist()
        pmi_class.append(pmi_list)
    np.save(f'{exp_name}/pmi_output_class_test.npy', np.array(pmi_class).T)

### PSI

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/psi/gaussian'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)

    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    int_model = tf.keras.Model(inputs=model.inputs, outputs=model.layers[-1].output)
    
    ##############################################################
    #
    # Train PSI Model
    #
    # #############################################################
    
    x_logits_list = []
    y_labels_list = []

    for x_batch, y_batch in ds_train.batch(256):
        logits = int_model(x_batch)
        labels = tf.argmax(y_batch, axis=1)
        x_logits_list.append(logits)
        y_labels_list.append(labels)

    x = tf.concat(x_logits_list, axis=0).numpy()
    y = tf.concat(y_labels_list, axis=0).numpy()
    
    print(f'Training PSI model (gaussian)...')
    psi_data = psi_gaussian_train(x, y, n_projs=500)
    np.save(f'{exp_name}/gaussian_output_model_500_projs.npy', psi_data)

    ##############################################################
    #
    # Compute PSI for all validation and test samples
    #
    # #############################################################

    psi_data = np.load(f'{exp_name}/gaussian_output_model_500_projs.npy', allow_pickle=True).item()

    print(f'Computing PSI for all validation samples...')
    x_logits_list = []

    for x_batch, y_batch in ds_val.batch(256):
        logits = int_model(x_batch)
        x_logits_list.append(logits)
    
    x = tf.concat(x_logits_list, axis=0).numpy()
    psi_class, pmi_arr = psi_gaussian_val_class(x, psi_data)
    np.save(f'{exp_name}/psi_output_class_500_projs_val.npy', np.array(psi_class))

    print(f'Computing PSI for all test samples...')
    x_logits_list = []

    for x_batch, y_batch in ds_test.batch(256):
        logits = int_model(x_batch)
        x_logits_list.append(logits)
    
    x = tf.concat(x_logits_list, axis=0).numpy()
    psi_class, pmi_arr = psi_gaussian_val_class(x, psi_data)
    np.save(f'{exp_name}/psi_output_class_500_projs_test.npy', np.array(psi_class))

### PVI

In [None]:
random_runs = list(range(10))
while any(random_runs[i] == i for i in range(10)):
    np.random.shuffle(random_runs)
    
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)
        
    ##############################################################
    #
    # Train PVI Model
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{random_runs[run]+1}/saved_models/trained_weights.h5')
    pvi_model.save_weights(f'{exp_name}/pvi_model_weights.h5')
    
    untrained_model = create_model()
    train_pvi_null_model(ds_train, untrained_model, epochs=10, save_path=f'{exp_name}/pvi_null_model_weights.h5')
    
    ##############################################################
    #
    # Compute PVI for all training and test samples
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'{exp_name}/pvi_model_weights.h5')
    null_model = create_model()
    null_model.load_weights(f'{exp_name}/pvi_null_model_weights.h5')
    
    true_y_val = np.argmax([y for x,y in ds_val], axis=1)
    opt_temp_pvi = temp_scaling.temp_scaling_nll(pvi_model.predict(ds_val.batch(128), verbose=0), true_y_val)
    ds_null = ds_val.map(lambda x, y: (tf.zeros_like(x), y))
    opt_temp_null = temp_scaling.temp_scaling_nll(null_model.predict(ds_null.batch(128), verbose=0), true_y_val)

    print(f'Computing PVI for all validation samples and for all classes...')
    pvi_class = neural_pvi_class(ds_val.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_val.npy', np.array(pvi_class))

    print(f'Computing PVI for all test samples and for all classes...')
    pvi_class = neural_pvi_class(ds_test.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_test.npy', np.array(pvi_class))

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/finetuned'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)
        
    ##############################################################
    #
    # Train PVI Model
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    pvi_model.compile(optimizer=AdamW(learning_rate=1e-4, weight_decay=1e-4), loss=loss_fn, metrics=['accuracy'])

    lr_scheduler = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=5, verbose=1)
    early_stop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True, verbose=1)
    pvi_model.fit(ds_train.batch(256), validation_data=ds_val.batch(256), epochs=100, callbacks=[lr_scheduler, early_stop])
    
    pvi_model.save_weights(f'{exp_name}/pvi_model_weights.h5')
    
    untrained_model = create_model()
    untrained_model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch/pvi_null_model_weights.h5')
    untrained_model.save_weights(f'{exp_name}/pvi_null_model_weights.h5')
    
    ##############################################################
    #
    # Compute PVI for all training and test samples
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'{exp_name}/pvi_model_weights.h5')
    null_model = create_model()
    null_model.load_weights(f'{exp_name}/pvi_null_model_weights.h5')
    
    true_y_val = np.argmax([y for x,y in ds_val], axis=1)
    opt_temp_pvi = temp_scaling.temp_scaling_nll(pvi_model.predict(ds_val.batch(128), verbose=0), true_y_val)
    ds_null = ds_val.map(lambda x, y: (tf.zeros_like(x), y))
    opt_temp_null = temp_scaling.temp_scaling_nll(null_model.predict(ds_null.batch(128), verbose=0), true_y_val)

    print(f'Computing PVI for all validation samples and for all classes...')
    pvi_class = neural_pvi_class(ds_val.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_val.npy', np.array(pvi_class))

    print(f'Computing PVI for all test samples and for all classes...')
    pvi_class = neural_pvi_class(ds_test.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_test.npy', np.array(pvi_class))

In [None]:
pvi_runs = [9 if i == 4 else 4 for i in range(10)]
    
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)
        
    ##############################################################
    #
    # Train PVI Model
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{pvi_runs[run]+1}/saved_models/trained_weights.h5')
    pvi_model.save_weights(f'{exp_name}/pvi_model_best_weights.h5')
    
#     untrained_model = create_model()
#     train_pvi_null_model(ds_train, untrained_model, epochs=10, save_path=f'{exp_name}/pvi_null_model_weights.h5')
    
    ##############################################################
    #
    # Compute PVI for all training and test samples
    #
    # #############################################################
    
    pvi_model = create_model()
    pvi_model.load_weights(f'{exp_name}/pvi_model_best_weights.h5')
    null_model = create_model()
    null_model.load_weights(f'{exp_name}/pvi_null_model_weights.h5')
    
    true_y_val = np.argmax([y for x,y in ds_val], axis=1)
    opt_temp_pvi = temp_scaling.temp_scaling_nll(pvi_model.predict(ds_val.batch(128), verbose=0), true_y_val)
    ds_null = ds_val.map(lambda x, y: (tf.zeros_like(x), y))
    opt_temp_null = temp_scaling.temp_scaling_nll(null_model.predict(ds_null.batch(128), verbose=0), true_y_val)

    print(f'Computing PVI for all validation samples and for all classes...')
    pvi_class = neural_pvi_class(ds_val.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_best_val.npy', np.array(pvi_class))

    print(f'Computing PVI for all test samples and for all classes...')
    pvi_class = neural_pvi_class(ds_test.batch(128), pvi_model, null_model, opt_temp_pvi, opt_temp_null)
    np.save(f'{exp_name}/pvi_class_best_test.npy', np.array(pvi_class))

### Ensemble PVI

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/ensemble_no_training_training_from_scratch_calibrated'
    if not os.path.exists(exp_name):
        print("Making directory", exp_name)
        os.makedirs(exp_name)
        
    ##############################################################
    #
    # Train PVI Model
    #
    # #############################################################
    
    pvi_model_1 = create_model()
    pvi_model_1.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    null_model_1 = create_model()
    null_model_1.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch/pvi_null_model_weights.h5')
    pvi_model_2 = create_model()
    pvi_model_2.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch/pvi_model_weights.h5')
    null_model_2 = create_model()
    null_model_2.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch/pvi_null_model_weights.h5')
    
    true_y_val = np.argmax([y for x,y in ds_val], axis=1)
    opt_temp_pvi_1 = temp_scaling.temp_scaling_nll(pvi_model_1.predict(ds_val.batch(128), verbose=0), true_y_val)
    opt_temp_pvi_2 = temp_scaling.temp_scaling_nll(pvi_model_2.predict(ds_val.batch(128), verbose=0), true_y_val)
    ds_null = ds_val.map(lambda x, y: (tf.zeros_like(x), y))
    opt_temp_null = temp_scaling.temp_scaling_nll(null_model_1.predict(ds_null.batch(128), verbose=0), true_y_val)
    
    ##############################################################
    #
    # Compute PVI for all training and test samples
    #
    # #############################################################
    
    print(f'Computing PVI for all validation samples and for all classes...')
    pvi_class = []
    for (x_batch, y_batch) in ds_val.batch(256):
        pvi = neural_pvi_ensemble_class([x_batch, x_batch], [pvi_model_1, pvi_model_2], [null_model_1, null_model_2], [opt_temp_pvi_1, opt_temp_pvi_2], [opt_temp_null, opt_temp_null])
        pvi_class += np.array(pvi).tolist()
    np.save(f'{exp_name}/pvi_class_val.npy', np.array(pvi_class))

    print(f'Computing PVI for all test samples and for all classes...')
    pvi_class = []
    for (x_batch, y_batch) in ds_test.batch(256):
        pvi = neural_pvi_ensemble_class([x_batch, x_batch], [pvi_model_1, pvi_model_2], [null_model_1, null_model_2], [opt_temp_pvi_1, opt_temp_pvi_2], [opt_temp_null, opt_temp_null])
        pvi_class += np.array(pvi).tolist()
    np.save(f'{exp_name}/pvi_class_test.npy', np.array(pvi_class))

### Temperature Scaling

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

#     if not os.path.exists(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration'):
#         print("Making directory", f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration')
#         os.makedirs(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration')                                  
  
    
    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = model.predict(ds_val.batch(512), verbose=0)
    
#     opt_temp = temp_scaling.temp_scaling_aurc(scores, pred_y_val, true_y_val)
#     np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_aurc.npy', opt_temp)
    
    opt_temp = temp_scaling.temp_scaling_nll(scores, true_y_val)
    np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_nll.npy', opt_temp)

#     opt_temp = temp_scaling.temp_scaling_ece(scores, pred_y_val, true_y_val, 15)
#     np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_ece.npy', opt_temp)

In [4]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

#     if not os.path.exists(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration'):
#         print("Making directory", f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration')
#         os.makedirs(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration')                                  
  
    
    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = model.predict(ds_val.batch(512), verbose=0)
    
    opt_temp, opt_weights = temp_scaling.ensemble_temp_scaling_nll(scores, true_y_val, num_classes)
    np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_ets_nll.npy', opt_temp)
    np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_weights_ets_nll.npy', opt_weights)

#     opt_temp = temp_scaling.temp_scaling_ece(scores, pred_y_val, true_y_val, 15)
#     np.save(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_ece.npy', opt_temp)

Run: 1


2025-04-09 09:50:41.598307: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:467] Loaded cuDNN version 90100




2025-04-09 09:51:13.250867: I external/local_xla/xla/service/service.cc:168] XLA service 0x556118952d80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-09 09:51:13.250899: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
2025-04-09 09:51:13.255793: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744192273.375403 1229886 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Run: 2
Run: 3
Run: 4
Run: 5
Run: 6
Run: 7
Run: 8
Run: 9
Run: 10


In [4]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
    
    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = model.predict(ds_val.batch(512), verbose=0)
    
    pts = temp_scaling.PTSCalibrator(
    epochs=30,
    lr=1e-3,
    weight_decay=1e-4,
    batch_size=64,
    nlayers=2,
    n_nodes=32,
    length_logits=10,
    top_k_logits=5
)

    pts.tune(logits=scores, labels=pred_y_val)
    pts.save(path=f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/calibration_model/')

Run: 1


2025-04-09 10:41:20.088768: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:467] Loaded cuDNN version 90100


Epoch 1/30


2025-04-09 10:41:52.879519: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f272eba0e80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-09 10:41:52.879566: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
2025-04-09 10:41:52.885078: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744195313.006583 1372796 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/calibration_model/pts_model.h5
Run: 2
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/calibration_model/pts_model.h5
Run: 3
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoc

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pmi/separable_variational_f_js'
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = np.load(f'{exp_name}/pmi_output_class_val.npy')
    
#     opt_temp = temp_scaling.temp_scaling_aurc(scores, pred_y_val, true_y_val)                            
#     np.save(f'{exp_name}/pmi_opt_temp_aurc.npy', opt_temp)
    
    opt_temp = temp_scaling.temp_scaling_nll(scores, true_y_val)                                
    np.save(f'{exp_name}/pmi_opt_temp_nll.npy', opt_temp)
    
#     opt_temp = temp_scaling.temp_scaling_ece(scores, pred_y_val, true_y_val, 15)                                
#     np.save(f'{exp_name}/pmi_opt_temp_ece.npy', opt_temp)

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/psi/gaussian'
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = np.load(f'{exp_name}/psi_output_class_500_projs_val.npy')
    
#     opt_temp = temp_scaling.temp_scaling_aurc(scores, pred_y_val, true_y_val)                                 
#     np.save(f'{exp_name}/psi_opt_temp_aurc.npy', opt_temp)
    
    opt_temp = temp_scaling.temp_scaling_nll(scores, true_y_val)                            
    np.save(f'{exp_name}/psi_opt_temp_nll.npy', opt_temp)
    
#     opt_temp = temp_scaling.temp_scaling_ece(scores, pred_y_val, true_y_val, 15)                            
#     np.save(f'{exp_name}/psi_opt_temp_ece.npy', opt_temp)

In [None]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/finetuned'
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = np.load(f'{exp_name}/pvi_class_val.npy')
    
    opt_temp = temp_scaling.temp_scaling_aurc(scores, pred_y_val, true_y_val)                                 
    np.save(f'{exp_name}/pvi_opt_temp_aurc.npy', opt_temp)
    
    opt_temp = temp_scaling.temp_scaling_nll(scores, true_y_val)                                          
    np.save(f'{exp_name}/pvi_opt_temp_nll.npy', opt_temp)

#     opt_temp = temp_scaling.temp_scaling_ece(scores, pred_y_val, true_y_val, 15)                                          
#     np.save(f'{exp_name}/pvi_opt_temp_ece.npy', opt_temp)

In [13]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = np.load(f'{exp_name}/pvi_class_val.npy')
    
    opt_temp, opt_weights = temp_scaling.ensemble_temp_scaling_nll(scores, true_y_val, num_classes)
    np.save(f'{exp_name}/pvi_opt_temp_ets_nll.npy', opt_temp)
    np.save(f'{exp_name}/pvi_opt_weights_ets_nll.npy', opt_weights)

Run: 1
Run: 2
Run: 3
Run: 4




Run: 5




Run: 6
Run: 7
Run: 8
Run: 9
Run: 10


In [4]:
for run in range(10):
    print(f'Run: {run+1}')
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_val = np.argmax(model.predict(ds_val.batch(512), verbose=1), axis=1)
    scores = np.load(f'{exp_name}/pvi_class_val.npy')
    
    opt_temp, opt_weights = temp_scaling.ensemble_temp_scaling_nll(scores, true_y_val, num_classes)
    np.save(f'{exp_name}/pvi_opt_temp_ets_nll.npy', opt_temp)
    np.save(f'{exp_name}/pvi_opt_weights_ets_nll.npy', opt_weights)
    
    pts = temp_scaling.PTSCalibrator(
    epochs=30,
    lr=1e-3,
    weight_decay=1e-4,
    batch_size=64,
    nlayers=2,
    n_nodes=128,
    length_logits=10,
    top_k_logits=5
)

    pts.tune(logits=scores, labels=pred_y_val)
    pts.save(path=f'{exp_name}/calibration_model/')

Run: 1


2025-04-09 10:54:37.802920: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:467] Loaded cuDNN version 90100




2025-04-09 10:55:05.150416: I external/local_xla/xla/service/service.cc:168] XLA service 0x55f3474445e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-09 10:55:05.150458: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
2025-04-09 10:55:05.155417: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744196105.268334 1459893 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/pvi/training_from_scratch/calibration_model/pts_model.h5
Run: 2
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/pvi/training_from_scratch/calibration_model/pts_model.h5
Run



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_5/calibration/pvi/training_from_scratch/calibration_model/pts_model.h5
Run: 6




Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_6/calibration/pvi/training_from_scratch/calibration_model/pts_model.h5
Run: 7
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Saved PTS model weights to: ../results/PI_Explainability/vgg16_stl10/run_7/calibration/pvi/training_from_scratch/calibration_model/pts_model.h5
Run

### Failure Detection

In [None]:
def get_confidence_scores(conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name):
    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration'
    metric = conf_method.split('_')[-1] if 'temp_scaling' in conf_method else None
    method_key = conf_method.replace(f'_temp_scaling_{metric}', '') if metric else conf_method

    if method_key == 'softmax':
        if metric:
            opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
            return methods.max_softmax_prob(model, ds_test, opt_temp)
        else:
            return methods.max_softmax_prob(model, ds_test)

    elif method_key in ['pmi', 'psi', 'pvi', 'pvi_best']:
        if method_key == 'pmi':
            exp_path = f'{base_path}/pmi/separable_variational_f_js'
            class_file = 'pmi_output_class_test.npy'
        elif method_key == 'psi':
            exp_path = f'{base_path}/psi/gaussian'
            class_file = 'psi_output_class_500_projs_test.npy'
        elif method_key == 'pvi':
            exp_path = f'{base_path}/pvi/training_from_scratch'
            class_file = 'pvi_class_test.npy'
        elif method_key == 'pvi_best':
            exp_path = f'{base_path}/pvi/training_from_scratch'
            class_file = 'pvi_class_best_test.npy'

        opt_temp = np.load(f'{exp_path}/{method_key}_opt_temp_{metric}.npy')
        scores_class = np.load(f'{exp_path}/{class_file}')
        scores_class = np.array([utils.softmax(x / opt_temp) for x in scores_class])
        return np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])

    elif method_key == 'softmax_margin':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.softmax_margin(model, ds_test, opt_temp)

    elif method_key == 'max_logits':
        return methods.max_logits(model, ds_test)

    elif method_key == 'logits_margin':
        return methods.logits_margin(model, ds_test)

    elif method_key == 'negative_entropy':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.negative_entropy(model, ds_test, opt_temp)

    elif method_key == 'negative_gini':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.negative_gini(model, ds_test, opt_temp)

    elif method_key == 'isotonic_regression':
        return methods.isotonic_reg(model, ds_val, ds_test, true_y_val)

    else:
        raise ValueError(f"Unknown confidence method: {conf_method}")


def evaluate_failure_pred(ds_test, true_y_test, conf_method, n_runs=10):
    results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }

    for run in range(n_runs):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        scores_test = get_confidence_scores(conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name)

        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))

    return results

In [None]:
# methods_list = ['softmax_temp_scaling_aurc','pmi_temp_scaling_aurc','psi_temp_scaling_aurc','pvi_temp_scaling_aurc',
#                 'softmax_margin_temp_scaling_aurc', 'max_logits', 'logits_margin', 'negative_entropy_temp_scaling_aurc',
#                 'negative_gini_temp_scaling_aurc']
methods_list = ['softmax_temp_scaling_aurc','pmi_temp_scaling_aurc','psi_temp_scaling_aurc','pvi_temp_scaling_aurc',]
for method in methods_list:
    print(f'Method: {method}')
    results = evaluate_failure_pred(ds_test, true_y_test, conf_method=f'{method}', n_runs=10)
    print(f"AUROC           : {utils.format_ci(results['auroc'], scale=100)}")
    print(f"AUPRC (success) : {utils.format_ci(results['auprc_success'], scale=100)}")
    print(f"AUPRC (error)   : {utils.format_ci(results['auprc_error'], scale=100)}")
    print(f"FPR at 95% TPR  : {utils.format_ci(results['fpr_at_95tpr'], scale=100)}")
    print(f"AURC            : {utils.format_ci(results['aurc'], scale=1000)}")
    print(f"EAURC           : {utils.format_ci(results['eaurc'], scale=1000)}")

In [None]:
def get_confidence_scores(conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name):
    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration'
    metric = conf_method.split('_')[-1] if 'temp_scaling' in conf_method else None
    method_key = conf_method.replace(f'_temp_scaling_{metric}', '') if metric else conf_method

    if method_key == 'softmax':
        if metric:
            opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
            return methods.max_softmax_prob(model, ds_test, opt_temp)
        else:
            return methods.max_softmax_prob(model, ds_test)

    elif method_key in ['pmi', 'psi', 'pvi', 'pvi_best']:
        if method_key == 'pmi':
            exp_path = f'{base_path}/pmi/separable_variational_f_js'
            class_file = 'pmi_output_class_test.npy'
        elif method_key == 'psi':
            exp_path = f'{base_path}/psi/gaussian'
            class_file = 'psi_output_class_500_projs_test.npy'
        elif method_key == 'pvi':
            exp_path = f'{base_path}/pvi/training_from_scratch'
            class_file = 'pvi_class_test.npy'
        elif method_key == 'pvi_best':
            exp_path = f'{base_path}/pvi/training_from_scratch'
            class_file = 'pvi_class_best_test.npy'

        opt_temp = np.load(f'{exp_path}/{method_key}_opt_temp_{metric}.npy')
        scores_class = np.load(f'{exp_path}/{class_file}')
        scores_class = np.array([utils.softmax(x / opt_temp) for x in scores_class])
        return np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])

    elif method_key == 'softmax_margin':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.softmax_margin(model, ds_test, opt_temp)

    elif method_key == 'max_logits':
        return methods.max_logits(model, ds_test)

    elif method_key == 'logits_margin':
        return methods.logits_margin(model, ds_test)

    elif method_key == 'negative_entropy':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.negative_entropy(model, ds_test, opt_temp)

    elif method_key == 'negative_gini':
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        return methods.negative_gini(model, ds_test, opt_temp)

    elif method_key == 'isotonic_regression':
        return methods.isotonic_reg(model, ds_val, ds_test, true_y_val)

    else:
        raise ValueError(f"Unknown confidence method: {conf_method}")


def evaluate_failure_pred(ds_test, true_y_test, conf_method, n_runs=10):
    results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }

    for run in range(n_runs):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        scores_test = get_confidence_scores(conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name)

        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))

    return results

In [8]:
def apply_ets(logits, opt_temp, opt_weights, n_class):
    p1 = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    scaled_logits = logits / opt_temp
    p0 = np.exp(scaled_logits) / np.sum(np.exp(scaled_logits), axis=1, keepdims=True)
    p2 = np.ones_like(p0) / n_class
    w = opt_weights / np.sum(opt_weights)  # just in case
    calibrated_probs = w[0] * p0 + w[1] * p1 + w[2] * p2
    return calibrated_probs


method = 'softmax ETS'
print(f'Method: {method}')
results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }
for run in range(10):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        
        logits = model.predict(ds_test.batch(512), verbose=0)
        
        base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_ets_nll.npy')
        opt_weights = np.load(f'{base_path}/softmax_opt_weights_ets_nll.npy')
        
        scores_class = apply_ets(logits,opt_temp,opt_weights,num_classes)
        scores_test = np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])
        
        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))
        
print(f"AUROC           : {utils.format_ci(results['auroc'], scale=100)}")
print(f"AUPRC (success) : {utils.format_ci(results['auprc_success'], scale=100)}")
print(f"AUPRC (error)   : {utils.format_ci(results['auprc_error'], scale=100)}")
print(f"FPR at 95% TPR  : {utils.format_ci(results['fpr_at_95tpr'], scale=100)}")
print(f"AURC            : {utils.format_ci(results['aurc'], scale=1000)}")
print(f"EAURC           : {utils.format_ci(results['eaurc'], scale=1000)}")

Method: softmax ETS
AUROC           : 92.04 (0.25)
AUPRC (success) : 99.15 (0.05)
AUPRC (error)   : 50.76 (1.23)
FPR at 95% TPR  : 48.24 (1.71)
AURC            : 12.05 (0.75)
EAURC           : 7.93 (0.48)


In [14]:
def apply_ets(logits, opt_temp, opt_weights, n_class):
    p1 = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    scaled_logits = logits / opt_temp
    p0 = np.exp(scaled_logits) / np.sum(np.exp(scaled_logits), axis=1, keepdims=True)
    p2 = np.ones_like(p0) / n_class
    w = opt_weights / np.sum(opt_weights)  # just in case
    calibrated_probs = w[0] * p0 + w[1] * p1 + w[2] * p2
    return calibrated_probs


method = 'PVI ETS'
print(f'Method: {method}')
results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }
for run in range(10):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        
        base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
        pvi =  np.load(f'{base_path}/pvi/training_from_scratch/pvi_class_test.npy')
        opt_temp = np.load(f'{base_path}/pvi/training_from_scratch/pvi_opt_temp_ets_nll.npy')
        opt_weights = np.load(f'{base_path}/pvi/training_from_scratch/pvi_opt_weights_ets_nll.npy')
        
        scores_class = apply_ets(pvi,opt_temp,opt_weights,num_classes)
        scores_test = np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])
        
        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))
        
print(f"AUROC           : {utils.format_ci(results['auroc'], scale=100)}")
print(f"AUPRC (success) : {utils.format_ci(results['auprc_success'], scale=100)}")
print(f"AUPRC (error)   : {utils.format_ci(results['auprc_error'], scale=100)}")
print(f"FPR at 95% TPR  : {utils.format_ci(results['fpr_at_95tpr'], scale=100)}")
print(f"AURC            : {utils.format_ci(results['aurc'], scale=1000)}")
print(f"EAURC           : {utils.format_ci(results['eaurc'], scale=1000)}")

Method: PVI ETS
AUROC           : 92.79 (0.62)
AUPRC (success) : 99.23 (0.06)
AUPRC (error)   : 55.88 (3.50)
FPR at 95% TPR  : 41.97 (3.61)
AURC            : 11.30 (0.54)
EAURC           : 7.18 (0.53)


In [5]:
method = 'softmax PTS'
print(f'Method: {method}')
results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }
for run in range(10):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        
        logits = model.predict(ds_test.batch(512), verbose=0)
        
        pts_loaded = temp_scaling.PTSCalibrator(
        epochs=0,
        lr=1e-3,
        weight_decay=1e-4,
        batch_size=64,
        nlayers=2,
        n_nodes=32,
        length_logits=10,
        top_k_logits=5
    )
        pts_loaded.load(path=f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/calibration_model/')
        scores_class = pts_loaded.calibrate(logits)
        scores_test = np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])
        
        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))
        
print(f"AUROC           : {utils.format_ci(results['auroc'], scale=100)}")
print(f"AUPRC (success) : {utils.format_ci(results['auprc_success'], scale=100)}")
print(f"AUPRC (error)   : {utils.format_ci(results['auprc_error'], scale=100)}")
print(f"FPR at 95% TPR  : {utils.format_ci(results['fpr_at_95tpr'], scale=100)}")
print(f"AURC            : {utils.format_ci(results['aurc'], scale=1000)}")
print(f"EAURC           : {utils.format_ci(results['eaurc'], scale=1000)}")

Method: softmax PTS
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_3/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_4/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_5/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_6/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_7/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_8/calibration/calibration_model/pts_model.h5
Loaded PTS m

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [5]:
method = 'PVI PTS'
print(f'Method: {method}')
results = {
        "auroc": [],
        "fpr_at_95tpr": [],
        "auprc_success": [],
        "auprc_error": [],
        "aurc": [],
        "eaurc": []
    }
for run in range(10):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        
        base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
        pvi =  np.load(f'{base_path}/pvi/training_from_scratch/pvi_class_test.npy')
        
        pts_loaded = temp_scaling.PTSCalibrator(
        epochs=0,
        lr=1e-3,
        weight_decay=1e-4,
        batch_size=64,
        nlayers=2,
        n_nodes=32,
        length_logits=10,
        top_k_logits=5
    )
        pts_loaded.load(path=f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/calibration_model/')
        scores_class = pts_loaded.calibrate(pvi)
        scores_test = np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])
        
        results["auroc"].append(metrics.compute_auroc(scores_test, pred_y_test, true_y_test))
        results["auprc_success"].append(metrics.compute_auprc_success(scores_test, pred_y_test, true_y_test))
        results["auprc_error"].append(metrics.compute_auprc_error(scores_test, pred_y_test, true_y_test))
        results["fpr_at_95tpr"].append(metrics.compute_fpr_at_95tpr(scores_test, pred_y_test, true_y_test))
        results["aurc"].append(metrics.compute_aurc(scores_test, pred_y_test, true_y_test))
        results["eaurc"].append(metrics.compute_eaurc(scores_test, pred_y_test, true_y_test))
        
print(f"AUROC           : {utils.format_ci(results['auroc'], scale=100)}")
print(f"AUPRC (success) : {utils.format_ci(results['auprc_success'], scale=100)}")
print(f"AUPRC (error)   : {utils.format_ci(results['auprc_error'], scale=100)}")
print(f"FPR at 95% TPR  : {utils.format_ci(results['fpr_at_95tpr'], scale=100)}")
print(f"AURC            : {utils.format_ci(results['aurc'], scale=1000)}")
print(f"EAURC           : {utils.format_ci(results['eaurc'], scale=1000)}")

Method: PVI PTS
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_3/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_4/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_5/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_6/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_7/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_8/calibration/calibration_model/pts_model.h5
Loaded PTS model

### Calibration

In [4]:
def get_scores_for_calibration(conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name):
    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration'

    def softmax_scaled(scores, temp=1.0):
        return np.array([utils.softmax(x / temp) for x in scores])

    if conf_method == 'softmax':
        scores_class = methods.softmax_prob(model, ds_test)
        scores_test = methods.max_softmax_prob(model, ds_test)
        return scores_class, scores_test

    if conf_method.startswith('softmax_temp_scaling'):
        metric = conf_method.split('_')[-1]
        opt_temp = np.load(f'{base_path}/softmax_opt_temp_{metric}.npy')
        scores_class = methods.softmax_prob(model, ds_test, opt_temp)
        scores_test = methods.max_softmax_prob(model, ds_test, opt_temp)
        return scores_class, scores_test

    if conf_method in ['pmi', 'psi', 'pvi', 'pvi_best']:
        method = conf_method
        metric = None
        temp = 1.0
    elif conf_method.startswith(('pmi_temp_scaling', 'psi_temp_scaling', 'pvi_temp_scaling', 'pvi_best_temp_scaling')):
        parts = conf_method.split('_')
        method = '_'.join(parts[:2]) if 'best' in parts else parts[0]
        metric = parts[-1]
        method_dir = {
            'pmi': 'pmi/separable_variational_f_js',
            'psi': 'psi/gaussian',
            'pvi': 'pvi/training_from_scratch',
            'pvi_best': 'pvi/training_from_scratch'
        }[method]
        temp = float(np.load(f'{base_path}/{method_dir}/{method}_opt_temp_{metric}.npy'))
    else:
        raise ValueError(f"Unknown confidence method: {conf_method}")

    method_paths = {
        'pmi': (f'{base_path}/pmi/separable_variational_f_js', 'pmi_output_class_test.npy'),
        'psi': (f'{base_path}/psi/gaussian', 'psi_output_class_500_projs_test.npy'),
        'pvi': (f'{base_path}/pvi/training_from_scratch', 'pvi_class_test.npy'),
        'pvi_best': (f'{base_path}/pvi/training_from_scratch', 'pvi_class_best_test.npy'),
    }

    method_path, class_file = method_paths[method]
    scores_class = np.load(f'{method_path}/{class_file}')
    scores_class = softmax_scaled(scores_class, temp)
    scores_test = np.array([score[pred] for score, pred in zip(scores_class, pred_y_test)])
    return scores_class, scores_test

def evaluate_calibration(ds_test, true_y_test, conf_method, n_runs=10):
    results = {
        "ece": [],
        "cc_ece": [],
        "mce": [],
        "ace": [],
        "sce": [],
        "ada_ece": [],
        "ada_sce": [],
        "cc_ada_ece": [],
        "cc_ada_sce": [],
        "cc_ada_sce_rms": [],
        "cw_ece": [],
        "cw_sce": [],
        "cw_ada_ece": [],
        "cw_ada_sce": [],
        "cw_ada_ece_rms": [],
        "cw_ada_sce_rms": [],
        "nll": [],
        "bs": [],
        "sharpness": [],
    }

    for run in range(n_runs):
        tf.keras.utils.set_random_seed(run + 10)
        model = create_model()
        model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

        pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
        scores_class, scores_test = get_scores_for_calibration(
            conf_method, model, ds_test, pred_y_test, run, model_name, dataset_name
        )

#         results["ece"].append(metrics.compute_ece(scores_test, pred_y_test, true_y_test, 15))
#         results["cc_ece"].append(metrics.compute_cc_ece(scores_test, pred_y_test, true_y_test, 15))
#         results["mce"].append(metrics.compute_mce(scores_test, pred_y_test, true_y_test, 15))
#         results["ace"].append(metrics.compute_ace(scores_test, pred_y_test, true_y_test, 15))
#         results["sce"].append(metrics.compute_sce(scores_class, true_y_test, num_classes, 15))
#         results["ada_ece"].append(metrics.compute_adaece(scores_test, pred_y_test, true_y_test, 15))
#         results["ada_sce"].append(metrics.compute_adasce(scores_class, true_y_test, num_classes, 15))
#         results["cc_ada_ece"].append(metrics.compute_cc_adaece(scores_test, pred_y_test, true_y_test, 15))
#         results["cc_ada_sce"].append(metrics.compute_cc_adasce(scores_class, true_y_test, num_classes, 15))
        results["cc_ada_sce_rms"].append(metrics.compute_cc_adasce_rms(scores_class, true_y_test, num_classes, 15))
#         results["cw_ece"].append(metrics.compute_cw_ece(scores_class, true_y_test, num_classes, 15))
#         results["cw_sce"].append(metrics.compute_cw_sce(scores_class, true_y_test, num_classes, 15))
#         results["cw_ada_ece"].append(metrics.compute_cw_adaece(scores_class, true_y_test, num_classes, 15))
#         results["cw_ada_sce"].append(metrics.compute_cw_adasce(scores_class, true_y_test, num_classes, 15))
#         results["cw_ada_ece_rms"].append(metrics.compute_cw_adaece_rms(scores_class, true_y_test, num_classes, 15))
#         results["cw_ada_sce_rms"].append(metrics.compute_cw_adaece_rms(scores_class, true_y_test, num_classes, 15))
#         results["nll"].append(metrics.compute_nll(scores_class, true_y_test, num_classes))
#         results["bs"].append(metrics.compute_brier_score(scores_class, true_y_test, num_classes))
#         results["sharpness"].append(metrics.compute_sharpness(scores_class))

    return results

In [5]:
methods_list = ['softmax','pmi','psi','pvi','pvi_best',
                'softmax_temp_scaling_nll','pmi_temp_scaling_nll','psi_temp_scaling_nll','pvi_temp_scaling_nll','pvi_best_temp_scaling_nll']
for method in methods_list:
    print(f'Method: {method}')
    results = evaluate_calibration(ds_test, true_y_test, conf_method=f'{method}', n_runs=10)
#     print(f"ECE:            {utils.format_ci(results['ece'], scale=100)}")
#     print(f"CC-ECE:         {utils.format_ci(results['cc_ece'], scale=100)}")
#     print(f"MCE:            {utils.format_ci(results['mce'], scale=100)}")
#     print(f"ACE:            {utils.format_ci(results['ace'], scale=100)}")
#     print(f"SCE:            {utils.format_ci(results['sce'], scale=100)}")
#     print(f"Ada-ECE:        {utils.format_ci(results['ada_ece'], scale=100)}")
#     print(f"Ada-SCE:        {utils.format_ci(results['ada_sce'], scale=100)}")
#     print(f"CC-Ada-ECE:     {utils.format_ci(results['cc_ada_ece'], scale=100)}")
#     print(f"CC-Ada-SCE:     {utils.format_ci(results['cc_ada_sce'], scale=100)}")
    print(f"CC-Ada-SCE-RMS: {utils.format_ci(results['cc_ada_sce_rms'], scale=100)}")
#     print(f"CW-ECE:         {utils.format_ci(results['cw_ece'], scale=100)}")
#     print(f"CW-SCE:         {utils.format_ci(results['cw_sce'], scale=100)}")
#     print(f"CW-Ada-ECE:     {utils.format_ci(results['cw_ada_ece'], scale=100)}")
#     print(f"CW-Ada-SCE:     {utils.format_ci(results['cw_ada_sce'], scale=100)}")
#     print(f"CW-Ada-ECE-RMS: {utils.format_ci(results['cw_ada_ece_rms'], scale=100)}")
#     print(f"CW-Ada-SCE-RMS: {utils.format_ci(results['cw_ada_sce_rms'], scale=100)}")
#     print(f"NLL:            {utils.format_ci(results['nll'], scale=100)}")
#     print(f"Brier Score:    {utils.format_ci(results['bs'], scale=100)}")
#     print(f"Sharpness:      {utils.format_ci(results['sharpness'], scale=100)}")

Method: softmax


2025-04-09 08:55:31.137098: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:467] Loaded cuDNN version 90100


CC-Ada-SCE-RMS: 5.81 (0.34)
Method: pmi
CC-Ada-SCE-RMS: 7.59 (0.13)
Method: psi
CC-Ada-SCE-RMS: 9.36 (0.19)
Method: pvi
CC-Ada-SCE-RMS: 8.12 (0.17)
Method: pvi_best
CC-Ada-SCE-RMS: 8.01 (0.04)
Method: softmax_temp_scaling_nll
CC-Ada-SCE-RMS: 8.11 (0.18)
Method: pmi_temp_scaling_nll
CC-Ada-SCE-RMS: 8.46 (0.12)
Method: psi_temp_scaling_nll
CC-Ada-SCE-RMS: 8.52 (0.13)
Method: pvi_temp_scaling_nll
CC-Ada-SCE-RMS: 8.36 (0.14)
Method: pvi_best_temp_scaling_nll
CC-Ada-SCE-RMS: 8.19 (0.00)


In [12]:
def apply_ets(logits, opt_temp, opt_weights, n_class):
    p1 = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    scaled_logits = logits / opt_temp
    p0 = np.exp(scaled_logits) / np.sum(np.exp(scaled_logits), axis=1, keepdims=True)
    p2 = np.ones_like(p0) / n_class
    w = opt_weights / np.sum(opt_weights)  # just in case
    calibrated_probs = w[0] * p0 + w[1] * p1 + w[2] * p2
    return calibrated_probs


method = 'softmax ETS'
print(f'Method: {method}')
results = {
        "sce": [],
        "ada_sce": [],
        "cc_ada_sce": [],
        "cc_ada_sce_rms": [],
        "nll": [],
        "bs": [],
    }
for run in range(10):
    tf.keras.utils.set_random_seed(run + 10)
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)

    logits = model.predict(ds_test.batch(512), verbose=0)

    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
    opt_temp = np.load(f'{base_path}/softmax_opt_temp_ets_nll.npy')
    opt_weights = np.load(f'{base_path}/softmax_opt_weights_ets_nll.npy')

    scores_class = apply_ets(logits,opt_temp,opt_weights,num_classes)

    results["sce"].append(metrics.compute_sce(scores_class, true_y_test, num_classes, 15))
    results["ada_sce"].append(metrics.compute_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce"].append(metrics.compute_cc_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce_rms"].append(metrics.compute_cc_adasce_rms(scores_class, true_y_test, num_classes, 15))
    results["nll"].append(metrics.compute_nll(scores_class, true_y_test, num_classes))
    results["bs"].append(metrics.compute_brier_score(scores_class, true_y_test, num_classes))
        
print(f"SCE:            {utils.format_ci(results['sce'], scale=100)}")
print(f"Ada-SCE:        {utils.format_ci(results['ada_sce'], scale=100)}")
print(f"CC-Ada-SCE:     {utils.format_ci(results['cc_ada_sce'], scale=100)}")
print(f"CC-Ada-SCE-RMS: {utils.format_ci(results['cc_ada_sce_rms'], scale=100)}")
print(f"NLL:            {utils.format_ci(results['nll'], scale=100)}")
print(f"Brier Score:    {utils.format_ci(results['bs'], scale=100)}")

Method: softmax ETS
SCE:            0.59 (0.04)
Ada-SCE:        0.51 (0.04)
CC-Ada-SCE:     1.16 (0.05)
CC-Ada-SCE-RMS: 8.14 (0.17)
NLL:            27.64 (0.87)
Brier Score:    13.07 (0.40)


In [15]:
def apply_ets(logits, opt_temp, opt_weights, n_class):
    p1 = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    scaled_logits = logits / opt_temp
    p0 = np.exp(scaled_logits) / np.sum(np.exp(scaled_logits), axis=1, keepdims=True)
    p2 = np.ones_like(p0) / n_class
    w = opt_weights / np.sum(opt_weights)  # just in case
    calibrated_probs = w[0] * p0 + w[1] * p1 + w[2] * p2
    return calibrated_probs


method = 'PVI ETS'
print(f'Method: {method}')
results = {
        "sce": [],
        "ada_sce": [],
        "cc_ada_sce": [],
        "cc_ada_sce_rms": [],
        "nll": [],
        "bs": [],
    }
for run in range(10):
    tf.keras.utils.set_random_seed(run + 10)
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
    pvi =  np.load(f'{base_path}/pvi/training_from_scratch/pvi_class_test.npy')
    opt_temp = np.load(f'{base_path}/pvi/training_from_scratch/pvi_opt_temp_ets_nll.npy')
    opt_weights = np.load(f'{base_path}/pvi/training_from_scratch/pvi_opt_weights_ets_nll.npy')

    scores_class = apply_ets(pvi,opt_temp,opt_weights,num_classes)

    results["sce"].append(metrics.compute_sce(scores_class, true_y_test, num_classes, 15))
    results["ada_sce"].append(metrics.compute_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce"].append(metrics.compute_cc_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce_rms"].append(metrics.compute_cc_adasce_rms(scores_class, true_y_test, num_classes, 15))
    results["nll"].append(metrics.compute_nll(scores_class, true_y_test, num_classes))
    results["bs"].append(metrics.compute_brier_score(scores_class, true_y_test, num_classes))
        
print(f"SCE:            {utils.format_ci(results['sce'], scale=100)}")
print(f"Ada-SCE:        {utils.format_ci(results['ada_sce'], scale=100)}")
print(f"CC-Ada-SCE:     {utils.format_ci(results['cc_ada_sce'], scale=100)}")
print(f"CC-Ada-SCE-RMS: {utils.format_ci(results['cc_ada_sce_rms'], scale=100)}")
print(f"NLL:            {utils.format_ci(results['nll'], scale=100)}")
print(f"Brier Score:    {utils.format_ci(results['bs'], scale=100)}")

Method: PVI ETS
SCE:            0.59 (0.04)
Ada-SCE:        0.54 (0.05)
CC-Ada-SCE:     1.24 (0.04)
CC-Ada-SCE-RMS: 8.30 (0.15)
NLL:            27.74 (0.79)
Brier Score:    13.03 (0.40)


In [6]:
method = 'softmax ETS'
print(f'Method: {method}')
results = {
        "sce": [],
        "ada_sce": [],
        "cc_ada_sce": [],
        "cc_ada_sce_rms": [],
        "nll": [],
        "bs": [],
    }
for run in range(10):
    tf.keras.utils.set_random_seed(run + 10)
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)

    logits = model.predict(ds_test.batch(512), verbose=0)

    pts_loaded = temp_scaling.PTSCalibrator(
        epochs=0,
        lr=1e-3,
        weight_decay=1e-4,
        batch_size=64,
        nlayers=2,
        n_nodes=32,
        length_logits=10,
        top_k_logits=5
    )
    pts_loaded.load(path=f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/calibration_model/')
    scores_class = pts_loaded.calibrate(logits)

    results["sce"].append(metrics.compute_sce(scores_class, true_y_test, num_classes, 15))
    results["ada_sce"].append(metrics.compute_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce"].append(metrics.compute_cc_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce_rms"].append(metrics.compute_cc_adasce_rms(scores_class, true_y_test, num_classes, 15))
    results["nll"].append(metrics.compute_nll(scores_class, true_y_test, num_classes))
    results["bs"].append(metrics.compute_brier_score(scores_class, true_y_test, num_classes))
        
print(f"SCE:            {utils.format_ci(results['sce'], scale=100)}")
print(f"Ada-SCE:        {utils.format_ci(results['ada_sce'], scale=100)}")
print(f"CC-Ada-SCE:     {utils.format_ci(results['cc_ada_sce'], scale=100)}")
print(f"CC-Ada-SCE-RMS: {utils.format_ci(results['cc_ada_sce_rms'], scale=100)}")
print(f"NLL:            {utils.format_ci(results['nll'], scale=100)}")
print(f"Brier Score:    {utils.format_ci(results['bs'], scale=100)}")

Method: softmax ETS
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_3/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_4/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_5/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_6/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_7/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_8/calibration/calibration_model/pts_model.h5
Loaded PTS m

In [6]:
method = 'PVI PTS'
print(f'Method: {method}')
results = {
        "sce": [],
        "ada_sce": [],
        "cc_ada_sce": [],
        "cc_ada_sce_rms": [],
        "nll": [],
        "bs": [],
    }
for run in range(10):
    tf.keras.utils.set_random_seed(run + 10)
    model = create_model()
    model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')

    pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)
    base_path = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/'
    pvi =  np.load(f'{base_path}/pvi/training_from_scratch/pvi_class_test.npy')

    pts_loaded = temp_scaling.PTSCalibrator(
        epochs=0,
        lr=1e-3,
        weight_decay=1e-4,
        batch_size=64,
        nlayers=2,
        n_nodes=32,
        length_logits=10,
        top_k_logits=5
    )
    pts_loaded.load(path=f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/calibration_model/')
    scores_class = pts_loaded.calibrate(pvi)

    results["sce"].append(metrics.compute_sce(scores_class, true_y_test, num_classes, 15))
    results["ada_sce"].append(metrics.compute_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce"].append(metrics.compute_cc_adasce(scores_class, true_y_test, num_classes, 15))
    results["cc_ada_sce_rms"].append(metrics.compute_cc_adasce_rms(scores_class, true_y_test, num_classes, 15))
    results["nll"].append(metrics.compute_nll(scores_class, true_y_test, num_classes))
    results["bs"].append(metrics.compute_brier_score(scores_class, true_y_test, num_classes))
        
print(f"SCE:            {utils.format_ci(results['sce'], scale=100)}")
print(f"Ada-SCE:        {utils.format_ci(results['ada_sce'], scale=100)}")
print(f"CC-Ada-SCE:     {utils.format_ci(results['cc_ada_sce'], scale=100)}")
print(f"CC-Ada-SCE-RMS: {utils.format_ci(results['cc_ada_sce_rms'], scale=100)}")
print(f"NLL:            {utils.format_ci(results['nll'], scale=100)}")
print(f"Brier Score:    {utils.format_ci(results['bs'], scale=100)}")

Method: PVI PTS
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_1/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_2/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_3/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_4/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_5/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_6/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_7/calibration/calibration_model/pts_model.h5
Loaded PTS model weights from: ../results/PI_Explainability/vgg16_stl10/run_8/calibration/calibration_model/pts_model.h5
Loaded PTS model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

run = 0
conf_bin_num=15

def reliability_params(df):
    df['correct'] = (df.pred == df.true).astype(int)
    bins = np.linspace(0, 1, conf_bin_num + 1)
    df['bin'] = pd.cut(df['conf'], bins=bins, include_lowest=True, labels=False)

    bin_acc = df.groupby('bin')['correct'].mean()
    bin_counts = df.groupby('bin')['conf'].count()

    bin_centers = (bins[:-1] + bins[1:]) / 2

    acc = np.zeros(conf_bin_num)
    for i in range(conf_bin_num):
        acc[i] = bin_acc[i] if i in bin_acc else 0
    return acc, bin_centers

model = create_model()
model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)

exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pmi/separable_variational_f_js'
opt_temp = np.load(f'{exp_name}/pmi_opt_temp_nll.npy')
scores_class = np.load(f'{exp_name}/pmi_output_class_test.npy')
scores_class = np.array([utils.softmax(x/opt_temp) for x in scores_class])
scores_test = np.array([score[pred_value] for score, pred_value in zip(scores_class, pred_y_test)])

df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
acc, bin_centers = reliability_params(df)
plt.bar(bin_centers, acc, width=1.0/conf_bin_num, color='red', edgecolor='black', align='center', alpha=0.7, label='PMI')

# exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/psi/gaussian'
# opt_temp = np.load(f'{exp_name}/psi_opt_temp_nll.npy')
# scores_class = np.load(f'{exp_name}/psi_output_class_500_projs_test.npy')
# scores_class = np.array([utils.softmax(x/opt_temp) for x in scores_class])
# scores_test = np.array([score[pred_value] for score, pred_value in zip(scores_class, pred_y_test)])

# df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
# acc, bin_centers = reliability_params(df)
# plt.bar(bin_centers, acc, width=1.0/conf_bin_num, edgecolor='black', align='center', alpha=0.7, label='PSI')

# exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
# opt_temp = np.load(f'{exp_name}/pvi_opt_temp_nll.npy')
# scores_class = np.load(f'{exp_name}/pvi_class_test.npy')
# scores_class = np.array([utils.softmax(x/opt_temp) for x in scores_class])
# scores_test = np.array([score[pred_value] for score, pred_value in zip(scores_class, pred_y_test)])

# df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
# acc, bin_centers = reliability_params(df)
# plt.bar(bin_centers, acc, width=1.0/conf_bin_num, color='red', edgecolor='black', align='center', alpha=0.7, label='PVI')

opt_temp = np.load(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_nll.npy')
scores_class = methods.softmax_prob(model, ds_test, opt_temp)
scores_test = methods.max_softmax_prob(model, ds_test, opt_temp)

df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
acc, bin_centers = reliability_params(df)

plt.bar(bin_centers, acc, width=1.0/conf_bin_num, color='blue', edgecolor='black', align='center', alpha=0.7, label='Softmax')

plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
plt.xlabel('Confidence')
plt.ylabel('Accuracy')
plt.title('Reliability Diagram')
plt.grid(True)
plt.legend()
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.show()

In [None]:
def ece_contrib_params(df):
    
    df['correct'] = (df.pred == df.true).astype(int)
    bins = np.linspace(0, 1, conf_bin_num + 1)
    df['bin'] = pd.cut(df['conf'], bins=bins, include_lowest=True, labels=False)

    bin_acc = df.groupby('bin')['correct'].mean()
    bin_conf = df.groupby('bin')['conf'].mean()
    bin_counts = df.groupby('bin')['conf'].count()
    total = len(df)
    
    bin_centers = (bins[:-1] + bins[1:]) / 2

    bin_ece_contrib = np.zeros(conf_bin_num)
    for i in range(conf_bin_num):
        if i in bin_acc and i in bin_conf:
            bin_ece_contrib[i] = np.abs(bin_acc[i] - bin_conf[i]) * (bin_counts[i] / total)
    return bin_ece_contrib, bin_centers

model = create_model()
model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)

exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
opt_temp = np.load(f'{exp_name}/pvi_opt_temp_nll.npy')
scores_class = np.load(f'{exp_name}/pvi_class_test.npy')
scores_class = np.array([utils.softmax(x/opt_temp) for x in scores_class])
scores_test = np.array([score[pred_value] for score, pred_value in zip(scores_class, pred_y_test)])

df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
bin_ece_contrib, bin_centers = ece_contrib_params(df)
plt.figure(figsize=(7, 5))
plt.bar(bin_centers, bin_ece_contrib, width=1.0/conf_bin_num, color='red', edgecolor='black', align='center', alpha=0.7, label='PVI')

opt_temp = np.load(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_nll.npy')
scores_class = methods.softmax_prob(model, ds_test, opt_temp)
scores_test = methods.max_softmax_prob(model, ds_test, opt_temp)

df = pd.DataFrame({'conf': scores_test, 'true': true_y_test, 'pred': pred_y_test})
bin_ece_contrib, bin_centers = ece_contrib_params(df)

plt.bar(bin_centers, bin_ece_contrib, width=1.0/conf_bin_num, color='blue', edgecolor='black', align='center', alpha=0.7, label='Softmax')

plt.xlabel('Confidence')
plt.ylabel('ECE Contribution')
plt.title('Per-Bin Contribution to ECE')
plt.grid(True)
plt.show()

In [None]:
def bin_counts_params(conf):
    bins = np.linspace(0, 1, conf_bin_num + 1)
    bin_indices = pd.cut(conf, bins=bins, include_lowest=True, labels=False)

    counts = pd.Series(bin_indices).value_counts().sort_index()
    full_counts = np.zeros(conf_bin_num)
    for i in range(conf_bin_num):
        if i in counts:
            full_counts[i] = counts[i]

    bin_centers = (bins[:-1] + bins[1:]) / 2
    return full_counts, bin_centers

model = create_model()
model.load_weights(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/saved_models/trained_weights.h5')
pred_y_test = np.argmax(model.predict(ds_test.batch(256), verbose=0), axis=1)

exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/training_from_scratch'
opt_temp = np.load(f'{exp_name}/pvi_opt_temp_nll.npy')
scores_class = np.load(f'{exp_name}/pvi_class_test.npy')
scores_class = np.array([utils.softmax(x/opt_temp) for x in scores_class])
scores_test = np.array([score[pred_value] for score, pred_value in zip(scores_class, pred_y_test)])

full_counts, bin_centers = bin_counts_params(scores_test)
plt.figure(figsize=(7, 5))
plt.bar(bin_centers, full_counts, width=1.0/conf_bin_num, color='red', edgecolor='black', align='center', label='PVI')

opt_temp = np.load(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/softmax_opt_temp_nll.npy')
scores_class = methods.softmax_prob(model, ds_test, opt_temp)
scores_test = methods.max_softmax_prob(model, ds_test, opt_temp)

full_counts, bin_centers = bin_counts_params(scores_test)

plt.bar(bin_centers, full_counts, width=1.0/conf_bin_num, color='blue', edgecolor='black', align='center', label='Softmax')

plt.xlabel('Confidence')
plt.ylabel('Sample Count')
plt.title('Number of Samples per Confidence Bin')
plt.grid(True)
plt.show()