# Calibration

Evaluating calibration methods on convolutional neural networks.

In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

from cal_methods import TemperatureScaling, evaluate, softmax, cal_results

Using TensorFlow backend.


Paths to files with logits.

In [2]:
# Root directory that the relative entries in `files` are resolved against.
# Set the NN_CALIBRATION_PATH environment variable to point at your own
# checkout instead of editing this cell; the original author's location is
# kept as the fallback so existing runs behave exactly as before.
PATH = os.environ.get('NN_CALIBRATION_PATH',
                      '/Users/wildflowerlyi/Desktop/Github/NN_calibration/')
# Keep the trailing separator that the default value has, in case
# cal_results concatenates PATH and the file name directly (its
# implementation is not visible from this notebook).
PATH = os.path.join(PATH, '')

# Logit files to evaluate, relative to PATH. The order here is the order of
# the rows in the result tables below.
files = (
    'resnet_cifar/probs_resnet110_c10clip_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_augmean_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_2250mean_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_1125mean_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_560mean_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_2250_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_1125_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_560_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_interpol2_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_interpol2_2250_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_interpol2_1125_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_aug_interpol2_560_logits.p',
)

### Isotonic Regression

In [3]:
# Evaluate every file in `files` before and after isotonic-regression
# calibration. The dict is presumably forwarded to the IsotonicRegression
# constructor (y_min / y_max bound the fitted values to [0, 1]) -- confirm
# against cal_methods.cal_results. The exact meaning of approach="single" is
# defined in cal_methods and is not visible here.
df_iso = cal_results(
    IsotonicRegression,
    PATH,
    files,
    {'y_min': 0, 'y_max': 1},
    approach="single"
)

cifar/probs_resnet110_c10clip
('Accuracy:', 93.43)
('Error:', 6.569999999999993)
('ECE:', 0.04945112048983577)
('MCE:', 0.3290730118751526)
('Loss:', 0.37601464391059347)
cifar/probs_resnet110_c10clip_augmean
('Accuracy:', 93.01)
('Error:', 6.989999999999995)
('ECE:', 0.04638580127060413)
('MCE:', 0.32111917436122894)
('Loss:', 0.3885267087939526)
cifar/probs_resnet110_c10clip_aug_2250mean
('Accuracy:', 92.69)
('Error:', 7.310000000000002)
('ECE:', 0.05112848895192146)
('MCE:', 0.3638733774423599)
('Loss:', 0.37633696130207867)
cifar/probs_resnet110_c10clip_aug_1125mean
('Accuracy:', 92.82000000000001)
('Error:', 7.179999999999993)
('ECE:', 0.05065809172987938)
('MCE:', 0.33941681563854215)
('Loss:', 0.3812145050069002)
cifar/probs_resnet110_c10clip_aug_560mean
('Accuracy:', 93.27)
('Error:', 6.730000000000004)
('ECE:', 0.04739113273620607)
('MCE:', 0.27788410355283333)
('Loss:', 0.3663087175342839)
cifar/probs_resnet110_c10clip_aug
('Accuracy:', 92.47999999999999)
('Error:', 7.5200000

### Temperature Scaling

In [4]:
# Evaluate every file in `files` before and after temperature scaling.
# No extra constructor arguments are passed; the exact meaning of
# approach="all" is defined in cal_methods and is not visible here.
df_temp_scale = cal_results(
    TemperatureScaling,
    PATH,
    files,
    approach="all"
)

cifar/probs_resnet110_c10clip
('Accuracy:', 93.43)
('Error:', 6.569999999999993)
('ECE:', 0.04945112048983577)
('MCE:', 0.3290730118751526)
('Loss:', 0.37601464391059347)
cifar/probs_resnet110_c10clip_augmean
('Accuracy:', 93.01)
('Error:', 6.989999999999995)
('ECE:', 0.04638580127060413)
('MCE:', 0.32111917436122894)
('Loss:', 0.3885267087939526)
cifar/probs_resnet110_c10clip_aug_2250mean
('Accuracy:', 92.69)
('Error:', 7.310000000000002)
('ECE:', 0.05112848895192146)
('MCE:', 0.3638733774423599)
('Loss:', 0.37633696130207867)
cifar/probs_resnet110_c10clip_aug_1125mean
('Accuracy:', 92.82000000000001)
('Error:', 7.179999999999993)
('ECE:', 0.05065809172987938)
('MCE:', 0.33941681563854215)
('Loss:', 0.3812145050069002)
cifar/probs_resnet110_c10clip_aug_560mean
('Accuracy:', 93.27)
('Error:', 6.730000000000004)
('ECE:', 0.04739113273620607)
('MCE:', 0.27788410355283333)
('Loss:', 0.3663087175342839)
cifar/probs_resnet110_c10clip_aug
('Accuracy:', 92.47999999999999)
('Error:', 7.5200000

#### Calibrated scores for CIFAR datasets.

In [5]:
# Isotonic-regression results: each uncalibrated row is followed by the
# "_calib" row for the same file.
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10clip,6.57,0.049451,0.329073,0.376015
1,cifar/probs_resnet110_c10clip_calib,6.45,0.013637,0.098281,0.277402
2,cifar/probs_resnet110_c10clip_augmean,6.99,0.046386,0.321119,0.388527
3,cifar/probs_resnet110_c10clip_augmean_calib,7.13,0.0134,0.181916,0.351594
4,cifar/probs_resnet110_c10clip_aug_2250mean,7.31,0.051128,0.363873,0.376337
5,cifar/probs_resnet110_c10clip_aug_2250mean_calib,7.27,0.015229,0.262662,0.291171
6,cifar/probs_resnet110_c10clip_aug_1125mean,7.18,0.050658,0.339417,0.381215
7,cifar/probs_resnet110_c10clip_aug_1125mean_calib,7.19,0.014753,0.136246,0.34701
8,cifar/probs_resnet110_c10clip_aug_560mean,6.73,0.047391,0.277884,0.366309
9,cifar/probs_resnet110_c10clip_aug_560mean_calib,6.93,0.010145,0.089232,0.277712


In [6]:
# Temperature-scaling results: each uncalibrated row is followed by the
# "_calib" row for the same file.
df_temp_scale

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10clip,6.57,0.049451,0.329073,0.376015
1,cifar/probs_resnet110_c10clip_calib,6.57,0.009663,0.132728,0.21464
2,cifar/probs_resnet110_c10clip_augmean,6.99,0.046386,0.321119,0.388527
3,cifar/probs_resnet110_c10clip_augmean_calib,6.99,0.008131,0.250516,0.267974
4,cifar/probs_resnet110_c10clip_aug_2250mean,7.31,0.051128,0.363873,0.376337
5,cifar/probs_resnet110_c10clip_aug_2250mean_calib,7.31,0.009419,0.240732,0.256992
6,cifar/probs_resnet110_c10clip_aug_1125mean,7.18,0.050658,0.339417,0.381215
7,cifar/probs_resnet110_c10clip_aug_1125mean_calib,7.18,0.011827,0.120542,0.252089
8,cifar/probs_resnet110_c10clip_aug_560mean,6.73,0.047391,0.277884,0.366309
9,cifar/probs_resnet110_c10clip_aug_560mean_calib,6.73,0.006387,0.145598,0.234207


## Dataframes with results

In [7]:
# Column labels for the summary tables: the model name, the uncalibrated
# baseline, then one label per calibration method.
names = ["Name", "Uncalibrated", "Isotonic Regression", "Temperature Scaling"]

# Result frames to compare. get_dataframe appends one column per frame in
# this order, so it must line up with the method labels in `names`.
dfs = [df_iso, df_temp_scale]


def get_dataframe(dfs, column, names):
    """Collect one metric from several calibration result frames into one table.

    Every frame in ``dfs`` is read as consecutive row pairs: the row at an even
    position (0, 2, ...) holds the uncalibrated scores of a model and the row
    right after it (1, 3, ...) holds the scores after calibration.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One result frame per calibration method. Model names and the
        uncalibrated scores are taken from ``dfs[0]``; the calibrated score
        is taken from every frame, in order.
    column : str
        Name of the metric column to extract (e.g. "ECE").
    names : list of str
        Column labels of the resulting table. Must contain "Name" (used as
        the index), followed by a label for the uncalibrated score and one
        label per frame in ``dfs``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by "Name", with the uncalibrated score and
        the calibrated score of each method as columns.

    Raises
    ------
    IndexError
        If ``dfs`` is empty.
    """
    reference = dfs[0]
    rows = []

    # Iterate over the positions of the calibrated rows. The bound comes from
    # the frames passed in, not from the notebook-global ``df_iso``, so the
    # function also works for other inputs.
    for calib_pos in range(1, len(reference), 2):
        uncalibrated_row = reference.iloc[calib_pos - 1]

        # Model name and its uncalibrated score come first ...
        row = [uncalibrated_row["Name"], uncalibrated_row[column]]
        # ... followed by the calibrated score from every method.
        row.extend(df.iloc[calib_pos][column] for df in dfs)

        rows.append(row)

    # Build the frame once instead of growing it row by row.
    df_res = pd.DataFrame(rows, columns=names)

    return df_res.set_index('Name')

In [8]:
# One summary table per metric, each comparing the uncalibrated score with
# every calibration method in `dfs`.
df_error, df_ece, df_mce, df_loss = (
    get_dataframe(dfs, metric, names)
    for metric in ("Error", "ECE", "MCE", "Loss")
)

## Scores

In [9]:
def highlight_min(s, color='yellow'):
    '''
    Build CSS styles that highlight the minimum of a Series.

    Intended for ``Styler.apply``: every entry equal to the minimum of ``s``
    gets a background colour, so ties are all highlighted.

    Parameters
    ----------
    s : pandas.Series
        Values of one row or column.
    color : str, optional
        CSS colour used for the highlighted cells (default 'yellow').

    Returns
    -------
    list of str
        One CSS string per entry of ``s``; '' for entries that are not the
        minimum.
    '''
    is_min = s == s.min()
    highlighted = 'background-color: {}'.format(color)
    return [highlighted if flag else '' for flag in is_min]

## Error Rate

In [10]:
# Highlight the lowest error rate in each row (axis = 1); ties are all highlighted.
df_error.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10clip,6.57,6.45,6.57
cifar/probs_resnet110_c10clip_augmean,6.99,7.13,6.99
cifar/probs_resnet110_c10clip_aug_2250mean,7.31,7.27,7.31
cifar/probs_resnet110_c10clip_aug_1125mean,7.18,7.19,7.18
cifar/probs_resnet110_c10clip_aug_560mean,6.73,6.93,6.73
cifar/probs_resnet110_c10clip_aug,7.52,7.48,7.52
cifar/probs_resnet110_c10clip_aug_2250,7.26,7.31,7.26
cifar/probs_resnet110_c10clip_aug_1125,7.06,7.16,7.06
cifar/probs_resnet110_c10clip_aug_560,6.71,6.83,6.71
cifar/probs_resnet110_c10clip_aug_interpol2,6.99,6.97,6.99


## ECE

In [11]:
# Highlight the lowest ECE in each row (axis = 1).
df_ece.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10clip,0.0494511,0.0136369,0.00966252
cifar/probs_resnet110_c10clip_augmean,0.0463858,0.0133997,0.00813075
cifar/probs_resnet110_c10clip_aug_2250mean,0.0511285,0.0152291,0.00941942
cifar/probs_resnet110_c10clip_aug_1125mean,0.0506581,0.0147528,0.0118265
cifar/probs_resnet110_c10clip_aug_560mean,0.0473911,0.0101452,0.00638738
cifar/probs_resnet110_c10clip_aug,0.0495661,0.0147355,0.00701188
cifar/probs_resnet110_c10clip_aug_2250,0.0500325,0.0172277,0.0116023
cifar/probs_resnet110_c10clip_aug_1125,0.0483665,0.0124107,0.00658389
cifar/probs_resnet110_c10clip_aug_560,0.0473179,0.0122243,0.00649982
cifar/probs_resnet110_c10clip_aug_interpol2,0.05177,0.0156049,0.009412


## MCE

In [12]:
# Highlight the lowest MCE in each row (axis = 1).
df_mce.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10clip,0.329073,0.0982814,0.132728
cifar/probs_resnet110_c10clip_augmean,0.321119,0.181916,0.250516
cifar/probs_resnet110_c10clip_aug_2250mean,0.363873,0.262662,0.240732
cifar/probs_resnet110_c10clip_aug_1125mean,0.339417,0.136246,0.120542
cifar/probs_resnet110_c10clip_aug_560mean,0.277884,0.0892319,0.145598
cifar/probs_resnet110_c10clip_aug,0.253617,0.384103,0.199705
cifar/probs_resnet110_c10clip_aug_2250,0.315802,0.144837,0.242517
cifar/probs_resnet110_c10clip_aug_1125,0.282526,0.131619,0.13784
cifar/probs_resnet110_c10clip_aug_560,0.299693,0.366628,0.0772722
cifar/probs_resnet110_c10clip_aug_interpol2,0.307911,0.241892,0.256922


## Loss

In [13]:
# Highlight the lowest loss in each row (axis = 1).
df_loss.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10clip,0.376015,0.277402,0.21464
cifar/probs_resnet110_c10clip_augmean,0.388527,0.351594,0.267974
cifar/probs_resnet110_c10clip_aug_2250mean,0.376337,0.291171,0.256992
cifar/probs_resnet110_c10clip_aug_1125mean,0.381215,0.34701,0.252089
cifar/probs_resnet110_c10clip_aug_560mean,0.366309,0.277712,0.234207
cifar/probs_resnet110_c10clip_aug,0.393511,0.315992,0.276306
cifar/probs_resnet110_c10clip_aug_2250,0.399636,0.313541,0.27224
cifar/probs_resnet110_c10clip_aug_1125,0.37671,0.297253,0.250734
cifar/probs_resnet110_c10clip_aug_560,0.365818,0.294716,0.236182
cifar/probs_resnet110_c10clip_aug_interpol2,0.416593,0.303821,0.253339
