# Calibration

Evaluating calibration methods on convolutional neural networks.

In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

from cal_methods import TemperatureScaling, evaluate, softmax, cal_results

Using TensorFlow backend.


Paths to files with logits.

In [2]:
# Base path handed to cal_results together with `files`.
# The original machine-specific location stays as the default so existing
# setups keep working; set NN_CALIBRATION_SCRIPTS to point elsewhere instead
# of editing this cell.
PATH = os.environ.get(
    'NN_CALIBRATION_SCRIPTS',
    '/Users/wildflowerlyi/Desktop/Github/NN_calibration/scripts'
)

# Logit files to evaluate. Commented-out entries are currently disabled.
files = (
    'resnet_cifar/probs_resnet110_c10_logits.p',
    'resnet_wide/probs_resnet_wide32_c10_logits.p',
    'resnet_densenet/probs_densenet40_c10_logits.p',
    'resnet_cifar/probs_resnet110_c10clip_logits.p',
    'resnet_wide/probs_resnet_wide32_c10clip_logits.p',
    # 'resnet_sd/probs_resnet110_SD_c10clip_logits.p',
    'resnet_densenet/probs_densenet40_c10clip_logits.p',
    # 'resnet_cifar/probs_resnet110_c100_logits.p',
    # 'resnet_wide/probs_resnet_wide32_c100_logits.p',
    # 'resnet_densenet/probs_densenet40_c100_logits.p',
    # 'resnet_wide/probs_resnet_wide32_c100clip_logits.p',
)

### Isotonic Regression

In [3]:
# Run cal_results with IsotonicRegression over every entry in `files`.
# The dict carries the y_min/y_max bounds -- presumably forwarded to the
# IsotonicRegression constructor; confirm in cal_methods.
df_iso = cal_results(
    IsotonicRegression,
    PATH,
    files,
    {'y_min': 0, 'y_max': 1},
    approach="single"
)

cifar/probs_resnet110_c10
('Accuracy:', 93.489999999999995)
('Error:', 6.5100000000000051)
('ECE:', 0.045524321058392554)
('MCE:', 0.31361674944559731)
('Loss:', 0.3370770369223714)
wide/probs_resnet_wide32_c10
('Accuracy:', 93.799999999999997)
('Error:', 6.2000000000000028)
('ECE:', 0.047311796060204525)
('MCE:', 0.3678059713045756)
('Loss:', 0.37100536326453859)
densenet/probs_densenet40_c10
('Accuracy:', 92.900000000000006)
('Error:', 7.0999999999999943)
('ECE:', 0.051972183519601804)
('MCE:', 0.35494045674345875)
('Loss:', 0.41025025195239284)
cifar/probs_resnet110_c10clip
('Accuracy:', 93.430000000000007)
('Error:', 6.5699999999999932)
('ECE:', 0.04945112048983577)
('MCE:', 0.32907301187515259)
('Loss:', 0.37601464391059347)
wide/probs_resnet_wide32_c10clip
('Accuracy:', 94.040000000000006)
('Error:', 5.9599999999999937)
('ECE:', 0.044856612253189064)
('MCE:', 0.31552806931237376)
('Loss:', 0.35969095881889424)
densenet/probs_densenet40_c10clip
('Accuracy:', 93.109999999999999)
('

### Temperature scaling

In [4]:
# Run cal_results with TemperatureScaling over every entry in `files`,
# using the "all" approach (the isotonic cell uses "single").
df_temp_scale = cal_results(
    TemperatureScaling,
    PATH,
    files,
    approach="all"
)

cifar/probs_resnet110_c10
('Accuracy:', 93.489999999999995)
('Error:', 6.5100000000000051)
('ECE:', 0.045524321058392554)
('MCE:', 0.31361674944559731)
('Loss:', 0.3370770369223714)
wide/probs_resnet_wide32_c10
('Accuracy:', 93.799999999999997)
('Error:', 6.2000000000000028)
('ECE:', 0.047311796060204525)
('MCE:', 0.3678059713045756)
('Loss:', 0.37100536326453859)
densenet/probs_densenet40_c10
('Accuracy:', 92.900000000000006)
('Error:', 7.0999999999999943)
('ECE:', 0.051972183519601804)
('MCE:', 0.35494045674345875)
('Loss:', 0.41025025195239284)
cifar/probs_resnet110_c10clip
('Accuracy:', 93.430000000000007)
('Error:', 6.5699999999999932)
('ECE:', 0.04945112048983577)
('MCE:', 0.32907301187515259)
('Loss:', 0.37601464391059347)
wide/probs_resnet_wide32_c10clip
('Accuracy:', 94.040000000000006)
('Error:', 5.9599999999999937)
('ECE:', 0.044856612253189064)
('MCE:', 0.31552806931237376)
('Loss:', 0.35969095881889424)
densenet/probs_densenet40_c10clip
('Accuracy:', 93.109999999999999)
('

#### Uncalibrated and calibrated scores for the CIFAR-10 models.

In [5]:
# Isotonic regression results: each `<name>_calib` row follows its uncalibrated row.
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10,6.51,0.045524,0.313617,0.337077
1,cifar/probs_resnet110_c10_calib,6.41,0.012154,0.269047,0.259339
2,wide/probs_resnet_wide32_c10,6.2,0.047312,0.367806,0.371005
3,wide/probs_resnet_wide32_c10_calib,6.05,0.010466,0.083494,0.217106
4,densenet/probs_densenet40_c10,7.1,0.051972,0.35494,0.41025
5,densenet/probs_densenet40_c10_calib,7.1,0.012183,0.265471,0.281433
6,cifar/probs_resnet110_c10clip,6.57,0.049451,0.329073,0.376015
7,cifar/probs_resnet110_c10clip_calib,6.45,0.013637,0.098281,0.277402
8,wide/probs_resnet_wide32_c10clip,5.96,0.044857,0.315528,0.359691
9,wide/probs_resnet_wide32_c10clip_calib,5.68,0.012156,0.186214,0.232532


In [6]:
# Temperature scaling results: each `<name>_calib` row follows its uncalibrated row.
df_temp_scale

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10,6.51,0.045524,0.313617,0.337077
1,cifar/probs_resnet110_c10_calib,6.51,0.008471,0.100231,0.20831
2,wide/probs_resnet_wide32_c10,6.2,0.047312,0.367806,0.371005
3,wide/probs_resnet_wide32_c10_calib,6.2,0.008058,0.251238,0.186995
4,densenet/probs_densenet40_c10,7.1,0.051972,0.35494,0.41025
5,densenet/probs_densenet40_c10_calib,7.1,0.009334,0.096582,0.216684
6,cifar/probs_resnet110_c10clip,6.57,0.049451,0.329073,0.376015
7,cifar/probs_resnet110_c10clip_calib,6.57,0.009663,0.132728,0.21464
8,wide/probs_resnet_wide32_c10clip,5.96,0.044857,0.315528,0.359691
9,wide/probs_resnet_wide32_c10clip_calib,5.96,0.006065,0.242671,0.18537


## Dataframe with results 

In [7]:
# Result tables to combine; their order must match the method labels in `names`.
dfs = [
    df_iso,         # isotonic regression
    df_temp_scale,  # temperature scaling
]

# Column labels of the combined table: name, baseline, then one per method.
names = [
    "Name",
    "Uncalibrated",
    "Isotonic Regression",
    "Temperature Scaling",
]


def get_dataframe(dfs, column, names):
    """Collect one metric from several calibration result tables into one frame.

    Every frame in ``dfs`` is read as alternating rows: an even position holds
    a model's uncalibrated scores and the following odd position holds the
    scores after calibration. The model name and the uncalibrated score are
    taken from the first frame; one calibrated score is taken from each frame.

    Params:
        dfs: list of result DataFrames with the same row order, each having a
            "Name" column and a ``column`` column.
        column: label of the metric column to extract (e.g. "ECE").
        names: column labels for the result -- "Name", the label for the
            uncalibrated score, then one label per frame in ``dfs``.

    Returns:
        DataFrame indexed by "Name" with one row per model and one column for
        the uncalibrated score plus one per frame in ``dfs``.
    """
    reference = dfs[0]
    rows = []

    # Size the loop from the first frame passed in, not from the notebook
    # global `df_iso`, so the function works on any list of result frames.
    for i in range(1, len(reference), 2):
        baseline = reference.iloc[i - 1]  # uncalibrated row of this model

        row = [baseline["Name"], baseline[column]]
        # Calibrated score of the same model under each method.
        row.extend(df.iloc[i][column] for df in dfs)

        rows.append(row)

    # Build the frame once instead of appending row by row.
    df_res = pd.DataFrame(rows, columns=names)

    return df_res.set_index('Name')

In [8]:
# One comparison table per metric, built in a single pass over the metric names.
# A generator expression is used so no loop variable leaks into the notebook namespace.
df_error, df_ece, df_mce, df_loss = (
    get_dataframe(dfs, metric, names)
    for metric in ("Error", "ECE", "MCE", "Loss")
)

## Scores

In [9]:
def highlight_min(s):
    """Return one CSS string per element of ``s``, marking the minimum in yellow.

    Every element equal to ``s.min()`` gets 'background-color: yellow'; all
    other elements get an empty string. Intended for use with ``Styler.apply``.
    """
    smallest = s.min()
    is_min = s == smallest
    yellow = 'background-color: yellow'
    return [yellow if flag else '' for flag in is_min]

## Error Rate

In [10]:
# axis=1 applies highlight_min row by row, marking the lowest error rate per model.
df_error.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,6.51,6.41,6.51
wide/probs_resnet_wide32_c10,6.2,6.05,6.2
densenet/probs_densenet40_c10,7.1,7.1,7.1
cifar/probs_resnet110_c10clip,6.57,6.45,6.57
wide/probs_resnet_wide32_c10clip,5.96,5.68,5.96
densenet/probs_densenet40_c10clip,6.89,6.94,6.89


## ECE

In [11]:
# axis=1 applies highlight_min row by row, marking the lowest ECE per model.
df_ece.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,0.0455243,0.0121544,0.00847066
wide/probs_resnet_wide32_c10,0.0473118,0.0104662,0.00805839
densenet/probs_densenet40_c10,0.0519722,0.0121829,0.00933419
cifar/probs_resnet110_c10clip,0.0494511,0.0136369,0.00966252
wide/probs_resnet_wide32_c10clip,0.0448566,0.0121559,0.00606529
densenet/probs_densenet40_c10clip,0.0502588,0.0103795,0.00711893


## MCE

In [12]:
# axis=1 applies highlight_min row by row, marking the lowest MCE per model.
df_mce.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,0.313617,0.269047,0.100231
wide/probs_resnet_wide32_c10,0.367806,0.0834937,0.251238
densenet/probs_densenet40_c10,0.35494,0.265471,0.096582
cifar/probs_resnet110_c10clip,0.329073,0.0982814,0.132728
wide/probs_resnet_wide32_c10clip,0.315528,0.186214,0.242671
densenet/probs_densenet40_c10clip,0.294821,0.115042,0.0553514


## Loss

In [None]:
# axis=1 applies highlight_min row by row, marking the lowest loss per model.
df_loss.style.apply(highlight_min, axis = 1)