# Calibration

Evaluating calibration methods on convolutional neural networks.

In [1]:
import numpy as np
import pandas as pd
from os.path import join
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from cal_methodspy3 import TemperatureScaling, evaluate, cal_results

Using TensorFlow backend.


## Calibration approaches

#### 1-vs-K calibration
Histogram binning, isotonic regression and beta calibration are calibrated in 1-vs-K fashion. This means K different models are trained for K classes, so 1 model for each class
#### Multiclass calibration
Temperature scaling calibrates one model for all the classes together.

## Calibration of Predictions.

Paths to files with logits.

In [4]:
PATH = '/Users/wildflowerlyi/Desktop/Github/NN_calibration/scripts'
files = ('resnet_cifar/probs_resnet110_c10_logits.p', 'resnet_cifar/probs_resnet110_c100_logits.p'
        , 'resnet_wide/probs_resnet_wide32_c10_logits.p','resnet_wide/probs_resnet_wide32_c100_logits.p'
        , 'resnet_densenet/probs_densenet40_c10_logits.p','resnet_densenet/probs_densenet40_c100_logits.p'
        )

### Isotonic Regression

In [5]:
df_iso = cal_results(IsotonicRegression, PATH, files, {'y_min':0, 'y_max':1}, approach = "single")

cifar/probs_resnet110_c10
Accuracy: 93.39
Error: 6.61
ECE: 0.0482704070032
MCE: 0.379260828194
Loss: 0.382928193973


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### Temperature scaling

In [4]:
df_temp_scale = cal_results(TemperatureScaling, PATH, files, approach = "all")

cifar/probs_resnet110_c10
('Accuracy:', 93.589999999999989)
('Error:', 6.4100000000000108)
('ECE:', 0.046430649885535205)
('MCE:', 0.32451272834607259)
('Loss:', 0.33779848147198643)
('Time taken:', 2.539167881011963, '\n')
cifar/probs_resnet110_c100
('Accuracy:', 70.899999999999991)
('Error:', 29.100000000000009)
('ECE:', 0.18215797445029019)
('MCE:', 0.40451414786555151)
('Loss:', 1.6784112737965144)
('Time taken:', 3.1326470375061035, '\n')
wide/probs_resnet_wide32_c10
('Accuracy:', 93.799999999999997)
('Error:', 6.2000000000000028)
('ECE:', 0.047311796060204525)
('MCE:', 0.3678059713045756)
('Loss:', 0.37100536326453859)
('Time taken:', 2.916138172149658, '\n')
wide/probs_resnet_wide32_c100
('Accuracy:', 73.740000000000009)
('Error:', 26.259999999999991)
('ECE:', 0.18916032450944181)
('MCE:', 0.44225481720476578)
('Loss:', 1.802476874117287)
('Time taken:', 3.423107862472534, '\n')
densenet/probs_densenet40_c10
('Accuracy:', 92.900000000000006)
('Error:', 7.0999999999999943)
('ECE:

#### Calibrated scores for CIFAR datasets.

In [5]:
df_iso

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10,6.41,0.046431,0.324513,0.337798
1,cifar/probs_resnet110_c10_calib,6.35,0.011908,0.133175,0.272972
2,cifar/probs_resnet110_c100,29.1,0.182158,0.404514,1.678411
3,cifar/probs_resnet110_c100_calib,29.81,0.063051,0.135451,1.9264
4,wide/probs_resnet_wide32_c10,6.2,0.047312,0.367806,0.371005
5,wide/probs_resnet_wide32_c10_calib,6.05,0.010466,0.083494,0.217106
6,wide/probs_resnet_wide32_c100,26.26,0.18916,0.442255,1.802477
7,wide/probs_resnet_wide32_c100_calib,26.25,0.057913,0.149843,1.526906
8,densenet/probs_densenet40_c10,7.1,0.051972,0.35494,0.41025
9,densenet/probs_densenet40_c10_calib,7.1,0.012183,0.265471,0.281433


In [6]:
df_temp_scale

Unnamed: 0,Name,Error,ECE,MCE,Loss
0,cifar/probs_resnet110_c10,6.41,0.046431,0.324513,0.337798
1,cifar/probs_resnet110_c10_calib,6.41,0.006597,0.190398,0.207832
2,cifar/probs_resnet110_c100,29.1,0.182158,0.404514,1.678411
3,cifar/probs_resnet110_c100_calib,29.1,0.016323,0.05549,1.108074
4,wide/probs_resnet_wide32_c10,6.2,0.047312,0.367806,0.371005
5,wide/probs_resnet_wide32_c10_calib,6.2,0.008058,0.251238,0.186995
6,wide/probs_resnet_wide32_c100,26.26,0.18916,0.442255,1.802477
7,wide/probs_resnet_wide32_c100_calib,26.26,0.014632,0.054702,0.941676
8,densenet/probs_densenet40_c10,7.1,0.051972,0.35494,0.41025
9,densenet/probs_densenet40_c10_calib,7.1,0.009334,0.096582,0.216684


## Dataframe with results 

In [7]:
dfs = [df_iso, df_temp_scale]
names = ["Name", "Uncalibrated", "Isotonic Regression", "Temperature Scaling"]


def get_dataframe(dfs, column, names):

    df_res = pd.DataFrame(columns=names)

    for i in range(1, len(df_iso), 2):

        name = dfs[0].iloc[i-1]["Name"] # Get name of method
        uncalibrated = dfs[0].iloc[i-1][column]  # Get uncalibrated score

        row = [name, uncalibrated]  # Add scores to row

        for df in dfs:
            row.append(df.iloc[i][column])

        df_res.loc[(i-1)//2] = row
    
    df_res.set_index('Name', inplace = True)
        
    return df_res

In [8]:
df_error = get_dataframe(dfs, "Error", names)
df_ece = get_dataframe(dfs, "ECE", names)
df_mce = get_dataframe(dfs, "MCE", names)
df_loss = get_dataframe(dfs, "Loss", names)

## Scores

In [9]:
def highlight_min(s):
    '''
    highlight the min in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

## Error Rate

In [10]:
df_error.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,6.41,6.35,6.41
cifar/probs_resnet110_c100,29.1,29.81,29.1
wide/probs_resnet_wide32_c10,6.2,6.05,6.2
wide/probs_resnet_wide32_c100,26.26,26.25,26.26
densenet/probs_densenet40_c10,7.1,7.1,7.1
densenet/probs_densenet40_c100,30.08,30.67,30.08


## ECE

In [11]:
df_ece.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,0.0464306,0.0119079,0.00659679
cifar/probs_resnet110_c100,0.182158,0.0630511,0.0163234
wide/probs_resnet_wide32_c10,0.0473118,0.0104662,0.00805839
wide/probs_resnet_wide32_c100,0.18916,0.057913,0.014632
densenet/probs_densenet40_c10,0.0519722,0.0121831,0.00933419
densenet/probs_densenet40_c100,0.213435,0.0636392,0.00995869


## MCE

In [12]:
df_mce.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,0.324513,0.133175,0.190398
cifar/probs_resnet110_c100,0.404514,0.135451,0.0554897
wide/probs_resnet_wide32_c10,0.367806,0.0834937,0.251238
wide/probs_resnet_wide32_c100,0.442255,0.149843,0.0547023
densenet/probs_densenet40_c10,0.35494,0.265471,0.096582
densenet/probs_densenet40_c100,0.447594,0.134847,0.0583265


## Loss

In [13]:
df_loss.style.apply(highlight_min, axis = 1)

Unnamed: 0_level_0,Uncalibrated,Isotonic Regression,Temperature Scaling
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cifar/probs_resnet110_c10,0.337798,0.272972,0.207832
cifar/probs_resnet110_c100,1.67841,1.9264,1.10807
wide/probs_resnet_wide32_c10,0.371005,0.217106,0.186995
wide/probs_resnet_wide32_c100,1.80248,1.52691,0.941676
densenet/probs_densenet40_c10,0.41025,0.281433,0.216684
densenet/probs_densenet40_c100,2.04647,1.76232,1.06756
