# Reliability diagrams

In [None]:
import os, sys
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from collections import OrderedDict
from reliability_diagrams import *

## Load the data

Each model's results are stored in its own CSV file with three columns: `true_label`, `pred_label`, and `confidence`. For a multi-class model, the predicted label and the confidence are those of the highest-scoring class.

In [None]:
# Root directory of the results; each entry inside it is treated as a dataset
# directory whose .csv files are collected below.
csv_dir = "./results"

In [None]:
# Collect the entries of the results directory in sorted order; they are used
# below as dataset names. The bare name at the end displays the list.
datasets = os.listdir(csv_dir)
datasets.sort()
datasets

In [None]:
# Map each dataset directory to the sorted list of CSV filenames it contains.
files = {}
for dataset in datasets:
    path = os.path.join(csv_dir, dataset)
    # Skip stray non-directory entries (e.g. .DS_Store); calling os.listdir
    # on a regular file would raise NotADirectoryError.
    if not os.path.isdir(path):
        continue
    filenames = [x for x in sorted(os.listdir(path)) if x.endswith(".csv")]
    files[dataset] = filenames

In [None]:
# Display the dataset -> CSV filenames mapping built above.
files

## Look at the results for a single model

In [None]:
# Pick one model to inspect: the CSV at index 4 of this dataset's sorted file list.
# NOTE(review): several cells below also assign dataset/filename; presumably
# only the selection cell that was run last is used by the later cells.
dataset = "ImageNet_pytorch-image-models"
filename = files[dataset][4]
filename

In [None]:
# Alternative selection: the CSV at index 1 of the "markus93" file list.
dataset = "markus93"
filename = files[dataset][1]
filename

In [None]:
# Alternative selection: the first CSV (index 0) of the "other" file list.
dataset = "other"
filename = files[dataset][0]
filename

In [None]:
# Alternative selection: the CSV at index 3 of the "ImageNet_torchvision" file list.
dataset = "ImageNet_torchvision"
filename = files[dataset][3]
filename

In [None]:
# Load the selected model's results table.
csv_path = os.path.join(csv_dir, dataset, filename)
df = pd.read_csv(csv_path)

# Plot label: the filename without its 4-character ".csv" suffix, followed by
# the dataset name.
plot_name = " ".join([filename[:-4], dataset])

In [None]:
# Preview the first rows of the loaded results table.
df.head()

In [None]:
# Number of rows in the results table; should equal the number of test examples.
df.shape[0]

In [None]:
# Sanity check: top-1 accuracy, i.e. the fraction of rows whose predicted
# label equals the true label.
df["true_label"].eq(df["pred_label"]).sum() / len(df)

In [None]:
# Mean of the confidence column over all rows.
df["confidence"].mean()

In [None]:
# Pull the three result columns out of the DataFrame as arrays.
y_true, y_pred, y_conf = (
    df[column].values
    for column in ("true_label", "pred_label", "confidence")
)

## Make the plot

In [None]:
# Override matplotlib default styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names, so use whichever name this
# installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Base font sizes for text, axis labels, tick labels and legends.
plt.rc("font", size=12)
plt.rc("axes", labelsize=12)
plt.rc("xtick", labelsize=12)
plt.rc("ytick", labelsize=12)
plt.rc("legend", fontsize=12)

# Larger font for axes and figure titles.
plt.rc("axes", titlesize=16)
plt.rc("figure", titlesize=16)

In [None]:
# Put each whitespace-separated word of the plot name on its own title line.
title = "\n".join(plot_name.split())

# Options for the single-model diagram, gathered in one place.
plot_kwargs = dict(
    num_bins=10,
    draw_ece=True,
    draw_bin_importance="alpha",
    draw_averages=True,
    title=title,
    figsize=(6, 6),
    dpi=100,
    return_fig=True,
)
fig = reliability_diagram(y_true, y_pred, y_conf, **plot_kwargs)

In [None]:
# Create the output directory first: savefig does not create missing
# directories and would fail with FileNotFoundError.
figures_dir = "figures"
os.makedirs(figures_dir, exist_ok=True)

# File name: the plot name with whitespace replaced by underscores.
fig.savefig(os.path.join(figures_dir, "_".join(plot_name.split()) + ".png"),
            format="png", dpi=144, bbox_inches="tight", pad_inches=0.2)

In [None]:
# Put each whitespace-separated word of the plot name on its own title line.
title = "\n".join(plot_name.split())

# Same options as the previous diagram, plus hist_upside_down=True.
flipped_plot_kwargs = dict(
    num_bins=10,
    draw_ece=True,
    draw_bin_importance="alpha",
    draw_averages=True,
    title=title,
    figsize=(6, 6),
    dpi=100,
    return_fig=True,
    hist_upside_down=True,
)
fig = reliability_diagram(y_true, y_pred, y_conf, **flipped_plot_kwargs)

## Look at the bins

In [None]:
# Compute the calibration data for the selected model (presumably per-bin
# statistics; see compute_calibration in reliability_diagrams).
# NOTE(review): this uses num_bins=20 while the diagrams above use
# num_bins=10 — confirm the difference is intentional.
bin_data = compute_calibration(y_true, y_pred, y_conf, num_bins=20)

In [None]:
# Display the value returned by compute_calibration.
bin_data

## Plot reliability diagrams for all models

In [None]:
def read_results(csv_dir, dataset, filename):
    """Load one model's results CSV and return its plot name and label arrays.

    Args:
        csv_dir: Root directory that contains the dataset directories.
        dataset: Name of the dataset directory inside ``csv_dir``.
        filename: Name of the results file inside the dataset directory. It
            must have the columns ``true_label``, ``pred_label`` and
            ``confidence``.

    Returns:
        A ``(plot_name, data)`` tuple. ``plot_name`` is the filename without
        its extension, a space, and the dataset name. ``data`` is a dict with
        the keys ``"true_labels"``, ``"pred_labels"`` and ``"confidences"``
        holding the corresponding columns as arrays.
    """
    df = pd.read_csv(os.path.join(csv_dir, dataset, filename))

    # splitext strips the extension whatever its length, instead of assuming
    # a 4-character ".csv" suffix.
    stem = os.path.splitext(filename)[0]
    plot_name = stem + " " + dataset

    data = {
        "true_labels": df.true_label.values,
        "pred_labels": df.pred_label.values,
        "confidences": df.confidence.values,
    }
    return plot_name, data

In [None]:
# Load every CSV of every dataset, keyed by plot name, in the iteration order
# of `files`. A generator expression is used so the loop variables stay local
# and do not overwrite the notebook-level `dataset`, `filename` and
# `plot_name` chosen for the single-model cells.
results = OrderedDict(
    read_results(csv_dir, dataset_name, csv_name)
    for dataset_name, csv_names in files.items()
    for csv_name in csv_names
)

In [None]:
# Draw the diagrams for all entries of `results` and keep the returned figure
# so it can be saved afterwards.
grid_kwargs = dict(
    num_bins=10,
    draw_bin_importance="alpha",
    num_cols=5,
    dpi=100,
    return_fig=True,
)
fig = reliability_diagrams(results, **grid_kwargs)

In [None]:
# Create the output directory first: savefig does not create missing
# directories and would fail with FileNotFoundError.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/all.png", format="png", dpi=144, bbox_inches="tight", pad_inches=0.2)

In [None]:
# TODO: maybe draw a separate graph per subdir?