## Multiclass Venn-ABERS calibration  - Satellite Image dataset

This notebook illustrates the multi-class Venn-ABERS calibration as implemented in:

Manokhin, Valery. "Multi-class probabilistic classification using inductive and cross Venn–Abers predictors." In Conformal and Probabilistic Prediction and Applications, pp. 228-240. PMLR, 2017.

As an example we use the Satellite Image dataset described in more detail here: https://www.openml.org/search?type=data&sort=runs&id=182&status=active

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.metrics import log_loss, brier_score_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.calibration import CalibratedClassifierCV

from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

from venn_abers import VennAbersCalibrator

## Load dataset

In [2]:
data = arff.loadarff('dataset_186_satimage.arff')
df = pd.DataFrame(data[0])
df['class'] = ([int(str(i)[2]) for i in df['class'].values])
df['class'] = df['class'].astype('int')-1

In [3]:
df.head()

Unnamed: 0,Aattr,Battr,Cattr,Dattr,Eattr,Fattr,A1attr,B2attr,C3attr,D4attr,...,D22attr,E23attr,F24attr,A25attr,B26attr,C27attr,D28attr,E29attr,F30attr,class
0,0.117596,1.241362,1.184036,0.815302,-0.158561,1.256483,1.193546,0.818486,-0.141965,0.879481,...,0.807707,-0.069968,1.21916,1.250463,0.597678,-0.054291,1.233342,1.262255,0.603258,0
1,-1.205362,-1.249654,-0.077532,0.444886,-0.895959,-0.447579,-0.78676,-0.554203,-0.364672,0.092157,...,-0.192752,-0.736996,-0.969292,-0.844805,-0.40003,-0.725852,-0.344432,-0.594534,-0.183967,4
2,0.779075,0.148811,0.042617,-0.24303,0.800057,0.164136,0.05337,-0.448612,0.154978,-0.345245,...,-0.877277,0.671174,-0.006373,-0.425752,-0.662584,0.691889,0.356801,-0.175259,-0.236449,6
3,1.146564,0.585831,0.342991,0.021553,0.947536,0.601074,0.353416,0.02655,1.788164,1.010702,...,0.28115,1.412317,1.044084,0.532085,0.282612,1.438068,1.058033,0.842981,0.130923,2
4,-0.764376,-1.16225,-0.137607,0.180303,-0.969698,-1.146681,-0.126658,0.184937,-0.735851,-1.132569,...,-0.192752,-0.885225,-1.231906,-0.784941,-0.347519,-0.875088,-1.220973,-0.774223,-0.551339,4


In [4]:
df['class'].value_counts()

class
0    1531
6    1508
2    1356
4     707
1     703
3     625
Name: count, dtype: int64

There are five classed, with labels 0-6 (class label 5 is missing)

We split the data into a proper training set, calibration set and test set

In [5]:
df_train_cal, df_test = train_test_split(df, test_size = 2000, random_state = 42, shuffle = False)
df_proper_train, df_cal = train_test_split(df_train_cal, test_size = 2000, random_state = 42, shuffle = False)

In [6]:
X_train = df_train_cal.drop('class', axis=1)
y_train = df_train_cal['class']

X_proper_train = df_proper_train.drop('class', axis=1)
y_proper_train = df_proper_train['class']

X_cal = df_cal.drop('class', axis=1)
y_cal = df_cal['class']

X_test = df_test.drop('class', axis=1)
y_test = df_test['class']

In [7]:
lb = LabelBinarizer()
y_test_binary = lb.fit_transform(y_test)

In [8]:
def brier_loss_calc(y_true, prob):
    return ((y_true - prob)**2).mean()

## Classifier comparison

We compare 5 underlying classifiers and two calibration methods (sigmoid and isotonic), both in their inductive and cross-validated format

In [9]:
clfs = {}
clfs['Naive Bayes'] = GaussianNB()
clfs['SVM'] = SVC(probability=True)
clfs['RF'] = RandomForestClassifier()
clfs['XGB'] = AdaBoostClassifier()
clfs['Logistic'] = LogisticRegression(max_iter=10000)
clfs['Neural Network'] =  MLPClassifier(max_iter=10000)


def run_multiclass_comparison(clf_name, clf):

    print(clf_name + ':')
    log_loss_list = []
    brier_loss_list = []
    acc_list = []
    ece_list = []
    
    print('base')
    clf.fit(X_train, y_train)
    p_pred = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred))
    
    print('sigmoid')
    clf.fit(X_proper_train, y_proper_train)
    cal_sigm = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
    cal_sigm.fit(X_cal, y_cal)
    p_pred = cal_sigm.predict_proba(X_test)
    y_pred = cal_sigm.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred))
    
    print('isotonic')
    cal_iso = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
    cal_iso.fit(X_cal, y_cal)
    p_pred = cal_iso.predict_proba(X_test)
    y_pred = cal_iso.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred))
    
    print('sigmoid_cv')
    cal_sigm_cv = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    cal_sigm_cv.fit(X_train, y_train)
    p_pred = cal_sigm_cv.predict_proba(X_test)
    y_pred = cal_sigm_cv.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred))
    
    print('isotonic_cv')
    cal_iso_cv = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    cal_iso_cv.fit(X_train, y_train)
    p_pred = cal_iso_cv.predict_proba(X_test)
    y_pred = cal_iso_cv.predict(X_test)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred))
    
    print('ivap')
    va = VennAbersCalibrator(clf, inductive=True, cal_size=0.2, random_state=42)
    va.fit(np.asarray(X_train), np.asarray(y_train))
    p_pred_va = va.predict_proba(np.array(X_test))
    y_pred = va.predict(np.array(X_test), one_hot=False)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred_va))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred_va))
    
    print('cvap')
    va_cv = VennAbersCalibrator(clf, inductive=False, n_splits=5)
    va_cv.fit(np.asarray(X_train), np.asarray(y_train))
    p_pred_cv = va_cv.predict_proba(np.asarray(X_test))
    y_pred = va_cv.predict(np.array(X_test), one_hot=False)
    acc_list.append(accuracy_score(y_test, y_pred))
    log_loss_list.append(log_loss(y_test, p_pred_cv))
    brier_loss_list.append(brier_loss_calc(y_test_binary, p_pred_cv))

    print('')
    
    df_ll = pd.DataFrame(columns=['Classifier', 'Uncalibrated', 'Platt', 'Isotonic', 'Platt-CV', 'Isotonic-CV', 'IVAP', 'CVAP'])
    df_ll.loc[0] =  [clf_name] + log_loss_list
    df_bl = pd.DataFrame(columns=['Classifier', 'Uncalibrated', 'Platt', 'Isotonic', 'Platt-CV', 'Isotonic-CV', 'IVAP', 'CVAP'])
    df_bl.loc[0] =  [clf_name] + brier_loss_list
    df_acc = pd.DataFrame(columns=['Classifier', 'Uncalibrated', 'Platt', 'Isotonic', 'Platt-CV', 'Isotonic-CV', 'IVAP', 'CVAP'])
    df_acc.loc[0] =  [clf_name] + acc_list

    return df_bl, df_ll, df_acc

We compare accuracy, Brier and log loss

In [10]:
results_brier = pd.DataFrame()
results_log = pd.DataFrame()
results_acc = pd.DataFrame()

for clf_name in clfs:
    scratch_b, scratch_l, scratch_acc = run_multiclass_comparison(clf_name, clfs[clf_name])
    results_brier = pd.concat((results_brier, scratch_b), ignore_index=True)
    results_log = pd.concat((results_log, scratch_l), ignore_index=True)
    results_acc = pd.concat((results_acc, scratch_acc), ignore_index=True)
    

Naive Bayes:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap

SVM:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap

RF:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap

XGB:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap

Logistic:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap

Neural Network:
base
sigmoid
isotonic
sigmoid_cv
isotonic_cv
ivap
cvap



Calibrating can result in a slight improvement in accuracy

In [11]:
results_acc.set_index('Classifier', inplace=True)
results_acc.round(3)

Unnamed: 0_level_0,Uncalibrated,Platt,Isotonic,Platt-CV,Isotonic-CV,IVAP,CVAP
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Naive Bayes,0.796,0.792,0.804,0.795,0.806,0.81,0.812
SVM,0.894,0.88,0.88,0.892,0.89,0.893,0.892
RF,0.906,0.892,0.898,0.906,0.904,0.904,0.902
XGB,0.75,0.518,0.576,0.74,0.77,0.883,0.884
Logistic,0.862,0.802,0.822,0.79,0.814,0.864,0.866
Neural Network,0.908,0.882,0.889,0.906,0.907,0.892,0.898


It generally improves Brier and log loss metrics

In [12]:
results_brier.set_index('Classifier', inplace=True)
results_brier.round(4)

Unnamed: 0_level_0,Uncalibrated,Platt,Isotonic,Platt-CV,Isotonic-CV,IVAP,CVAP
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Naive Bayes,0.0657,0.0571,0.048,0.0556,0.0469,0.0449,0.0444
SVM,0.0257,0.035,0.0309,0.0304,0.0274,0.0273,0.0271
RF,0.0239,0.025,0.0251,0.0233,0.0231,0.0257,0.0248
XGB,0.0994,0.1025,0.0958,0.0744,0.0613,0.0291,0.0283
Logistic,0.0323,0.0529,0.0481,0.0515,0.0455,0.0317,0.0313
Neural Network,0.0253,0.0318,0.0272,0.0245,0.0226,0.0262,0.0245


In [13]:
results_log.set_index('Classifier', inplace=True)
results_log.round(4)

Unnamed: 0_level_0,Uncalibrated,Platt,Isotonic,Platt-CV,Isotonic-CV,IVAP,CVAP
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Naive Bayes,4.0378,0.7704,0.6101,0.7532,0.596,0.515,0.5117
SVM,0.2803,0.4341,0.4587,0.3678,0.3335,0.3383,0.3346
RF,0.2675,0.2978,0.3951,0.2736,0.2912,0.3069,0.295
XGB,1.176,1.2685,1.2437,0.9034,0.728,0.3522,0.349
Logistic,0.3564,0.6104,0.6166,0.5951,0.5857,0.3739,0.3725
Neural Network,0.3779,0.4228,0.4203,0.3279,0.2454,0.315,0.3021


**Mean accuracy**

In [14]:
results_acc.mean()

Uncalibrated    0.852417
Platt           0.794167
Isotonic        0.811167
Platt-CV        0.838083
Isotonic-CV     0.848250
IVAP            0.874500
CVAP            0.875500
dtype: float64

**Average Brier and log loss ranking**

In [15]:
results_brier.rank(axis=1).mean()

Uncalibrated    4.000000
Platt           6.500000
Isotonic        5.333333
Platt-CV        4.000000
Isotonic-CV     2.666667
IVAP            3.500000
CVAP            2.000000
dtype: float64

In [16]:
results_log.rank(axis=1).mean()

Uncalibrated    3.333333
Platt           6.166667
Isotonic        6.166667
Platt-CV        4.166667
Isotonic-CV     2.666667
IVAP            3.333333
CVAP            2.166667
dtype: float64

**Mean Brier and log loss scores**

In [17]:
results_brier.mean()

Uncalibrated    0.045378
Platt           0.050707
Isotonic        0.045853
Platt-CV        0.043287
Isotonic-CV     0.037782
IVAP            0.030816
CVAP            0.030061
dtype: float64

In [18]:
results_log.mean()

Uncalibrated    1.082658
Platt           0.634009
Isotonic        0.624070
Platt-CV        0.536826
Isotonic-CV     0.463297
IVAP            0.366858
CVAP            0.360809
dtype: float64

Overall, in this example, IVAP and CVAP calibrators achieve highest overall accuracy and lowest Brier and log loss errors

## Multiclass implemetation of pre-fitted classifiers

In standard IVAP and CVAP multi-class calibration, we consider a collection of binary
classification problems and then combine their solutions (when solutions include pairwise
class probabilities) to obtain multi-class probabilities (see e.g. *Manokhin In Conformal and Probabilistic Prediction and Applications, pp. 228-240. PMLR, 2017.*). For some classifiers (e.g. Deep Neural Networks) it may not be practical to modify the problem to generate a set of one-vs-one probability outputs. In this case, we can use the calibration set multi-class probabilities to calibrate the outputs in a similar way, by converting them into equivalent one-vs-one outputs first for all binary class pair combinations in the calibrations set and applying the VennABERS procdure to them. 

In [20]:
# create a synthetic dataset

n_features = 10
rand_seed = 0
n_samples = 10000

X, y = make_classification(
    n_classes=5,
    n_samples=n_samples,
    n_clusters_per_class=2,
    n_features=n_features,
    n_informative=int(n_features/2),
    n_redundant=int(n_features/4),
    random_state=rand_seed)

#  split into a proper training, calibration and test set

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=rand_seed)

X_train_proper, X_cal, y_train_proper, y_cal = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False)

lb = LabelBinarizer()
y_test_one_hot = lb.fit_transform(y_test)

# fit a Random Forest classifier to the proper training set 
# and generate calibration and test set probabilities

clf = RandomForestClassifier(random_state=rand_seed)

clf.fit(X_train_proper, y_train_proper)
p_cal = clf.predict_proba(X_cal)
p_test = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

# fit a VennAbers calibrator using the calibration set and generate calibrated test set 
# probabilities (note this is done using a single command) and corresponding predictions

va = VennAbersCalibrator()

p_prime, p0p1 = va.predict_proba(
    p_cal=p_cal, y_cal=y_cal, p_test=p_test, p0_p1_output=True)

y_prime = va.predict(
    p_cal=p_cal, y_cal=y_cal, p_test=p_test, one_hot=False)

# compare log loss, Brier loss and accuracy 

stat_list = {}
stat_list['RF'] = [
    brier_loss_calc(y_test_one_hot, p_test),
    log_loss(y_test, p_test),
    accuracy_score(y_test, y_pred)]

stat_list['VA'] = [
    brier_loss_calc(y_test_one_hot, p_prime),
    log_loss(y_test, p_prime),
    accuracy_score(y_test, y_prime)]
    
stat_df = pd.DataFrame(stat_list, index = ['Brier loss', 'log loss', 'accuracy'])
stat_df.round(3)

Unnamed: 0,RF,VA
Brier loss,0.064,0.062
log loss,0.73,0.618
accuracy,0.79,0.789


In this example the VennABERS celibrated RF multiclass probabilities yield a lower Brier and log loss metrics with similar accuracy