# Conformal Prediction for Classification

### Loading Libraries

In [19]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# SciPy
from scipy.stats import rankdata

# OpenML (dataset repository client)
import openml

#Scikit-Learn

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, confusion_matrix

# Boosting Libraries
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Conformal Libraries
from nonconformist.nc import NcFactory
from nonconformist.cp import TcpClassifier
from nonconformist.base import ClassifierAdapter
from nonconformist.nc import InverseProbabilityErrFunc, MarginErrFunc

### Loading Datasets

In [2]:
# Fetch the OpenML dataset catalogue as a DataFrame and show the first ten rows
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
print(datasets_df.head(n=10))

# Re-key the catalogue by dataset id so entries can be looked up with `.loc[did]`
datasets_df = datasets_df.set_index('did')

    did             name  version uploader  status format  MajorityClassSize  \
2     2           anneal        1        1  active   ARFF              684.0   
3     3         kr-vs-kp        1        1  active   ARFF             1669.0   
4     4            labor        1        1  active   ARFF               37.0   
5     5       arrhythmia        1        1  active   ARFF              245.0   
6     6           letter        1        1  active   ARFF              813.0   
7     7        audiology        1        1  active   ARFF               57.0   
8     8  liver-disorders        1        1  active   ARFF                NaN   
9     9            autos        1        1  active   ARFF               67.0   
10   10            lymph        1        1  active   ARFF               81.0   
11   11    balance-scale        1        1  active   ARFF              288.0   

    MaxNominalAttDistinctValues  MinorityClassSize  NumberOfClasses  \
2                           7.0                8

In [3]:
# Preview the catalogue, which is indexed by dataset id (`did`) at this point
datasets_df.head()

Unnamed: 0_level_0,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
did,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0


In [4]:
# Restrict the catalogue to datasets with exactly two classes (binary classification)
datasets_df.query('NumberOfClasses == 2')

Unnamed: 0_level_0,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
did,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
13,breast-cancer,1,1,active,ARFF,201.0,11.0,85.0,2.0,10.0,286.0,9.0,9.0,0.0,10.0
15,breast-w,1,1,active,ARFF,458.0,2.0,241.0,2.0,10.0,699.0,16.0,16.0,9.0,1.0
24,mushroom,1,1,active,ARFF,4208.0,12.0,3916.0,2.0,23.0,8124.0,2480.0,2480.0,0.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46359,Fraud-Detection-Updated,1,44101,active,arff,3739.0,,417.0,2.0,29.0,4156.0,4156.0,25715.0,28.0,1.0
46362,Customer_Churn_Classification,1,44101,active,arff,138071.0,,36957.0,2.0,25.0,175028.0,0.0,0.0,24.0,1.0
46363,Customer_Churn_Classification,2,44101,active,arff,138071.0,,36957.0,2.0,25.0,175028.0,0.0,0.0,24.0,1.0
46364,Customer_Churn_Classification,3,44101,active,arff,138071.0,,36957.0,2.0,25.0,175028.0,0.0,0.0,24.0,1.0


#### Selected Dataset Information

In [5]:
# Catalogue entry for dataset id 1461, the dataset used in the rest of the notebook
datasets_df.loc[1461]

name                                  bank-marketing
version                                            1
uploader                                          64
status                                        active
format                                          ARFF
MajorityClassSize                            39922.0
MaxNominalAttDistinctValues                     12.0
MinorityClassSize                             5289.0
NumberOfClasses                                  2.0
NumberOfFeatures                                17.0
NumberOfInstances                            45211.0
NumberOfInstancesWithMissingValues               0.0
NumberOfMissingValues                            0.0
NumberOfNumericFeatures                          7.0
NumberOfSymbolicFeatures                        10.0
Name: 1461, dtype: object

#### Bank Marketing Dataset: `https://www.openml.org/d/1461`

In [6]:
# Fetch the dataset object for OpenML id 1461
dataset = openml.datasets.get_dataset(1461)

# Summarise it: name and target attribute, download URL, and the first
# 500 characters of the description (one item per output line)
header = (
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'"
)
print(header, f"URL: {dataset.url}", dataset.description[:500], sep="\n")

This is dataset 'bank-marketing', the target feature is 'Class'
URL: https://api.openml.org/data/v1/download/1586218/bank-marketing.arff
**Author**: Paulo Cortez, Sérgio Moro
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/bank+marketing)
**Please cite**: S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.       

**Bank Marketing**  
The data is related with direct marketing campaigns of 


#### `openml API`

In [7]:
# Pull the data out of the dataset object; the default target attribute is
# returned separately from the features as `y`
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format="dataframe",
)

# Re-assemble a single frame: the feature columns listed in `attribute_names`,
# with the target appended as a "class" column
df = pd.DataFrame(X, columns=attribute_names).assign(**{"class": y})

In [8]:
# Flags returned by `get_data`; a True entry is later used to cast the matching column to `category`
categorical_indicator

[False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True]

In [9]:
# Preview the assembled frame: feature columns plus the appended "class" target
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,class
0,58,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261.0,1,-1.0,0.0,unknown,1
1,44,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151.0,1,-1.0,0.0,unknown,1
2,33,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76.0,1,-1.0,0.0,unknown,1
3,47,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92.0,1,-1.0,0.0,unknown,1
4,33,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198.0,1,-1.0,0.0,unknown,1


In [10]:
# Cast every column flagged in `categorical_indicator` to the pandas `category` dtype.
# `zip` stops at the shorter input: there are 16 flags but `df` has 17 columns,
# so the trailing "class" column is not visited by this loop.
for cat_ind, col in zip(categorical_indicator, df.columns):
  print(cat_ind, col)
  # Rely on truthiness rather than comparing against True with `==`
  if cat_ind:
    df[col] = df[col].astype('category')

False V1
True V2
True V3
True V4
True V5
False V6
True V7
True V8
True V9
False V10
True V11
False V12
False V13
False V14
False V15
True V16


In [11]:
# General information: per-column dtype, non-null count and total memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   V1      45211 non-null  uint8   
 1   V2      45211 non-null  category
 2   V3      45211 non-null  category
 3   V4      45211 non-null  category
 4   V5      45211 non-null  category
 5   V6      45211 non-null  float64 
 6   V7      45211 non-null  category
 7   V8      45211 non-null  category
 8   V9      45211 non-null  category
 9   V10     45211 non-null  uint8   
 10  V11     45211 non-null  category
 11  V12     45211 non-null  float64 
 12  V13     45211 non-null  uint8   
 13  V14     45211 non-null  float64 
 14  V15     45211 non-null  float64 
 15  V16     45211 non-null  category
 16  class   45211 non-null  category
dtypes: category(10), float64(4), uint8(3)
memory usage: 1.9 MB


In [14]:
# Class distribution: number of rows per target label
df['class'].value_counts()

class
1    39922
2     5289
Name: count, dtype: int64

In [22]:
# `class` is stored with the `category` dtype, so the mean is taken over its
# integer category codes rather than the labels themselves (presumably why a
# direct `.mean()` was abandoned here — confirm).
# With two categories coded 0 and 1, this is the share of rows in the second category.
df['class'].cat.codes.mean()

0.11698480458295547

In [23]:
# Names of all columns currently stored with the `category` dtype
cat_columns = df.select_dtypes(include="category").columns
cat_columns

Index(['V2', 'V3', 'V4', 'V5', 'V7', 'V8', 'V9', 'V11', 'V16', 'class'], dtype='object')

#### Encode Categorical Columns Using Label Encoder

In [24]:
# Integer-encode each categorical column with its own LabelEncoder.
# Refitting one shared encoder on every column would keep only the mapping of
# the last column; storing one fitted encoder per column preserves each
# label <-> code mapping so it can be inspected or inverted later.
label_encoders = {}
le = LabelEncoder()  # remains unfitted only when there is nothing to encode

for col in cat_columns:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col])
  label_encoders[col] = le

# As before, `le` now refers to the encoder fitted on the last column of `cat_columns`.

In [25]:
# Re-inspect dtypes after encoding: the former category columns should now be integers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      45211 non-null  uint8  
 1   V2      45211 non-null  int64  
 2   V3      45211 non-null  int64  
 3   V4      45211 non-null  int64  
 4   V5      45211 non-null  int64  
 5   V6      45211 non-null  float64
 6   V7      45211 non-null  int64  
 7   V8      45211 non-null  int64  
 8   V9      45211 non-null  int64  
 9   V10     45211 non-null  uint8  
 10  V11     45211 non-null  int64  
 11  V12     45211 non-null  float64
 12  V13     45211 non-null  uint8  
 13  V14     45211 non-null  float64
 14  V15     45211 non-null  float64
 15  V16     45211 non-null  int64  
 16  class   45211 non-null  int64  
dtypes: float64(4), int64(10), uint8(3)
memory usage: 5.0 MB


### Splitting Procedure: Training & Test Sets

In [26]:
# Target: the encoded `class` column
y = df['class']

# Features: every remaining column
X = df.drop(columns='class')

In [27]:
# Preview the feature matrix (the "class" column has been dropped)
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,58,4,1,2,0,2143.0,1,0,2,5,8,261.0,1,-1.0,0.0,3
1,44,9,2,1,0,29.0,1,0,2,5,8,151.0,1,-1.0,0.0,3
2,33,2,1,1,0,2.0,1,1,2,5,8,76.0,1,-1.0,0.0,3
3,47,1,1,3,0,1506.0,1,0,2,5,8,92.0,1,-1.0,0.0,3
4,33,11,2,3,0,1.0,0,0,2,5,8,198.0,1,-1.0,0.0,3


In [28]:
# Preview the target vector
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

#### Training, Calibration & Test Set

In [32]:
# Two-stage split: hold out the test set first, then carve the calibration set
# out of the remainder. Both stages hold out 1000 rows, stratify on the labels
# and use the same seed.
split_options = {'test_size': 1000, 'random_state': 42}

X_train_calib, X_test, y_train_calib, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

X_train, X_calib, y_train, y_calib = train_test_split(
    X_train_calib, y_train_calib, stratify=y_train_calib, **split_options
)

In [33]:
# Report how many rows ended up in each split
for template, labels in (
    ('Training set size {}', y_train),
    ('Calibration set size {}', y_calib),
    ('Test set size {}', y_test),
):
  print(template.format(len(labels)))

Training set size 43211
Calibration set size 1000
Test set size 1000
