In [115]:
from IPython.core.interactiveshell import InteractiveShell
from matplotlib import pyplot 
from numpy import mean
from numpy import std
from sklearn.calibration import CalibratedClassifierCV as CCCV
from sklearn.calibration import calibration_curve
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix as confusion
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score as ppv
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import ClassifierChain
from xgboost import XGBClassifier as xgb
import itertools
import ipywidgets as widgets
import joblib
import numpy as np
import pandas as pd
import pip
import shap
InteractiveShell.ast_node_interactivity = "all"

In [3]:
Xi=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\Xi.csv')
Xi_hold=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\Xi_hold.csv')
y=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y.csv')
y_hold=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold.csv')
y_hold_nssi=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_nssi.csv')
y_hold_si=pd.read_csv(r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_si.csv')

In [4]:
#Dropping the sitbs column as we are only interested in predicting each component outcome with the chained classifier
y=y.drop(columns='sitbs')

In [5]:
RF=RandomForestClassifier(n_estimators=250, min_samples_split=50, max_features=200, random_state=26)
baseclf=CCCV(RF, method='isotonic')
chain=ClassifierChain(baseclf, order=None, random_state=26)

In [6]:
chain.fit(Xi, y)

In [32]:
#Creating function that predicts probabilities using the calibrated model
def predict(model, data):
    proba=model.predict_proba(data)
    print('Probs: %.3f (%.3f)' % (mean(proba), std(proba)) )

    return proba

In [9]:
testprob=predict(chain, Xi_hold)

Probs: 0.080 (0.137)


In [10]:
testprob

array([[0.08751433, 0.0193342 , 0.01164665],
       [0.32529954, 0.0193342 , 0.01029261],
       [0.01494564, 0.00528675, 0.00404546],
       ...,
       [0.02295547, 0.01525256, 0.00621937],
       [0.10473921, 0.02001447, 0.01029261],
       [0.07783935, 0.02881188, 0.01029261]])

In [11]:
nssi=testprob[:, 1]
nssip=np.where(nssi>0.160344, 1, 0)

In [12]:
def eval(y_hold, ypred, yprob):
    f1= f1_score(y_hold, ypred)
    print(f'F1= {f1:f}')
    sens= recall(y_hold, ypred)
    print(f'Sensitivity= {sens:f}')
    tn, fp, fn, tp= confusion(y_hold, ypred).ravel()
    spec=tn/(tn+fp)
    print(f'Specificity= {spec:f}')
    auc= roc_auc_score(y_hold, yprob)
    print(f'AUROC= {auc:f}')

    return ypred, f1, sens, spec, auc


In [13]:
si=testprob[:, 0]
sip=np.where(si>0.175579, 1, 0)

In [14]:
eval(y_hold_si, sip, si)

F1= 0.456140
Sensitivity= 0.677083
Specificity= 0.804107
AUROC= 0.815643


(array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 

In [15]:
eval(y_hold_nssi, nssip, nssi)

F1= 0.400000
Sensitivity= 0.410714
Specificity= 0.946508
AUROC= 0.806357


(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [16]:
joblib.dump(chain, 'cclf2802_RF.sav')

['cclf2802_RF.sav']

In [17]:
att=testprob[:, 2]
attp=np.where(att>0.073374, 1, 0)

In [18]:
y_hold_att=y_hold['att']

In [19]:
eval(y_hold_att, attp, att)

F1= 0.325000
Sensitivity= 0.406250
Specificity= 0.949785
AUROC= 0.797391


(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [22]:
sre = ['si', 'nssi', 'att']
# Saving the base estimators in the chain
for idx, s in enumerate(sre):
    globals()[f"{s}_est"] = chain.estimators_[idx]

Getting Shapley Values

In [None]:
#Creating function to feed into Kernel Explainer
def predict(data):
    return chain.predict_proba(data)
  

In [38]:
explainer=shap.KernelExplainer(model=chain.predict_proba, data=Xi.head(5))

In [51]:
shap_values=explainer.shap_values(X=Xi_hold.head(5))

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [02:14<00:00, 26.86s/it]


In [53]:
#Note that shap.initjs() needs to be run to produce the visualisations- otherwise there will be an error that says Javascript library not omitted
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0], Xi_hold.head(5))

Using XGBoost as base classifier

In [81]:
xgboost=xgb(sub_sample=0.6, n_estimators=500, max_depth=6, eta=0.001, col_sample_bytree=1.0, random_state=26)
baseclf2=CCCV(xgboost, method='isotonic')
chain=ClassifierChain(baseclf2, order=None, random_state=26)

In [82]:
chain.fit(Xi, y)

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.

Parameters: { "col_sample_bytree", "sub_sample" } are not used.



In [83]:
testprob=predict(Xi_hold)

Probs: 0.068 (0.101)


In [84]:
testprob

array([[0.05931608, 0.02533764, 0.00709985],
       [0.44267897, 0.04694778, 0.00709985],
       [0.04463123, 0.01637963, 0.00709985],
       ...,
       [0.06880919, 0.02533764, 0.00709985],
       [0.06441446, 0.01830885, 0.00780464],
       [0.12279579, 0.01830885, 0.01203115]])

In [88]:
nssi=testprob[:, 1]
nssip=np.where(nssi>0.172887, 1, 0)

In [90]:
eval(y_hold_nssi, nssip, nssi)

F1= 0.265060
Sensitivity= 0.196429
Specificity= 0.976226
AUROC= 0.740395


(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [91]:
att=testprob[:, 2]
attp=np.where(att>0.087420, 1, 0)

In [93]:
eval(y_hold_att, attp, att)

F1= 0.108108
Sensitivity= 0.125000
Specificity= 0.945481
AUROC= 0.700457


(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 