In [None]:
##Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})

##Libraries for evaluation metrics
from scipy.stats import entropy
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score

##Libraries for ensemble methods considered in the study
from xgboost import XGBClassifier

In [None]:
# path to folder where test data after processing with merge_spectrogram_features_n_train_test_split.ipynb is stored
# NOTE: keep the trailing slash -- file paths in this notebook are built by plain
# string concatenation (data_path + file name)
data_path = 'data/'

## Load data

In [None]:
# load test set from the parquet file stored in the data folder
# (the file name suggests feature selection was already applied upstream -- TODO confirm)
test = pd.read_parquet(data_path + 'test_selected_features.parquet')

In [None]:
# (number of rows, number of columns) of the loaded test set
test.shape

(2975, 2008)

In [None]:
# preview the first rows: id / label / vote columns come first, followed by the feature columns
test.head()

Unnamed: 0,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,total_votes,...,eeg_max_f391_10s,RL_18.36_mean_10m,eeg_max_f496_10s,RL_8.2_mean_10m,eeg_std_f482_10s,eeg_std_f5_10s,eeg_min_f395_10s,RL_19.14_max_10m,RL_16.6_max_10m,LL_11.91_max_10m
0,1722186807,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429,84,...,0.729249,0.1389,0.003328,1.018567,0.091203,0.14823,0.215428,2.14,1.87,1.29
1,2663298457,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429,70,...,0.790422,0.117667,-0.030639,0.952567,0.103465,0.112482,0.281489,1.2,0.97,1.29
2,3886831369,6489,LRDA,0.0,0.0,0.0,0.666667,0.0,0.333333,12,...,0.824828,0.516433,0.023559,1.344267,0.090628,0.084811,0.246982,12.52,12.63,4.17
3,1512279764,44475,GPD,0.0,0.0,0.666667,0.0,0.0,0.333333,18,...,0.881155,0.1307,0.187335,0.822967,0.101298,0.037371,0.587079,0.64,0.94,0.93
4,580941735,44475,GPD,0.0,0.0,0.666667,0.0,0.0,0.333333,21,...,0.869112,0.127833,0.19306,0.8286,0.116789,0.041066,0.561338,0.64,0.94,0.93


In [None]:
# one vote column per class: exactly the columns whose name ends in '_vote',
# kept in the order in which they appear in the data frame
vote_cols = [col for col in test.columns if col.endswith('_vote')]
vote_cols

['seizure_vote',
 'lpd_vote',
 'gpd_vote',
 'lrda_vote',
 'grda_vote',
 'other_vote']

In [None]:
# Feature columns passed to the model: everything after the first 10 columns
# (ids, expert consensus, per-class votes and total votes)
FEATURES = test.columns[10:].tolist()
len(FEATURES)

1998

In [None]:
# class name -> integer label; the order needs to be the same as in vote_cols
TARGETS = {name: idx for idx, name in enumerate(['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other'])}

In [None]:
# per-sample weight derived from the number of votes: total_votes / 3, capped at 1
weights_total_vote = np.array(
    [min(n_votes / 3, 1) for n_votes in test['total_votes'].tolist()]
)
weights_total_vote.shape

(2975,)

In [None]:
# Count the samples whose weight equals 1; for this test set it should be every sample
(weights_total_vote == 1).sum()

2975

## Helper functions

In [None]:
## Function for evaluating the performance of a model via the (weighted) mean KL divergence
def eval_kl_div(y_true, y_pred, sample_weight=None):
    """
    Weighted mean Kullback-Leibler divergence KL(y_true || y_pred).

    Parameters
    ----------
    y_true : 2D numpy array, True probability values (one row per sample)
    y_pred : 2D numpy array, Predicted probability values (one row per sample)
    sample_weight: numpy vector with one weight per sample, for weighted average
        KL divergence. Optional; None (the default) weights all samples equally.

    Output
    ------
    Mean (averaged over all samples) KL divergence

    Notes
    -----
    scipy.stats.entropy normalises every row of y_true and y_pred to sum to 1
    before computing the divergence. A zero in y_pred where y_true is positive
    yields inf, so callers should clip y_pred away from zero beforehand.
    """

    # Compute KL divergence for all the samples (axis=1 -> one value per row)
    kl_div_all = entropy(y_true, y_pred, axis=1)

    # np.average falls back to the plain mean when weights is None
    return np.average(kl_div_all, weights=sample_weight)

## Performance on test set

In [None]:
# load the pre-trained xgboost model
# an empty classifier is created first; load_model() then populates it from the JSON file in the data folder
clf = XGBClassifier()
clf.load_model(data_path + 'xgb_model.json')

In [None]:
## feature matrix and true vote distributions (one column per entry of vote_cols)
X_test = test[FEATURES].to_numpy()
y_test_prob = test[vote_cols].to_numpy()

##class probabilities predicted for the test set
y_pred_prob = clf.predict_proba(X_test)
##floor the predicted probabilities at 1e-15 so that log(0) can never produce inf
y_pred_prob = np.clip(y_pred_prob, 1e-15, None)

##KL divergence for test set
eval_kl_div(y_test_prob, y_pred_prob, weights_total_vote)

0.7098030163588971

In [None]:
# true labels: integer-encode the expert consensus class names via TARGETS
y_test_label = [TARGETS[c] for c in test.expert_consensus]
# hard class predictions of the model
# NOTE(review): assumes clf.predict returns integer ids in the same encoding as TARGETS -- confirm
y_pred_label = clf.predict(X_test)

Compute performance metrics on the test set

The `average` parameter of precision_score(), recall_score() and f1_score() controls how the per-class scores are combined:

'micro':
Calculate metrics globally by counting the total true positives, false negatives and false positives.

'macro':
Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

'weighted':
Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.

In [None]:
# ---- confusion matrix and headline accuracies ----
print('\n------------------ Confusion Matrix -----------------\n')
print(confusion_matrix(y_test_label, y_pred_label))

print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test_label, y_pred_label)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test_label, y_pred_label)))

# ---- precision / recall / F1 under each averaging scheme ----
for averaging in ('micro', 'macro', 'weighted'):
    for metric_name, metric_fn in (('Precision', precision_score),
                                   ('Recall', recall_score),
                                   ('F1-score', f1_score)):
        score = metric_fn(y_test_label, y_pred_label, average=averaging)
        # a blank line closes the micro and macro groups, but not the final (weighted) one
        group_break = '\n' if metric_name == 'F1-score' and averaging != 'weighted' else ''
        print('{} {}: {:.2f}{}'.format(averaging.capitalize(), metric_name, score, group_break))

# ---- per-class report ----
print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test_label, y_pred_label))
print('---------------------- XGBoost ----------------------')


------------------ Confusion Matrix -----------------

[[433   8  36   3  30 125]
 [ 57 276  13  10  18 152]
 [ 97  16 248   0  26  48]
 [ 32  41   2  41  55  56]
 [ 35   7   2   8 285 105]
 [117  60  26  26  66 415]]

Accuracy: 0.57
Balanced Accuracy: 0.53

Micro Precision: 0.57
Micro Recall: 0.57
Micro F1-score: 0.57

Macro Precision: 0.59
Macro Recall: 0.53
Macro F1-score: 0.54

Weighted Precision: 0.58
Weighted Recall: 0.57
Weighted F1-score: 0.57

--------------- Classification Report ---------------

              precision    recall  f1-score   support

           0       0.56      0.68      0.62       635
           1       0.68      0.52      0.59       526
           2       0.76      0.57      0.65       435
           3       0.47      0.18      0.26       227
           4       0.59      0.64      0.62       442
           5       0.46      0.58      0.52       710

    accuracy                           0.57      2975
   macro avg       0.59      0.53      0.54      2975

In [None]:
# add column and index names to confusion matrix
# `labels` is passed explicitly so the matrix always has one row/column per entry of
# TARGETS, in TARGETS order, even if some class occurs in neither the true nor the
# predicted labels (otherwise the matrix would shrink and no longer match the names).
# The transpose puts the predicted classes on the rows and the true classes on the columns.
conf_df = pd.DataFrame(
    confusion_matrix(y_test_label, y_pred_label, labels=list(TARGETS.values())).T,
    index=list(TARGETS.keys()),
    columns=list(TARGETS.keys()),
)
conf_df

Unnamed: 0,Seizure,LPD,GPD,LRDA,GRDA,Other
Seizure,433,57,97,32,35,117
LPD,8,276,16,41,7,60
GPD,36,13,248,2,2,26
LRDA,3,10,0,41,8,26
GRDA,30,18,26,55,285,66
Other,125,152,48,56,105,415


In [None]:
# same counts as conf_df, but with two-level headers:
# rows are grouped under 'Predicted', columns under 'True'
conf_df_temp = pd.DataFrame(
    data=conf_df.to_numpy(),
    index=pd.MultiIndex.from_product([['Predicted'], conf_df.index]),
    columns=pd.MultiIndex.from_product([['True'], conf_df.columns]),
)

In [None]:
# styling of the confusion matrix for output
s = conf_df_temp.style

# internal CSS classes: light green background for '.true' cells, light red for '.false' cells
s.set_table_styles([
    {'selector': '.true', 'props': 'background-color: #e6ffe6;'},
    {'selector': '.false', 'props': 'background-color: #ffe6e6;'},
], overwrite=False)

# CSS class of every cell: 'true ' on the diagonal (predicted class == true class),
# 'false ' everywhere else. The mask is derived from the size of the matrix instead of
# a hand-written 6x6 table, so it stays in sync with the number of classes.
cell_color = pd.DataFrame(
    np.where(np.eye(len(conf_df_temp), dtype=bool), 'true ', 'false '),
    index=conf_df_temp.index,
    columns=conf_df_temp.columns,
)

s.set_td_classes(cell_color)
# common look for all cells; as the last expression, the styled table is also displayed
s.set_properties(**{'border': '1.3px solid black',
                    'color': 'black',
                    'text-align': 'center',
                    'font-size': '12pt'})

Unnamed: 0_level_0,Unnamed: 1_level_0,True,True,True,True,True,True
Unnamed: 0_level_1,Unnamed: 1_level_1,Seizure,LPD,GPD,LRDA,GRDA,Other
Predicted,Seizure,433,57,97,32,35,117
Predicted,LPD,8,276,16,41,7,60
Predicted,GPD,36,13,248,2,2,26
Predicted,LRDA,3,10,0,41,8,26
Predicted,GRDA,30,18,26,55,285,66
Predicted,Other,125,152,48,56,105,415


In [None]:
# save the styled confusion matrix as an HTML file in the data folder
s.to_html(data_path + 'confusion_matrix.html')

In [None]:
# precision and recall for each of the class
# `labels` is passed explicitly so that exactly one score per entry of TARGETS is
# returned, in TARGETS order; without it a class missing from both the true and the
# predicted labels would be dropped and the scores would no longer match the index.
prec_recall_df = pd.DataFrame(
    {"Precision": np.round(precision_score(y_test_label, y_pred_label,
                                           labels=list(TARGETS.values()), average=None), 2),
     "Recall": np.round(recall_score(y_test_label, y_pred_label,
                                     labels=list(TARGETS.values()), average=None), 2)},
    index=list(TARGETS.keys()))

In [None]:
# display the per-class precision / recall table
prec_recall_df

Unnamed: 0,Precision,Recall
Seizure,0.56,0.68
LPD,0.68,0.52
GPD,0.76,0.57
LRDA,0.47,0.18
GRDA,0.59,0.64
Other,0.46,0.58
