In [2]:
# Import core libraries.
import pandas as pd 
import numpy as np
import datetime

# Import visualisation libraries.
import matplotlib.pyplot as plt
import seaborn as sns 

# Import preprocessing libraries.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import (train_test_split, GridSearchCV, StratifiedKFold, 
                                     cross_validate, RandomizedSearchCV)

from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer

# Import machine learning libraries.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier 

# Import evaluation libraries.
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix, 
                             classification_report, precision_score, recall_score,
                             f1_score, ConfusionMatrixDisplay)
import shap

# Define global seed.
# Single shared constant for seeding; presumably passed as the random state to
# the splitting / model-fitting steps later in the notebook — confirm at use sites.
SEED: int = 42

In [3]:
# Load the raw trial records from the CSV file into a DataFrame.
trial_data = pd.read_csv('../data/raw/Bells Palsy Clinical Trial.csv')

# Preview the first rows, then report the table's dimensions and column names.
display(trial_data.head())
print('Number of examples:', len(trial_data.index))
print('Number of features:', len(trial_data.columns))
print('Feature names:', trial_data.columns.values)

Unnamed: 0,Patient ID,Sex,Age,Baseline Score on House–Brackmann scale,Time between onset of symptoms and start of treatment,Treatment Group,Received Prednisolone,Received Acyclovir,3-Month Score on House–Brackmann scale,Full Recovery in 3 Months,9-Month Score on House–Brackmann scale,Full Recovery in 9 Months
0,1,Female,77,6,Within 24 hr,Prednisolone–Placebo,Yes,No,2,No,2,No
1,2,Female,61,6,Within 24 hr,Prednisolone–Placebo,Yes,No,1,Yes,1,Yes
2,3,Female,46,4,>24 to ≤48 hr,Prednisolone–Placebo,Yes,No,1,Yes,1,Yes
3,4,Female,46,3,Within 24 hr,Prednisolone–Placebo,Yes,No,1,Yes,1,Yes
4,5,Female,42,3,>24 to ≤48 hr,Prednisolone–Placebo,Yes,No,1,Yes,1,Yes


Number of examples: 494
Number of features: 12
Feature names: ['Patient ID' 'Sex' 'Age' 'Baseline Score on House–Brackmann scale'
 'Time between onset of symptoms and start of treatment' 'Treatment Group'
 'Received Prednisolone' 'Received Acyclovir'
 '3-Month Score on House–Brackmann scale' 'Full Recovery in 3 Months'
 '9-Month Score on House–Brackmann scale' 'Full Recovery in 9 Months']


Full recovery in 9 months will be used as the primary target (1 = fully recovered, 0 = not fully recovered). 

In [8]:
# Convert binary features.
# The Yes/No columns (two treatment indicators and the 9-month target) are
# encoded in place as 1/0. Any value other than the exact string 'Yes'
# (including a missing entry) becomes 0.
#
# The numeric-dtype guard makes this cell safe to re-run: without it, a second
# execution would compare the already-encoded integers to 'Yes' and silently
# reset every value in these columns to 0.
for binary_column in ('Received Prednisolone',
                      'Received Acyclovir',
                      'Full Recovery in 9 Months'):
    if pd.api.types.is_numeric_dtype(trial_data[binary_column]):
        continue  # Already encoded by a previous run of this cell.
    trial_data[binary_column] = (trial_data[binary_column] == 'Yes').astype(int)