In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from statistics import mean, median
import os
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
import scipy.stats as stats
import gc

### ML packages
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve, auc, f1_score, make_scorer
from confidenceinterval import roc_auc_score, ppv_score, npv_score, tnr_score, tpr_score
import confidenceinterval as cfi
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import cv
import shap
import ml_insights as mli

### In-hospital death

##### Load cohorts and set up file paths

In [None]:
#### Load the per-column dtype specification used to read features with compact types
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Pair every column name ('item') with the dtype string it should be parsed as.
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
# Columns that are kept aside in the lookup frames and removed from the model inputs.
# 'gt_m' is the label used for training in this notebook; 'age_gr' is used for
# stratified splitting during calibration.
lkup_fields = [
    'ppid', 'EpisodeNumber',
    'AdmissionDate', 'ED_adate_dt', 'IndexAttDate', 'HOSP_adt',
    'DischargeDate', 'DateOfDeath', 'HOSP_ddt', 'breq_dt',
    'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'gt_eld', 'gt_eld_d1', 'gt_eld_d2', 'gt_eld_d3', 'gt_rehab',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

In [None]:
# Display labels for model features, used to rename columns before plotting.
# Every key appears exactly once: a dict literal silently keeps only the last value of a
# repeated key, so repeated entries were collapsed to the value that was already in effect.
# NOTE(review): most '*_INDEPENDENT' columns are labelled '... dependence' whereas the bathing
# one reads 'independence' - confirm which wording is intended for the one-hot columns.
feature_names = {
    'trQ_waterlow_score': 'Waterlow score',
    'AgeAtAdmission': 'Age',
    'arrival_mode_B': 'Arrival - NHSL Bus',
    'arrival_mode_E': 'Arrival - Emergency Ambulance',
    'arrival_mode_O': 'Arrival - Other',
    'arrival_mode_PU': 'Arrival - Public Transport',
    'arrival_mode_U': 'Arrival - GP Ambulance',
    'arrival_mode_Unk': 'Arrival - Unknown',
    'arrival_mode_W': 'Arrival - Walked',
    'simd_dec': 'SIMD (most to least deprived)',
    'arrival_mode_PR': 'Arrival - Private Transport',
    'trQ_bwm_urinary_catheterisation': 'Urinary Catheterisation',
    'trQ_bwm_urinary_incontinence': 'Urinary Incontinence',
    'trQ_bwm_dysuria': 'Dysuria',
    'trQ_bwm_>6times_per_day': 'Bowel Movement >6 times per day',
    'trQ_bwm_nocturia_>2_per_night': 'Nocturia >2 per night',
    'trQ_bwm_faeces_incontinence': 'Faeces Incontinence',
    'trQ_bwm_constipation': 'Constipation',
    'trQ_bwm_diarrhoea': 'Diarrhoea',
    'trQ_bwm_blood_in_stools': 'Blood in stools',
    'trQ_bwm_medication': 'Bowel movement medication',
    'trQ_falls_within_6_months': 'Fall within last 6 months',
    'trQ_falls_clinical_risk': 'At clinical risk of falls',
    'trQ_nutr_food_allergies': 'Food allergies',
    'trQ_nutr_swallowing_difficulty': 'Swallowing difficulty',
    'trQ_mrsa_infection_prevention': 'Infection prevention measures',
    'trQ_mrsa_transfer_with_norovirus': 'MRSA Norovirus',
    'trQ_mrsa_resp_or_fever': 'MRSA with Respiratory issues or Fever',
    'trQ_mrsa_rash_fever_or_flu': 'MRSA with Rash, Fever or Flu',
    'trQ_mrsa_infectious_diseases_contact': 'MRSA contact with infection diseases',
    'trQ_rub_nursing_falls_risk_assessment': 'Nursing Falls risk assessment',
    'trQ_rub_at_risk_of_bed_fall': 'At risk of bed fall',
    'trQ_MUST_score': 'MUST Score',
    'trQ_mobility_walking_ASSISTANCE': 'Walking assistance',
    'trQ_mobility_walking_BED_REST': 'Walking (Bed rest)',
    'trQ_mobility_walking_INDEPENDENT': 'Walking dependence',
    'trQ_mobility_toileting_ASSISTANCE': 'Toileting assistance',
    'trQ_mobility_toileting_BED_REST': 'Toileting (Bed rest)',
    'trQ_mobility_toileting_INDEPENDENT': 'Toileting dependence',
    'trQ_mobility_bathing_ASSISTANCE': 'Bathing assistance',
    'trQ_mobility_bathing_BED_REST': 'Bathing (Bed rest)',
    # This key used to be listed twice; the later label below is the one that took effect.
    'trQ_mobility_bathing_INDEPENDENT': 'Mobility (Bathing independence)',
    'trQ_mobility_bed_rolling_ASSISTANCE': 'Rolling in bed assistance',
    'trQ_mobility_bed_rolling_INDEPENDENT': 'Rolling in bed dependence',
    'trQ_mobility_bed_moveup_ASSISTANCE': 'Moving up bed assistance',
    'trQ_mobility_bed_moveup_INDEPENDENT': 'Moving up bed dependence',
    'trQ_mobility_bed_out_ASSISTANCE': 'Moving out of bed assistance',
    'trQ_mobility_bed_out_BED_REST': 'Moving out of bed (Bed rest)',
    'trQ_mobility_bed_out_INDEPENDENT': 'Moving out of bed dependence',
    'trQ_mobility_bed_in_ASSISTANCE': 'Moving in bed assistance',
    'trQ_mobility_bed_in_BED_REST': 'Moving in bed (Bed rest)',
    'trQ_mobility_bed_in_INDEPENDENT': 'Moving in bed dependence',
    'trQ_mobility_sss_ASSISTANCE': 'Sit-stand-sit assistance',
    'trQ_mobility_sss_BED_REST': 'Sit-stand-sit (Bed rest)',
    'trQ_mobility_sss_INDEPENDENT': 'Sit-stand-sit dependence',
    'trQ_mobility_lateral_ASSISTANCE': 'Lateral movement assistance',
    'trQ_mobility_lateral_BED_REST': 'Lateral movement (Bed rest)',
    'trQ_mobility_lateral_INDEPENDENT': 'Lateral movement dependence',
    'trQ_mobility_floorup_ASSISTANCE': 'Floor-up movement assistance',
    'trQ_mobility_floorup_BED_REST': 'Floor-up movement (Bed rest)',
    'trQ_mobility_floorup_INDEPENDENT': 'Floor-up movement dependence',
    'num_inp_attendances_lyr': 'Scheduled inpatient attendances last year',
    'total_longterm_conditions': '# unique long-term conditions',
    'num_outp_att_CB': 'Outpatient visits (Urology)',
    'lactate_v': 'Lactate (mmol/L) - last value',
    'lactate_rm': 'Lactate (mmol/L) - moving average',
    'dsl_outp_att': 'Last outpatient attendance (days)',
    'haemoglobin_nl': 'Haemoglobin - low',
    'dsl_physltc_pulmonary_fibrosis': 'Pulmonary fibrosis (days)',
    'hba1c_(ifcc)_rs': 'HbA1c (IFCC, mmol/mol) - moving std',
    'urea_v': 'Urea (mmol/L) - last value',
    'dsl_antipsychotics': 'Antipsychotics (days since last)',
    'red_cell_count_nl': 'Red Cell Count - low',
    'bilirubin_nh': 'Bilirubin - high',
    'num_outp_att_AR': 'Outpatient visits (Rheumatology)',
    'num_outp_att_F2': 'Outpatient visits (Gynaecology)',
    'hba1c_(ifcc)_v': 'HbA1c (IFCC, mmol/mol) - last value',
    'c-reactive_prot_nh': 'CRP - high',
    'n_presc_anticoagulant_protamine_drugs': 'Anticoagulants and protaime (# prescribed)',
    'num_outp_att_G1': 'Outpatient visits (General Psychiatry)',
    'ggt_v': 'GGT (U/L) - last value',
    'num_outp_att_C11': 'Outpatient visits (General Surgery)',
    'dsl_antidementia_drugs': 'Antidementia drugs (days since last)',
    'dsl_anti_hypertension_hf_drugs': 'Antihypertensive drugs (days since last)',
    'num_outp_att_G4': 'Outpatient visits (Psychiatry Of Old Age)',
    'dsl_antidepressant_drugs': 'Antidepressant drugs (days since last)',
    'num_outp_att_A1': 'Outpatient visits (General Medicine)',
    'phys_men_multimorbidity': 'Physical-mental multimorbidity',
    'num_outp_att_C3': 'Outpatient visits (Anaesthetics)',
    'hba1c_(ifcc)_nh': 'HbA1c (IFCC) - high',
    'n_presc_nausea_vertigo_drugs': 'Nausea and vertigo drugs (# prescribed)',
    'albumin_nl': 'Albumin - low',
    'total_menlongterm_conditions': '# Unique mental chronic conditions',
    'num_outp_att_AB': 'Outpatient visits (Geriatric Medicine)',
    'num_outp_att_R5': 'Outpatient visits (Physiotherapy)',
    'dsl_physltc_prog_neur_disease': 'Progressive neurological disease (days)',
    'ast_rm': 'AST (U/L) - moving average',
    'num_inp_attendances': '# Scheduled inpatient attendances',
    'ferritin_nl': 'Ferritin - low',
    'dsl_physltc_arthritis_arthropathy': 'Arthritis or other arthropathy (days)',
    'num_outp_att_C7': 'Outpatient visits (Opthalmology)',
    'dsl_physltc_heart_failure': 'Heart Failure (days)',
    'num_outp_att_AG': 'Outpatient visits (Renal Medicine)',
    'num_outp_att_A9': 'Outpatient visits (Gastroenterology)',
    'total_drug_categories': '# Unique prescribed drug categories',
    'num_outp_att_A82': 'Outpatient visits (Diabetes)',
    'urea_rs': 'Urea (mmol/L) - moving std',
    'num_outp_att_C8': 'Outpatient visits (Trauma and Orthopaedic Surgery)',
    'ferritin_nh': 'Ferritin - high',
    'bilirubin_nl': 'Bilirubin - low',
    'num_outp_att_A2': 'Outpatient visits (Cardiology)',
    'num_outp_att_C5': 'Outpatient visits (ENT)',
    'n_presc_antidepressant_drugs': 'Antidepressant drugs (# prescribed)',
    'urea_nh': 'Urea - high',
    'num_outp_att_A81': 'Outpatient visits (Endocrine)',
    'dsl_physltc_liver_disease': 'Liver disease (days)',
    'dsl_antiplatelet_drugs': 'Antiplatelet drugs (days since last)',
    'num_inp_att_AG': 'Inpatient visits (Renal Medicine)',
    'white_cell_count_nh': 'White Cell Count - high',
    'ggt_rs': 'GGT (U/L) - moving std',
    'mean_cell_volume_nh': 'MCV - high',
    'dsl_physltc_chronic_renal_disease': 'Chronic renal disease (days)',
    'bilirubin_rs': 'Bilirubin (umol/L) - moving std',
    'c-reactive_prot_rs': 'CRP (mg/L) - moving std',
    'dsl_physltc_stroke': 'Stroke - (days)',
    'dsl_physltc_atrial_fibrillation': 'Atrial fibrillation (days)',
    'dsl_physltc_copd': 'COPD (days)',
    'dsl_physltc_per_vascular_disease': 'Peripheral Vascular Disease (days)',
    'n_presc_parkinsonism_drugs': 'Parkinsonism drugs (# prescribed)',
    'num_outp_att_dna_AG': 'Outpatient failed visits (Renal Medicine)',
    'c-reactive_prot_rm': 'CRP (mg/L) - moving average',
    'n_presc_nitrates_ccb_drugs': 'Nitrates and CCBs (# prescribed)',
    'n_presc_beta_blockers': 'Beta blockers (# prescribed)',
    'ferritin_rs': 'Ferritin (ug/L) - moving std',
    'num_inp_att_C8': 'Inpatient visits (Trauma and Orthopaedic Surgery)',
    'n_presc_antiplatelet_drugs': 'Antiplatelet drugs (# prescribed)',
    'monocyte_count_nh': 'Monocyte count - high',
    'num_outp_att_C9': 'Outpatient visits (Plastic Surgery)',
    'lactate_rs': 'Lactate (mmol/L) - moving std',
    'neutrophil_count_nh': 'Neutrophil count - high',
    'n_presc_diuretics': 'Diuretics (# prescribed)',
    'num_outp_attendances': '# Outpatient visits',
    'dsl_bone_metabolism_affecting_drugs': 'Bone/metabolism-affecting drugs (days since last)',
    'ggt_rm': 'GGT (U/L) - moving average',
    'hba1c_(ifcc)_rm': 'HbA1c (IFCC) - moving average',
    'dsl_menltc_depression': 'Depression (days)',
    'dsl_physltc_hypertension': 'Hypertension (days)',
    'sodium_nl': 'Sodium - low',
    'ggt_nh': 'GGT - high',
    'n_presc_lipid_regulators': 'Lipid regulators (# prescribed)',
    'n_presc_antipsychotics': 'Antipsychotics (# prescribed)',
    'dsl_physltc_inf_bowel_disease': 'Inflammatory Bowel Disease (days)',
    'calcium_nl': 'Calcium - low',
    'dsl_physltc_ischaemic_heart_disease': 'Ischaemic Heart Disease (days)',
    'n_presc_anti_hypertension_hf_drugs': 'Antihypertensive drugs (# prescribed)',
    'n_presc_antidementia_drugs': 'Anti-dementia drugs (# prescribed)',
    'lymphocyte_count_nl': 'Lymphocyte count - low',
    'egfr_(/1.73m2)_nl': 'eGFR (/1.73m2) - low',
    'egfr_(/1.73m2)_v': 'eGFR (/1.73m2) - last value',
    'egfr_(/1.73m2)_rs': 'eGFR (/1.73m2) - moving std',
    'platelet_count_nl': 'Platelet Count - low',
    'dsl_diuretics': 'Diuretics (days since last)',
    'n_presc_bone_metabolism_affecting_drugs': 'Bone/metabolism-affecting drugs (# prescribed)',
    'ferritin_v': 'Ferritin (ug/L) - last value',
    'num_outp_att_AQ': 'Outpatient visits (Respiratory Medicine)',
    'esr_v': 'ESR (mm/hr) - last value',
    'dsl_physltc_epilepsy': 'Epilepsy (days)',
    'num_outp_att_AD': 'Outpatient visits (Medical Oncology)',
    'dsl_physltc_diabetes': 'Diabetes (days)',
    'dsl_menltc_alcohol_substance_misuse': 'Alcohol/substance misuse (days)',
    'dsl_physltc_obesity': 'Obesity (days)',
    'num_outp_att_AP': 'Outpatient visits (Rehabilitation Medicine)',
    'num_outp_att_H2': 'Outpatient visits (Clinical Oncology)',
    'c-reactive_prot_v': 'CRP (mg/L) - last value',
    'ck_v': 'CK (IU/L) - last value',
    'dsl_physltc_asthma': 'Asthma (days)',
    'hs_troponin_i_v': 'HS Troponin I (ng/L) - last value',
    'basophil_count_rm': 'Basophil Count - moving average',
    'num_outp_att_dna_A9': 'Outpatient failed visits (Gastroenterology)',
    'num_outp_att_AH': 'Outpatient visits (Neurology)',
    'alt_nl': 'ALT - low',
    'dsl_menltc_chronic_psychiatry_disorder': 'Chronic Psychiatric Disorder (days)',
    'hs_troponin_i_rm': 'HS Troponin I (ng/L) - moving average',
    'dsl_physltc_osteoporosis': 'Osteoporosis (days)',
    'basophil_count_v': 'Basophil Count - last value',
    'albumin_rm': 'Albumin (g/L) - moving average',
    'creatinine_nl': 'Creatinine - low',
    'ferritin_rm': 'Ferritin (ug/L) - moving average',
    'dsl_physltc_hip_fracture': 'Hip fracture (days)',
    'num_outp_att_dna_C7': 'Outpatient failed visits (Opthalmology)',
    'creatinine_nh': 'Creatinine - high',
    'alk.phos_nh': 'Alkaline Phosphatase - high',
    'eosinophil_count_nl': 'Eosinophil Count - low',
    'dsl_physltc_historical_or_active_cancer': 'Historical or Active Cancer (days)',
    'bilirubin_rm': 'Bilirubin (umol/L) - moving average',
    'Sex_F': 'Sex (Female)',
    'esr_rm': 'ESR (mm/hr) - moving average',
    'urea_rm': 'Urea (mmol/L) - moving average',
    'ck_rm': 'CK (IU/L) - moving average',
    'triage_code': 'ED triage code',
    'tco2_v': 'tCO2 (mmol/L) - last value',
    'arrival_mode_R': 'Arrival - Routine Ambulance',
    'dsl_inp_att': 'Last scheduled inpatient attendance',
    'trQ_4at': '4AT Score',
    'albumin_v': 'Albumin (g/L) - last value',
    'hs_troponin_t_v': 'HS Troponin T (ng/L) - last value',
    'albumin_rs': 'Albumin (g/L) - moving std',
    'tco2_rs': 'tCO2 (mmol/L) - moving std',
    'bilirubin_v': 'Bilirubin (umol/L) - last value',
    'haematocrit_nl': 'Haematocrit - low'}

In [None]:
base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype map; the training
# file relies on pandas' type inference - confirm this asymmetry is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

### Rewrite '<' and ',' in column names as plain-text tokens
### (presumably because of feature-name restrictions downstream - TODO confirm)
train_data.columns = (train_data.columns
                      .str.replace('<', '_below_', regex=False)
                      .str.replace(',', '_', regex=False))
val_data.columns = (val_data.columns
                    .str.replace('<', '_below_', regex=False)
                    .str.replace(',', '_', regex=False))

### Shuffle rows with a fixed seed and renumber them from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields: whichever of lkup_fields are present, in lkup_fields order
train_lkup_m = train_data[[name for name in lkup_fields if name in train_data.columns]]
val_lkup_m = val_data[[name for name in lkup_fields if name in val_data.columns]]
### GT fields: the label column for this target
train_y_m = train_data['gt_m']
val_y_m = val_data['gt_m']

In [None]:
### XGBoost features: every column that was not set aside as a lookup field
train_x_m = train_data.drop(columns=list(train_lkup_m.columns))
val_x_m = val_data.drop(columns=list(val_lkup_m.columns))
print('Training features', train_x_m.columns.tolist(), sep='\n')
print(train_x_m.shape, val_x_m.shape, train_y_m.shape, val_y_m.shape)

In [None]:
### Wrap features and labels in DMatrix objects for xgb.train
train_dm_m = xgb.DMatrix(data=train_x_m, label=train_y_m)
val_dm_m = xgb.DMatrix(data=val_x_m, label=val_y_m)

##### Set up hyperparameters

In [None]:
early_stopping = 100
rounds = 20000
# Negative-to-positive ratio of the training labels; only printed here, because the
# 'scale_pos_weight' option below is switched off.
pos_weight = round(int(train_y_m.eq(0).sum()) / int(train_y_m.eq(1).sum()), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
# Share of positive labels in the training set.
print(round(int(train_y_m.eq(1).sum()) / len(train_y_m), 2))
params_def = dict(
    max_depth=3,
    objective='binary:logistic',
    nthread=25,
    eval_metric='logloss',
    eta=.01,
    # Options left disabled:
    #   colsample_bytree=.5, alpha=1, lambda=2, subsample=.6
    #   scale_pos_weight=pos_weight   (for imbalanced data)
)

##### Training/eval pipeline

In [None]:
def train_optimize_model(train_dm, val_dm, save_path, target, version,
                         param_grid=params_def,
                         es=early_stopping, rounds=rounds):
    """Train a booster with early stopping, refit it at the best round count and save it.

    Parameters
    ----------
    train_dm, val_dm : xgb.DMatrix
        Training and validation matrices; both are watched during training and the
        validation set (listed last) drives early stopping.
    save_path, target, version : str
        The model is written to ``save_path + '_' + target + '_' + version + '.model'``.
    param_grid : dict
        Booster parameters passed to ``xgb.train``. Defaults to the module-level
        ``params_def`` as it was when this function was defined.
    es : int
        Early-stopping patience in boosting rounds.
    rounds : int
        Upper bound on boosting rounds for the first run.

    Returns
    -------
    (best_model, evals_result)
        The refitted booster and the per-round metric history of the first,
        early-stopped run (keys 'train' and 'validation').
    """
    print('Training baseline model for target: ', target)
    watchlist = [(train_dm, 'train'), (val_dm, 'validation')]
    evals_result = {}
    model = xgb.train(param_grid, train_dm, num_boost_round=rounds, early_stopping_rounds=es,
                      evals=watchlist, evals_result=evals_result)
    # best_iteration is a 0-based index, so the best model consists of best_iteration + 1
    # trees. Refitting with num_boost_round=best_iteration would drop the best tree.
    best_rounds = model.best_iteration + 1
    print('Best Score: {:.3f} with {} rounds'.format(model.best_score, best_rounds))
    print('Refitting model to best iteration...')
    best_model = xgb.train(param_grid, train_dm, num_boost_round=best_rounds, early_stopping_rounds=es,
                           evals=watchlist, verbose_eval=False)
    print('Training complete. Storing baseline to disk.')
    best_model.save_model(save_path + '_' + target + '_' + version + '.model')
    return best_model, evals_result

def plot_learning_curve(model, evals_result, metric='logloss'):
    """Plot the training and validation curves recorded by ``xgb.train``.

    Parameters
    ----------
    model : any
        Not used; kept so existing calls keep working.
    evals_result : dict or None
        Metric history with keys 'train' and 'validation', each mapping ``metric`` to a
        list of per-round values. Nothing is drawn when it is None.
    metric : str
        Name of the metric to plot.
    """
    if evals_result is None:
        return
    train_curve = evals_result['train'][metric]
    val_curve = evals_result['validation'][metric]
    x_axis = range(len(train_curve))
    plt.figure(figsize=(6,6))
    lw = 1.5
    plt.plot(x_axis, train_curve, color='darkorange', lw=lw, label='Training loss')
    plt.plot(x_axis, val_curve, color='navy', lw=lw, label='Validation loss')
    plt.xlabel('# Epochs')
    # Keep the historical axis label for the default metric; otherwise name the metric.
    plt.ylabel('Logistic loss' if metric == 'logloss' else metric)
    plt.title('XGBoost Classifier learning curves')
    # The curves are labelled, so draw the legend that makes them distinguishable.
    plt.legend(loc='upper right')
    plt.show()

def plot_roc(true_labels, pred_probs, model_path=None, title=None):
    """Draw the ROC curve of ``pred_probs`` against ``true_labels`` with its AUC in the legend.

    ``model_path`` is accepted but not used. When ``title`` is given it is appended to the
    figure title.
    """
    fpr, tpr, _ = roc_curve(true_labels, pred_probs, pos_label=1)
    curve_label = 'ROC curve (AUC = {0:.3f})'.format(auc(fpr, tpr))
    line_width = 1.5
    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label=curve_label)
    # Diagonal reference line.
    plt.plot([0,1], [0,1], color='navy', lw=line_width, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(('ROC Curve representing: ' + title) if title else 'ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

def plot_pr(true_labels, precision, recall, prauc):
    """Draw a precision-recall curve together with the positive-rate baseline.

    ``precision`` and ``recall`` are the curve coordinates, ``prauc`` the area shown in the
    legend. The dashed horizontal line is the fraction of ``true_labels`` equal to 1.
    """
    baseline = len(true_labels[true_labels==1]) / len(true_labels)
    line_width = 1.5
    legend_text = 'PR curve (AUC = {0:.3f}, b={1:.3f})'.format(prauc, baseline)
    plt.figure(figsize=(6,6))
    plt.plot(recall, precision, color='darkorange', lw=line_width, label=legend_text)
    plt.plot([0,1], [baseline,baseline], color='navy', lw=line_width, linestyle='--')
    plt.xlim([-0.01, 1.01])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='upper right')
    plt.show()

def get_roc_performance(labels_val, labels_pred_val):
    """Print ROC-based metrics and binarise predictions at the Youden-optimal threshold.

    Parameters
    ----------
    labels_val : array-like of {0, 1}
        True labels.
    labels_pred_val : array-like of float
        Predicted scores for the positive class.

    Returns
    -------
    labels_pred_val_b : numpy.ndarray of int
        1 where the score is at or above the selected threshold, else 0.
    res_dict_roc : dict
        Keys 'ROC-AUC', 'ROC-upper' and 'ROC-lower', taken from the
        ``confidenceinterval.roc_auc_score`` result.
    """
    fpr_val, tpr_val, th_val = roc_curve(labels_val, labels_pred_val, pos_label=1)
    res_dict_roc = {}
    print('---------ROC----------')
    print('ROC-AUC score on validation set: {0:.3f}'.format(auc(fpr_val, tpr_val)))
    ## Youden's J = TPR - FPR; pick the threshold where it is largest
    j_sc = np.argmax(tpr_val - fpr_val)
    yd = th_val[j_sc]
    # The printed value is the score threshold at the maximum of J, not J itself.
    print('Youden Index {0:.3f}'.format(yd))
    # roc_curve evaluates each threshold as "score >= threshold", so the same comparison is
    # needed here to reproduce the operating point that was selected above.
    labels_pred_val_b = (np.asarray(labels_pred_val) >= yd).astype(int)
    print('Classification report based on Youden\'s J-statistic.')
    print(classification_report(labels_val, labels_pred_val_b, target_names=['0', '1']))
    aucss, ci = roc_auc_score(labels_val, labels_pred_val, confidence_level=0.95)
    ppv, cip = ppv_score(labels_val, labels_pred_val_b, confidence_level=0.95)
    npv, cin = npv_score(labels_val, labels_pred_val_b, confidence_level=0.95)
    tnr, cit = tnr_score(labels_val, labels_pred_val_b, confidence_level=0.95)
    tpr, cis = tpr_score(labels_val, labels_pred_val_b, confidence_level=0.95)
    print('ROC-AUC with 95% CI based on DeLong test: {0:.3f} [{1:.3f}, {2:.3f}]'.format(aucss, ci[0], ci[1]))
    print('PPV Score: {0:.3f}[{1:.3f}, {2:.3f}], NPV Score: {3:.3f}[{4:.3f}, {5:.3f}]'.format(ppv, cip[0], cip[1],
                                                                                              npv, cin[0], cin[1]))
    print('Specificity: {0:.3f}[{1:.3f}, {2:.3f}]'.format(tnr, cit[0], cit[1]))
    print('Sensitivity: {0:.3f}[{1:.3f}, {2:.3f}]'.format(tpr, cis[0], cis[1]))
    res_dict_roc['ROC-AUC'] = aucss
    res_dict_roc['ROC-upper'] = ci[1]
    res_dict_roc['ROC-lower'] = ci[0]
    return labels_pred_val_b, res_dict_roc

def expect_f1(y_prob, thres):
    """Expected F1 at ``thres`` when each probability stands in for its own label.

    Scores at or above ``thres`` count as predicted positive: their probability mass is the
    expected true positives and the remainder of their count the expected false positives.
    The probability mass below ``thres`` is the expected false negatives.
    """
    above = y_prob >= thres
    below = y_prob < thres
    tp = y_prob[above].sum()
    fp = above.sum() - tp
    fn = y_prob[below].sum()
    return 2 * tp / (2 * tp + fp + fn)

def opt_thres_f1(y_prob):
    """Pick the probability threshold that maximises the expected F1.

    Every predicted probability is tried as a candidate threshold (ascending order) and
    scored with ``expect_f1``; the first maximum wins.

    Parameters
    ----------
    y_prob : 1-D array-like of float
        Predicted probabilities.

    Returns
    -------
    (thres, f1s)
        The selected threshold and the list of expected F1 values, one per sorted probability.
    """
    # np.sort always returns ascending order, so reversing the input beforehand had no
    # effect; the candidates are simply the sorted probabilities.
    sorted_probs = np.sort(np.asarray(y_prob))
    f1s = [expect_f1(sorted_probs, p) for p in sorted_probs]
    thres = sorted_probs[np.argmax(f1s)]
    return thres, f1s

def get_pr_performance(labels_val, labels_pred_val, labels_pred_val_b, opt_f1=True):
    """Print precision-recall metrics and return the PR curve with an approximate interval.

    Parameters
    ----------
    labels_val : array-like of {0, 1}
        True labels.
    labels_pred_val : array-like of float
        Predicted probabilities for the positive class.
    labels_pred_val_b : array-like of {0, 1}
        Binary predictions. Used as given only when ``opt_f1`` is False.
    opt_f1 : bool
        When True, binary predictions are recomputed at the threshold returned by
        ``opt_thres_f1`` and a classification report with PPV/NPV/specificity/sensitivity
        is printed.

    Returns
    -------
    (prec, recall, prauc, res_dict)
        Curve coordinates from ``precision_recall_curve``, the area under that curve and a
        dict with keys 'PR-AUC', 'PR-upper' and 'PR-lower'.
    """
    prec, recall, _ = precision_recall_curve(labels_val, labels_pred_val, pos_label=1)
    f1, prauc = f1_score(labels_val, labels_pred_val_b), auc(recall, prec)
    res_dict = {}
    if opt_f1:
        thres, _ = opt_thres_f1(labels_pred_val)
        print('F1-optimised probability threshold: {:.3f}'.format(thres))
        # expect_f1 counts scores >= threshold as positive; binarise the same way.
        labels_pred_val_b = (np.asarray(labels_pred_val) >= thres).astype(int)
        f1 = f1_score(labels_val, labels_pred_val_b)
        print('Classification report based on the F1-optimised threshold.')
        print(classification_report(labels_val, labels_pred_val_b, target_names=['0', '1']))
        ppv, cip = ppv_score(labels_val, labels_pred_val_b, confidence_level=0.95)
        npv, cin = npv_score(labels_val, labels_pred_val_b, confidence_level=0.95)
        tnr, cit = tnr_score(labels_val, labels_pred_val_b, confidence_level=0.95)
        tpr, cis = tpr_score(labels_val, labels_pred_val_b, confidence_level=0.95)
        print('PPV Score: {0:.3f}[{1:.3f}, {2:.3f}], NPV Score: {3:.3f}[{4:.3f}, {5:.3f}]'.format(ppv, cip[0], cip[1],
                                                                                              npv, cin[0], cin[1]))
        print('Specificity: {0:.3f}[{1:.3f}, {2:.3f}]'.format(tnr, cit[0], cit[1]))
        print('Sensitivity: {0:.3f}[{1:.3f}, {2:.3f}]'.format(tpr, cis[0], cis[1]))
    print('---------Precision-Recall----------')
    print('PR-AUC score on validation set: {0:.3f}, F1 Score: {1:.3f}'.format(prauc, f1))

    # Confusion counts at the chosen operating point (compared position by position).
    y_true = np.asarray(labels_val)
    y_hat = np.asarray(labels_pred_val_b)
    tp = int(((y_true == 1) & (y_hat == 1)).sum())
    fp = int(((y_true != 1) & (y_hat == 1)).sum())
    fn = int(((y_true != 0) & (y_hat == 0)).sum())
    n_pred_pos = tp + fp
    n_true_pos = tp + fn
    prec_s = tp / n_pred_pos if n_pred_pos != 0 else 0
    recall_s = tp / n_true_pos if n_true_pos != 0 else 0

    # Wald standard errors; defined as 0 when the denominator is empty so that a model
    # with no predicted (or no actual) positives does not divide by zero.
    se_prec = np.sqrt(prec_s * (1 - prec_s) / n_pred_pos) if n_pred_pos != 0 else 0.0
    se_rec = np.sqrt(recall_s * (1 - recall_s) / n_true_pos) if n_true_pos != 0 else 0.0
    z = stats.norm.ppf(1 - 0.05 / 2)
    cip = (prec_s - z * se_prec, prec_s + z * se_prec)
    cin = (recall_s - z * se_rec, recall_s + z * se_rec)
    # Combine the two variances directly. Deriving them from the interval widths without
    # dividing by z, and then multiplying by 1.96 again, applied the normal quantile twice
    # and made the interval about 1.96 times too wide.
    # NOTE(review): adding the precision and recall variances at a single threshold is a
    # heuristic for the uncertainty of the area; a bootstrap would be the safer choice.
    auc_se = np.sqrt(se_prec ** 2 + se_rec ** 2)
    lb = prauc - z * auc_se
    ub = prauc + z * auc_se
    # DeLong's method applies to ROC-AUC; this interval is a normal approximation.
    print('PR-AUC with approximate 95% CI (normal approximation): {0:.3f} [{1:.3f}, {2:.3f}]'.format(prauc, lb, ub))
    print('Precision Score: {0:.3f}[{1:.3f}, {2:.3f}], Recall Score: {3:.3f}[{4:.3f}, {5:.3f}]'.format(prec_s, cip[0], cip[1],
                                                                                              recall_s, cin[0], cin[1]))
    res_dict['PR-AUC'] = prauc
    res_dict['PR-upper'] = ub
    res_dict['PR-lower'] = lb
    return prec, recall, prauc, res_dict

def evaluate_model(train_features, val_features, labels_train, labels_val, model, evals_result, task, 
                   class_names=['0', '1'], roc=True, pr=False, bsamples=100, calib=False, train_lkup=None, 
                  tgt='gt_m'):
    """Evaluate a trained booster on the validation set and return its summary metrics.

    Parameters
    ----------
    train_features, labels_train
        Training data; only used for the calibration step (``calib=True``).
    val_features : pandas.DataFrame
        Validation features; an 'age_gr' column, if present, is dropped before prediction.
    labels_val : array-like of {0, 1}
        Validation labels.
    model : xgb.Booster
        Trained model.
    evals_result : dict or None
        Training history forwarded to ``plot_learning_curve``.
    task : str
        Name printed in the log.
    class_names, bsamples
        Accepted but not used.
    roc, pr, calib : bool
        Switch on the ROC plot, the PR plot and the spline re-calibration respectively.
    train_lkup, tgt
        Lookup frame and target column forwarded to ``plot_calibration``.

    Returns
    -------
    dict
        ROC and PR summary values ('ROC-AUC', 'ROC-upper', 'ROC-lower', 'PR-AUC',
        'PR-upper', 'PR-lower'). They describe the re-calibrated probabilities when
        ``calib`` is True and the raw model output otherwise.
    """
    print('Evaluating model for target: ' + task)
    if 'age_gr' in val_features:
        val_features = val_features.drop('age_gr', axis=1)
    plot_learning_curve(model, evals_result)
    labels_pred_val = model.predict(xgb.DMatrix(val_features))
    # Keep the result dicts of the uncalibrated run so that a summary can be returned even
    # when calib=False (they used to be bound only inside the calibration branch).
    labels_pred_val_b, res_dict_roc = get_roc_performance(labels_val, labels_pred_val)
    prec, recall, prauc, res_dict_pr = get_pr_performance(labels_val, labels_pred_val, labels_pred_val_b)
    if roc:
        print('Plotting ROC Curve on validation set')
        plot_roc(labels_val, labels_pred_val)
    if pr:
        print('Plotting PR-Curve on validation set')
        plot_pr(labels_val, prec, recall, prauc)
    if calib:
        print('Plotting Calibration curve..')
        spline_probs = plot_calibration(labels_val, labels_pred_val, train_features, labels_train, model, 
                                        train_lkup, tgt)
        print('-------Re-calibrated evaluation--------')
        print('---------------------------------------')
        labels_pred_val_b, res_dict_roc = get_roc_performance(labels_val, spline_probs)
        prec, recall, prauc, res_dict_pr = get_pr_performance(labels_val, spline_probs, labels_pred_val_b) 

    res_dict = res_dict_roc | res_dict_pr
    print('Evaluation complete.')
    return res_dict

def get_roc_pr_summary(models_list, val_data_list, true_labels, md_lb=[], colors=['tan', 'sienna', 'darkblue',
                                                                                 'darkgreen', 'goldenrod', 'tomato']):
    """Overlay the ROC and precision-recall curves of several models on two summary figures.

    Parameters
    ----------
    models_list : sequence of xgb.Booster
        Trained models.
    val_data_list : sequence of pandas.DataFrame
        Validation features, one frame per model.
    true_labels : sequence of array-like
        Validation labels, one per model.
    md_lb : sequence of str
        Legend names, one per model. When empty, 'Model 1', 'Model 2', ... are used.
    colors : sequence
        Line colours; reused cyclically when there are more models than colours.

    Notes
    -----
    ``get_roc_performance`` and ``get_pr_performance`` are called for every model, so their
    printed reports appear in the output as well.
    """
    models_list = list(models_list)
    # The default arguments are only read or rebound locally, never modified in place.
    if len(md_lb) == 0:
        md_lb = ['Model {}'.format(i + 1) for i in range(len(models_list))]
    if len(colors) == 0:
        colors = [None]  # let matplotlib choose
    fprs = {}
    tprs = {}
    roc_aucs = {}
    roc_cis = {}
    precs = {}
    recalls = {}
    praucs = {}
    pr_cis = {}
    rand_ths = {}
    for model, val, md_name, y_true in zip(models_list, val_data_list, md_lb, true_labels):
        labels_pred = model.predict(xgb.DMatrix(val))
        fpr, tpr, _ = roc_curve(y_true, labels_pred, pos_label=1)
        # The helpers already compute the intervals and the PR curve, so reuse their results
        # instead of calculating everything a second time.
        labels_pred_val_b, roc_res = get_roc_performance(y_true, labels_pred)
        prec, recall, pr_auc, pr_res = get_pr_performance(y_true, labels_pred, labels_pred_val_b, opt_f1=True)
        fprs[md_name] = fpr
        tprs[md_name] = tpr
        roc_aucs[md_name] = auc(fpr, tpr)
        roc_cis[md_name] = [roc_res['ROC-lower'], roc_res['ROC-upper']]
        # Positive rate = precision of a random classifier; kept unrounded so the baseline
        # is drawn where it belongs even for rare outcomes.
        rand_ths[md_name] = len(y_true[y_true==1]) / len(y_true)
        precs[md_name] = prec
        recalls[md_name] = recall
        praucs[md_name] = pr_auc
        pr_cis[md_name] = [pr_res['PR-lower'], pr_res['PR-upper']]

    plt.figure(figsize=(6,6))
    lw=1.5
    # Iterate over the models that were actually evaluated; colours wrap around so that no
    # model is silently left out of the figure.
    for idx, md_name in enumerate(fprs):
        color = colors[idx % len(colors)]
        plt.plot(fprs[md_name], tprs[md_name], color=color, lw=lw, 
                 label='{0}, (AUC = {1:.2f} [{2:.2f}, {3:.2f}])'.format(md_name, roc_aucs[md_name], roc_cis[md_name][0],
                                                                   roc_cis[md_name][1]))
    plt.plot([0,1], [0,1], color='navy', lw=lw, linestyle='--', label='Random choice')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve summary')
    plt.legend(loc='lower right')
    plt.show()

    plt.figure(figsize=(6,6))
    lw=1.5
    for idx, md_name in enumerate(precs):
        color = colors[idx % len(colors)]
        plt.plot(recalls[md_name], precs[md_name], color=color, lw=lw, 
                 label='{0}, (AUC = {1:.2f} [{2:.2f},{3:.2f}])'.format(md_name, praucs[md_name],
                                                                   pr_cis[md_name][0],
                                                                   pr_cis[md_name][1]))
        plt.plot([0,1], [rand_ths[md_name], rand_ths[md_name]], color=color, lw=lw, linestyle='--')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve summary')
    plt.legend(loc='upper right')
    plt.show()

def plot_calibration(y_true, y_pred, x_train, y_train, model, train_lkup, tgt='gt_m'):
    """Fit a spline calibrator on a held-out slice of the training data and plot the effect.

    A 30% calibration subset is split off ``x_train`` (seed 42), stratified by the target
    alone for the 'gt_adrd' family and by target plus 'age_gr' otherwise. The calibrator is
    fitted on the model's predictions for that subset and then applied to ``y_pred``.
    Reliability diagrams are drawn before and after calibration.

    The ``y_train`` argument is not used: labels are read from ``train_lkup[tgt]``.

    NOTE(review): if ``model`` was trained on all of ``x_train``, the calibration subset is
    not unseen data for the model - confirm that this is acceptable.

    Returns
    -------
    The calibrated probabilities for ``y_pred``.
    """
    ### Split calibration set from training
    if tgt in ['gt_adrd', 'gt_5yadrd', 'gt_10yadrd']:
        _, x_calib, _, y_calib = train_test_split(x_train, train_lkup[tgt],
                                                  test_size=0.3, random_state=42,
                                                  stratify=train_lkup[tgt])
    else:
        # Target and age group together act both as the labels to split and as the strata.
        strata = pd.concat([train_lkup[tgt], train_lkup['age_gr']], axis=1)
        _, x_calib, _, calib_part = train_test_split(x_train, strata,
                                                     test_size=0.3, random_state=42,
                                                     stratify=strata)
        y_calib = calib_part[tgt]
    calib_preds = model.predict(xgb.DMatrix(x_calib))
    calib = mli.SplineCalib()
    calib.fit(calib_preds, y_calib)

    # Before calibration: fitted spline plus reliability diagram of the raw predictions.
    plt.figure(figsize=(6,6))
    calib.show_calibration_curve()
    mli.plot_reliability_diagram(y_true, y_pred, show_histogram=False, error_bars=False,
                                legend_names=['Pre-calibration curve', 'Perfectly calibrated'])
    plt.title('Reliability diagram')
    plt.show()
    plt.figure(figsize=(12,6))
    mli.plot_reliability_diagram(y_true, y_pred, show_histogram=True, error_bars=False)
    plt.title('Probability scores')
    plt.show()

    # After calibration: a second spline, fitted on the evaluation data itself, is used
    # only to draw the curve of the calibrated scores.
    spline_probs = calib.predict(y_pred)
    calibt = mli.SplineCalib()
    calibt.fit(spline_probs, y_true)
    plt.figure(figsize=(6,6))
    calibt.show_calibration_curve()
    mli.plot_reliability_diagram(y_true, spline_probs, show_histogram=False, error_bars=False,
                                 legend_names=['Spline calibration curve', 'Perfectly calibrated'])
    plt.title('After Spline Calibration')
    plt.show()
    plt.figure(figsize=(12,6))
    mli.plot_reliability_diagram(y_true, spline_probs, show_histogram=True, error_bars=False)
    plt.title('After Spline Calibration')
    plt.show()
    return spline_probs
    
def get_xgb_feat_importance(val_data, model, task, n_features=20, imp_type='gain'):
    """Plot the booster's built-in feature importances.

    Parameters
    ----------
    val_data : pandas.DataFrame
        Frame whose column names are assigned to the model as feature names.
    model : xgb.Booster
        Trained model. Its ``feature_names`` are overwritten and ``feature_types`` reset.
    task : str
        Name used in the log line and the figure title.
    n_features : int
        Maximum number of features shown.
    imp_type : str
        Importance type passed to ``Booster.get_score``.
    """
    print('Getting global feature importances for target: ', task)
    model.feature_names = val_data.columns.tolist()
    model.feature_types = None
    importance = {feat: round(score, 2)
                  for feat, score in model.get_score(importance_type=imp_type).items()}
    # plot_importance opens a figure of its own when it is given no axes, which used to
    # leave an empty 12x10 figure behind; draw on explicitly created axes instead.
    fig, ax = plt.subplots(figsize=(12,10))
    xgb.plot_importance(importance, ax=ax, max_num_features=n_features, importance_type=imp_type)
    ax.set_title('XGBoost global feature importances for {}. Mode: {}.'.format(task, imp_type))
    plt.show()

def get_shap_feature_importance(val_x, model, out_path, subset='Clinically-supervised model',
                                task='Any dementia diagnosis', 
                                fn='adem_diag', n_feat=20, plot_type='bar', feature_names=None):
    """Show a SHAP summary plot of global feature importances for a tree model.

    Args:
        val_x: feature frame the SHAP values are computed on and plotted against.
        model: tree model handed to ``shap.TreeExplainer``.
        out_path: currently unused (figure saving is disabled).
        subset: text appended to the plot title.
        task: label printed in the log line.
        fn: currently unused (file-name stem for the disabled figure saving).
        n_feat: maximum number of features displayed.
        plot_type: currently unused; it is not forwarded to ``shap.summary_plot``.
        feature_names: optional display names forwarded to ``shap.summary_plot``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    print('Getting global feature importances for task:', task)
    shap.initjs()
    plot_title = 'SHAP global importances: ' + subset
    tree_shap_values = shap.TreeExplainer(model).shap_values(val_x)
    shap.summary_plot(
        tree_shap_values,
        val_x,
        feature_names=feature_names,
        max_display=n_feat,
        plot_size=(10, 7),
        title=plot_title,
        show=False,
    )
    plt.title(plot_title)
    # Saving the figure under out_path / fn is currently disabled.
    plt.show()

In [None]:
xgb_m, evals = train_optimize_model(train_dm_m, val_dm_m, model_path, 'in_hosp_death', 'v1.1.ADM')

In [None]:
xgb_m = xgb.Booster()
xgb_m.load_model('')

In [None]:
plt.rcParams.update({'font.size':12, 'font.weight':'normal', 'font.family':'serif'})

In [None]:
res_dict = evaluate_model(train_x_m, val_x_m, train_y_m, val_y_m, xgb_m, evals, 'in_hosp_death', pr=True, calib=True,
              train_lkup=train_lkup_m, tgt='gt_m')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'In-hospital death'

In [None]:
get_shap_feature_importance(val_x_m.rename(columns=feature_names), xgb_m, out_path=None, subset='All features model',
                                task='In-hospital death', 
                                fn='in_hosp_death', n_feat=20, plot_type='bar')

##### Decile analysis

In [None]:
def rank_prediction_deciles(model, val_data, gt_data, gt_field, title=None, by_age=False,
                           n_bins=10, age_lb=['65-69', '70-74', '75-79', '80-84', '85-89', '90+'],
                            colors=['#fef0d9', '#fdd49e', '#fdbb84', '#fc8d59', '#e34a33', '#b30000'],
                            age_title=None):
    """Rank predictions into quantile bins and plot the observed outcome rate per bin.

    Args:
        model: trained booster; ``model.predict(xgb.DMatrix(...))`` supplies the scores.
        val_data: feature frame to score. An 'age_gr' column, if present, is ignored.
            The caller's frame is never modified.
        gt_data: frame holding the ground-truth column ``gt_field``; it is assigned to
            the prediction table by index label (predictions carry a 0..n-1 index).
        gt_field: name of the binary outcome column in ``gt_data``.
        title: title of the response-rate bar chart.
        by_age: when True, also plot the age-band composition of every bin
            (requires an 'AgeAtAdmission' column in ``val_data``).
        n_bins: number of quantile bins (10 = deciles).
        age_lb: legend labels for the six age bands, youngest first.
        colors: bar colours for the six age bands, youngest first.
        age_title: title of the age-composition chart.

    Returns:
        dict with 'outcome_prev' (overall outcome rate in %) and
        '10th_decile_response' (outcome rate in % of the highest-scored bin).
    """
    res_dict = {}
    # drop() always returns a new frame, so nothing below can leak into the caller's data.
    features = val_data.drop(columns=['age_gr'], errors='ignore')
    val_pr = pd.DataFrame({'prob': model.predict(xgb.DMatrix(features))})
    val_pr['rank_xgb'] = pd.qcut(val_pr['prob'], q=n_bins, labels=False) + 1
    val_pr[gt_field] = gt_data[gt_field]
    avg_resp = val_pr[gt_field].mean() * 100

    by_bin = val_pr.groupby('rank_xgb')
    dec_stats = by_bin[gt_field].sum().reset_index()
    # Response rate = positives / scored rows in the bin (both ordered by rank_xgb).
    dec_stats['rr'] = ((dec_stats[gt_field] / by_bin['prob'].count().to_numpy()) * 100).round(3)

    #### Plot the results
    plt.figure(figsize=(6, 6))
    plt.bar(dec_stats['rank_xgb'], dec_stats['rr'], width=0.8, align='center', alpha=0.7)
    plt.axhline(y=avg_resp, color='r', linestyle='-', label='PO: {:.2f}%'.format(avg_resp))
    plt.xlabel('Prediction deciles')
    plt.ylabel('% of positive examples (Response rate)')
    plt.title(title)
    plt.xticks(np.arange(1, n_bins+1))
    plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:.0f}%'))
    plt.legend(loc='upper left')
    plt.show()
    res_dict['outcome_prev'] = avg_resp
    # Highest-scored bin, whatever n_bins is (the key name is kept for existing consumers).
    res_dict['10th_decile_response'] = dec_stats['rr'].iloc[-1]

    if by_age:
        age = features['AgeAtAdmission']
        age_bands = [(65, 70, '65-69'), (70, 75, '70-74'), (75, 80, '75-79'),
                     (80, 85, '80-84'), (85, 90, '85-89')]
        # Anything outside 65-89 (including missing ages) lands in '90+', as before.
        # NOTE(review): presumably the cohort is 65+ so only ages >= 90 end up there - confirm.
        age_group = '90+'
        for lower, upper, label in age_bands:
            age_group = np.where((age >= lower) & (age < upper), label, age_group)
        # Plain array -> positional assignment; rows are in the same order as the predictions.
        val_pr['age_gr'] = age_group

        band_labels = [label for _, _, label in age_bands] + ['90+']
        # Share of each age band within a bin. Reindexing keeps all six bands in a fixed
        # order so colours and legend labels always refer to the same band.
        age_share = (pd.crosstab(val_pr['rank_xgb'], val_pr['age_gr'], normalize='index')
                     .round(3)
                     .reindex(columns=band_labels, fill_value=0))
        ax = age_share.plot(title=age_title, kind='bar', figsize=(6, 6), color=colors, stacked=True)
        ax.legend(title='Age group', labels=age_lb,
                 bbox_to_anchor=(1,1))
        ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.xticks(rotation=0, ha='center')
        plt.xlabel('Prediction deciles')
        plt.ylabel('Age group distribution')
        plt.show()

    return res_dict

In [None]:
risk_dict = rank_prediction_deciles(xgb_m, val_x_m, val_lkup_m[['gt_m', 'age_gr']], 'gt_m', 
                        'Decile analysis in XGB model for in-hospital death.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
tr_dict = res_dict | risk_dict
tr_df = pd.DataFrame.from_dict(tr_dict, orient='index').T

In [None]:
tr_df

### Extended hospital stay

In [None]:
#### Load features while specifying data types for memory efficiency
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Column name -> dtype string, later passed to pd.read_csv(dtype=...).
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
base_path = ''
model_path = ''
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

train_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in train_data.columns]
train_data.columns = [col.replace(',', '_') if ',' in col else col for col in train_data.columns]
val_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in val_data.columns]
val_data.columns = [col.replace(',', '_') if ',' in col else col for col in val_data.columns]

### Shuffle data when using time-series split
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields
train_lkup_es = train_data[[f for f in lkup_fields if f in train_data.columns]]
val_lkup_es = val_data[[f for f in lkup_fields if f in val_data.columns]]
### GT fields
train_y_es = train_data['gt_es_hosp']
val_y_es = val_data['gt_es_hosp']
### XGBoost features
train_x_es = train_data.drop(train_lkup_es.columns.tolist(), axis=1)
val_x_es = val_data.drop(val_lkup_es.columns.tolist(), axis=1)
print('Training features')
print(train_x_es.columns.tolist())
print(train_x_es.shape, val_x_es.shape, train_y_es.shape, val_y_es.shape)
### Create XGBoost objects
train_dm_es = xgb.DMatrix(train_x_es, label=train_y_es)
val_dm_es = xgb.DMatrix(val_x_es, label=val_y_es)

In [None]:
early_stopping = 100
rounds = 20000
pos_weight = round(len(train_y_es[train_y_es==0]) / len(train_y_es[train_y_es==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
print(round(len(train_y_es[train_y_es==1])/ len(train_y_es), 2))
params_def = {
    'max_depth': 4,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
xgb_es, evals = train_optimize_model(train_dm_es, val_dm_es, model_path, 'plos_hosp', 'v1.1.ADM', param_grid=params_def)

In [None]:
xgb_es = xgb.Booster()
xgb_es.load_model('')

In [None]:
res_dict = evaluate_model(train_x_es, val_x_es, train_y_es, val_y_es, xgb_es, evals, 'plos_hosp', pr=True, calib=True,
              train_lkup=train_lkup_es, tgt='gt_es_hosp')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Extended stay (>=14 days)'

In [None]:
get_shap_feature_importance(val_x_es.rename(columns=feature_names), xgb_es, out_path=None, subset='All features model',
                                task='Extended stay', 
                                fn='Extended stay', n_feat=20, plot_type='bar')

In [None]:
risk_dict = rank_prediction_deciles(xgb_es, val_x_es, val_lkup_es[['gt_es_hosp', 'age_gr']], 'gt_es_hosp', 
                        'Decile analysis in XGB model for extended stay.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Extended stay (>=14 days)'

tr_dict = res_dict | risk_dict
tr_df_es = pd.DataFrame.from_dict(tr_dict, orient='index').T

In [None]:
tr_df_es

In [None]:
tr_df

### ICU/HDU admission

In [None]:
#### Load features while specifying data types for memory efficiency
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Column name -> dtype string, later passed to pd.read_csv(dtype=...).
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
base_path = ''
model_path = ''
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

train_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in train_data.columns]
train_data.columns = [col.replace(',', '_') if ',' in col else col for col in train_data.columns]
val_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in val_data.columns]
val_data.columns = [col.replace(',', '_') if ',' in col else col for col in val_data.columns]

### Shuffle data when using time-series split
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields
train_lkup_cc = train_data[[f for f in lkup_fields if f in train_data.columns]]
val_lkup_cc = val_data[[f for f in lkup_fields if f in val_data.columns]]
### GT fields
train_y_cc = train_data['gt_cc']
val_y_cc = val_data['gt_cc']
### XGBoost features
train_x_cc = train_data.drop(train_lkup_cc.columns.tolist(), axis=1)
val_x_cc = val_data.drop(val_lkup_cc.columns.tolist(), axis=1)
print('Training features')
print(train_x_cc.columns.tolist())
print(train_x_cc.shape, val_x_cc.shape, train_y_cc.shape, val_y_cc.shape)
### Create XGBoost objects
train_dm_cc = xgb.DMatrix(train_x_cc, label=train_y_cc)
val_dm_cc = xgb.DMatrix(val_x_cc, label=val_y_cc)

In [None]:
early_stopping = 100
rounds = 20000
pos_weight = round(len(train_y_cc[train_y_cc==0]) / len(train_y_cc[train_y_cc==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
print(round(len(train_y_cc[train_y_cc==1])/ len(train_y_cc), 2))
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
xgb_cc, evals = train_optimize_model(train_dm_cc, val_dm_cc, model_path, 'icu_hdu', 'v1.1.ADM', param_grid=params_def)

In [None]:
xgb_cc = xgb.Booster()
xgb_cc.load_model('')

In [None]:
res_dict = evaluate_model(train_x_cc, val_x_cc, train_y_cc, val_y_cc, xgb_cc, evals, 'icu_hdu', pr=True, calib=True,
              train_lkup=train_lkup_cc, tgt='gt_cc')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'ICU/HDU admission'

In [None]:
get_shap_feature_importance(val_x_cc.rename(columns=feature_names), xgb_cc, out_path=None, subset='All features model',
                                task='ICU/HDU admission', 
                                fn='ICU/HDU admission', n_feat=20, plot_type='bar')

In [None]:
risk_dict = rank_prediction_deciles(xgb_cc, val_x_cc, val_lkup_cc[['gt_cc', 'age_gr']], 'gt_cc', 
                        'Decile analysis in XGB model for ICU/HDU admission.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'ICU/HDU admission'

tr_dict = res_dict | risk_dict
tr_df_cc = pd.DataFrame.from_dict(tr_dict, orient='index').T

In [None]:
tr_df_cc

### Home discharge

In [None]:
#### Load features while specifying data types for memory efficiency
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Column name -> dtype string, later passed to pd.read_csv(dtype=...).
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
base_path = ''
model_path = ''
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

train_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in train_data.columns]
train_data.columns = [col.replace(',', '_') if ',' in col else col for col in train_data.columns]
val_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in val_data.columns]
val_data.columns = [col.replace(',', '_') if ',' in col else col for col in val_data.columns]

### Shuffle data when using time-series split
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields
train_lkup_dd = train_data[[f for f in lkup_fields if f in train_data.columns]]
val_lkup_dd = val_data[[f for f in lkup_fields if f in val_data.columns]]
### GT fields
train_y_dd = train_data['gt_dd']
val_y_dd = val_data['gt_dd']
### XGBoost features
train_x_dd = train_data.drop(train_lkup_dd.columns.tolist(), axis=1)
val_x_dd = val_data.drop(val_lkup_dd.columns.tolist(), axis=1)
print('Training features')
print(train_x_dd.columns.tolist())
print(train_x_dd.shape, val_x_dd.shape, train_y_dd.shape, val_y_dd.shape)
### Create XGBoost objects
train_dm_dd = xgb.DMatrix(train_x_dd, label=train_y_dd)
val_dm_dd = xgb.DMatrix(val_x_dd, label=val_y_dd)

In [None]:
early_stopping = 100
rounds = 20000
pos_weight = round(len(train_y_dd[train_y_dd==0]) / len(train_y_dd[train_y_dd==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
print(round(len(train_y_dd[train_y_dd==1])/ len(train_y_dd), 2))
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
xgb_dd, evals = train_optimize_model(train_dm_dd, val_dm_dd, model_path, 'home_disch', 'v1.1.ADM', param_grid=params_def)

In [None]:
xgb_dd = xgb.Booster()
xgb_dd.load_model('')

In [None]:
res_dict = evaluate_model(train_x_dd, val_x_dd, train_y_dd, val_y_dd, xgb_dd, evals, 'home_disch', pr=True, calib=True,
              train_lkup=train_lkup_dd, tgt='gt_dd')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Home discharge'

In [None]:
get_shap_feature_importance(val_x_dd.rename(columns=feature_names), xgb_dd, out_path=None, subset='All features model',
                                task='Home discharge', 
                                fn='Home discharge', n_feat=20, plot_type='bar')

In [None]:
risk_dict = rank_prediction_deciles(xgb_dd, val_x_dd, val_lkup_dd[['gt_dd', 'age_gr']], 'gt_dd', 
                        'Decile analysis in XGB model for Home discharge.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Home discharge'

tr_dict = res_dict | risk_dict
tr_df_dd = pd.DataFrame.from_dict(tr_dict, orient='index').T

In [None]:
tr_df_dd

### Any rehab

In [None]:
#### Load features while specifying data types for memory efficiency
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Column name -> dtype string, later passed to pd.read_csv(dtype=...).
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
base_path = ''
model_path = ''
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

### Replace '<' and ',' in column names (str.replace is a no-op when the character is absent)
train_data.columns = [col.replace('<', '_below_').replace(',', '_') for col in train_data.columns]
val_data.columns = [col.replace('<', '_below_').replace(',', '_') for col in val_data.columns]

### Shuffle data when using time-series split
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields
### Keep only the lookup fields present in each file, as the other outcome sections do;
### indexing with the full list raises KeyError as soon as one field is missing.
train_lkup_reh = train_data[[f for f in lkup_fields if f in train_data.columns]]
val_lkup_reh = val_data[[f for f in lkup_fields if f in val_data.columns]]
### GT fields
train_y_reh = train_data['gt_rehab']
val_y_reh = val_data['gt_rehab']
### XGBoost features
train_x_reh = train_data.drop(train_lkup_reh.columns.tolist(), axis=1)
val_x_reh = val_data.drop(val_lkup_reh.columns.tolist(), axis=1)
print('Training features')
print(train_x_reh.columns.tolist())
print(train_x_reh.shape, val_x_reh.shape, train_y_reh.shape, val_y_reh.shape)
### Create XGBoost objects
train_dm_reh = xgb.DMatrix(train_x_reh, label=train_y_reh)
val_dm_reh = xgb.DMatrix(val_x_reh, label=val_y_reh)

In [None]:
early_stopping = 100
rounds = 20000
pos_weight = round(len(train_y_reh[train_y_reh==0]) / len(train_y_reh[train_y_reh==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
print(round(len(train_y_reh[train_y_reh==1])/ len(train_y_reh), 2))
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
xgb_reh, evals = train_optimize_model(train_dm_reh, val_dm_reh, model_path, 'rehab', 'v1.1.ADM', param_grid=params_def)

In [None]:
xgb_reh = xgb.Booster()
xgb_reh.load_model('')

In [None]:
res_dict = evaluate_model(train_x_reh, val_x_reh, train_y_reh, val_y_reh, xgb_reh, evals, 'rehab', pr=True, calib=True,
              train_lkup=train_lkup_reh, tgt='gt_rehab')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Received rehabilitation'

In [None]:
get_shap_feature_importance(val_x_reh.rename(columns=feature_names), xgb_reh, out_path=None, subset='All features model',
                                task='Received rehabilitation', 
                                fn='Received rehabilitation', n_feat=20, plot_type='bar')

In [None]:
risk_dict = rank_prediction_deciles(xgb_reh, val_x_reh, val_lkup_reh[['gt_rehab', 'age_gr']], 'gt_rehab', 
                        'Decile analysis in XGB model for future rehabilitation.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Received rehabilitation'

tr_dict = res_dict | risk_dict
tr_df_reh = pd.DataFrame.from_dict(tr_dict, orient='index').T

### Admission to MoE

In [None]:
#### Load features while specifying data types for memory efficiency
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
# Column name -> dtype string, later passed to pd.read_csv(dtype=...).
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

In [None]:
base_path = ''
model_path = ''
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

train_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in train_data.columns]
train_data.columns = [col.replace(',', '_') if ',' in col else col for col in train_data.columns]
val_data.columns = [col.replace('<', '_below_') if '<' in col else col for col in val_data.columns]
val_data.columns = [col.replace(',', '_') if ',' in col else col for col in val_data.columns]

### Shuffle data when using time-series split
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)
### Lookup fields
train_lkup_moe = train_data[[f for f in lkup_fields if f in train_data.columns]]
val_lkup_moe = val_data[[f for f in lkup_fields if f in val_data.columns]]
### GT fields
train_y_moe = train_data['gt_eld']
val_y_moe = val_data['gt_eld']
### XGBoost features
train_x_moe = train_data.drop(train_lkup_moe.columns.tolist(), axis=1)
val_x_moe = val_data.drop(val_lkup_moe.columns.tolist(), axis=1)
print('Training features')
print(train_x_moe.columns.tolist())
print(train_x_moe.shape, val_x_moe.shape, train_y_moe.shape, val_y_moe.shape)
### Create XGBoost objects
train_dm_moe = xgb.DMatrix(train_x_moe, label=train_y_moe)
val_dm_moe = xgb.DMatrix(val_x_moe, label=val_y_moe)

In [None]:
early_stopping = 100
rounds = 20000
pos_weight = round(len(train_y_moe[train_y_moe==0]) / len(train_y_moe[train_y_moe==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
print(round(len(train_y_moe[train_y_moe==1])/ len(train_y_moe), 2))
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
xgb_moe, evals = train_optimize_model(train_dm_moe, val_dm_moe, model_path, 'moe', 'v1.1.ADM', param_grid=params_def)

In [None]:
xgb_moe = xgb.Booster()
xgb_moe.load_model('')

In [None]:
res_dict = evaluate_model(train_x_moe, val_x_moe, train_y_moe, val_y_moe, xgb_moe, evals, 'eld', pr=True, calib=True,
              train_lkup=train_lkup_moe, tgt='gt_eld')
res_dict['timepoint'] = 'Hospital admission'
res_dict['target'] = 'Admission to MoE'

In [None]:
get_shap_feature_importance(val_x_moe.rename(columns=feature_names), xgb_moe, out_path=None, subset='All features model',
                                task='Admission to MoE', 
                                fn='Admission to MoE', n_feat=20, plot_type='bar')

In [None]:
risk_dict = rank_prediction_deciles(xgb_moe, val_x_moe, val_lkup_moe[['gt_eld', 'age_gr']], 'gt_eld', 
                        'Decile analysis in XGB model for Admission to MoE.',
                       by_age=True, 
                        age_title='Age-stratified decile analysis for predicted patients with outcome.')

In [None]:
tr_dict = res_dict | risk_dict
tr_df_moe = pd.DataFrame.from_dict(tr_dict, orient='index').T

In [None]:
tr_df_all = pd.concat([tr_df, tr_df_es, tr_df_cc, tr_df_dd, tr_df_moe, tr_df_reh], axis=0)

In [None]:
tr_prev = pd.read_csv('')
tr_full = pd.concat([tr_prev, tr_df_all], axis=0)

In [None]:
tr_full

In [None]:
# NOTE(review): this writes tr_df_all (results of the current run only). tr_full, which also
# contains the previously saved rows from tr_prev, is built and displayed in the cells shown
# but is not written there - confirm which table is meant to be persisted.
tr_df_all.to_csv('', index=False)

In [None]:
val_x_m.columns

In [None]:
# 'age_gr' is a lookup field, not a model feature, yet some validation frames can carry it by
# the time this cell runs. Strip it from every frame; errors='ignore' makes the drop a no-op
# where the column is absent, so the cell no longer depends on which frames happen to have it.
roc_pr_frames = [frame.drop(columns=['age_gr'], errors='ignore')
                 for frame in (val_x_m, val_x_es, val_x_cc, val_x_dd, val_x_moe, val_x_reh)]
get_roc_pr_summary([xgb_m, xgb_es, xgb_cc, xgb_dd, xgb_moe, xgb_reh],
                   roc_pr_frames,
                   [val_y_m, val_y_es, val_y_cc, val_y_dd, val_y_moe, val_y_reh], 
                   md_lb=['In-hospital death','Extended stay', 'ICU/HDU admission', 'Home discharge', 'Admission to MoE', 'Rehabilitation'],
                  colors=['#d7191c', '#fdae61', '#abd9e9', '#2c7bb6', '#ed849e', '#810f7c'])