In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
from statistics import mean, median
from matplotlib.dates import DateFormatter
from matplotlib.gridspec import GridSpec
from datetime import timedelta
from datetime import datetime
from tqdm import tqdm
from scipy.interpolate import interp1d
import gc
import os

from confidenceinterval import roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, roc_curve, precision_recall_curve, auc, f1_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibrationDisplay, calibration_curve
from sklearn.model_selection import StratifiedKFold, KFold
import shap
import xgboost as xgb
from xgboost import cv

#### Regression K-fold performance

In [None]:
# Columns set aside as lookup fields; every other column becomes a model feature.
lkup_fields = [
    'ppid', 'EpisodeNumber', 'AdmissionDate', 'ED_adate_dt', 'IndexAttDate',
    'HOSP_adt', 'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

#### Load features while specifying data types for memory efficiency
# Two-column (item, dtype) table; the first row of the file is skipped.
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype mapping — confirm
# that leaving the training file to dtype inference is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

# Sanitise column names: '<' becomes '_below_' and ',' becomes '_'.
train_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in train_data.columns]
val_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in val_data.columns]

### Shuffle all rows with a fixed seed and renumber the index from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
train_lkup_cts = train_data[lkup_fields]
val_lkup_cts = val_data[lkup_fields]

### GT fields (regression target)
train_y = train_data['total_count_all_tf']
val_y = val_data['total_count_all_tf']

### XGBoost features: everything that is not a lookup field
train_x = train_data.drop(columns=lkup_fields)
val_x = val_data.drop(columns=lkup_fields)

# Train and validation rows are stacked so the K-fold routine can re-split them itself.
all_x = pd.concat([train_x, val_x], axis=0)
all_y = pd.concat([train_y, val_y], axis=0)
print(train_x.columns.tolist())
print(train_x.shape, val_x.shape, train_y.shape, val_y.shape)

In [None]:
# Boosting budget for the regression run.
# NOTE(review): `early_stopping` is defined here but get_reg_kfold_performance takes no such argument.
early_stopping = 50
rounds = 2000

# XGBoost settings for the regression model.
# Disabled options kept for reference:
#   'eval_metric': 'mape', 'colsample_bytree': .5, 'alpha': 1, 'lambda': 2,
#   'scale_pos_weight': pos_weight (for imbalanced data), 'subsample': .6
params_def = dict(
    max_depth=3,
    objective='reg:pseudohubererror',
    nthread=25,
    eta=0.01,
)

In [None]:
# Use a 12pt, normal-weight serif font for every figure drawn after this cell.
plt.rcParams.update({
    'font.size': 12,
    'font.weight': 'normal',
    'font.family': 'serif',
})

In [None]:
def bootstrap_metric(labels_true, labels_pred, metric_func, n_iter=1000, random_state=None):
    """Bootstrap the sampling distribution of a metric.

    Draws `n_iter` resamples (with replacement, same size as the input) of the
    paired observations and evaluates `metric_func` on each resample.

    Parameters
    ----------
    labels_true, labels_pred : array-like
        Paired ground-truth and predicted values of equal length. They are
        converted with ``np.asarray`` so lists and pandas Series are resampled
        by position rather than by index label.
    metric_func : callable
        Called as ``metric_func(sample_true, sample_pred)``; must return a scalar.
    n_iter : int, default 1000
        Number of bootstrap resamples.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When None the global
        NumPy random state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_iter,)`` holding the metric of each resample.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(labels_true)
    y_pred = np.asarray(labels_pred)
    n = len(y_true)
    if n == 0:
        raise ValueError('bootstrap_metric requires at least one observation')
    if len(y_pred) != n:
        raise ValueError(
            f'labels_true and labels_pred differ in length ({n} vs {len(y_pred)})')

    # Both the np.random module and a RandomState instance expose randint().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    res = np.zeros(n_iter)
    for i in range(n_iter):
        ind = rng.randint(0, n, n)  # n positions sampled with replacement
        res[i] = metric_func(y_true[ind], y_pred[ind])
    return res

def compute_ci(bootstrap_res, ci=0.95):
    """Return the two-sided percentile interval of `bootstrap_res`.

    The interval leaves ``(1 - ci) / 2`` of the mass in each tail; both
    bounds are rounded to three decimals and returned as a length-2 array.
    """
    tail = (1 - ci) / 2
    percent_bounds = [tail * 100, (1 - tail) * 100]
    interval = np.percentile(bootstrap_res, percent_bounds)
    return np.round(interval, 3)

def rmse(labels_true, labels_pred):
    """Root mean squared error: the square root of sklearn's mean_squared_error."""
    squared_error = mean_squared_error(labels_true, labels_pred)
    return np.sqrt(squared_error)

def mae(labels_true, labels_pred):
    """Mean absolute error, delegated to sklearn's mean_absolute_error."""
    absolute_error = mean_absolute_error(labels_true, labels_pred)
    return absolute_error

def mape(labels_true, labels_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(2 * |t - p| / (|t| + |p|)) * 100``. Despite the name this
    is the symmetric variant (sMAPE), not the classic MAPE.

    Pairs where truth and prediction are both zero have a zero denominator;
    they are counted as zero error instead of producing NaN, since the
    prediction is exact there.

    Parameters
    ----------
    labels_true, labels_pred : array-like
        Paired ground-truth and predicted values; converted to float arrays.

    Returns
    -------
    float
        The sMAPE in percent.
    """
    y_true = np.asarray(labels_true, dtype=float)
    y_pred = np.asarray(labels_pred, dtype=float)
    numer = 2.0 * np.abs(y_true - y_pred)
    denom = np.abs(y_true) + np.abs(y_pred)
    # denom == 0 only when both values are 0, where the numerator is 0 as well.
    ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)
    return np.mean(ratio) * 100

def mape_c(labels_true, labels_pred):
    """Mean absolute percentage error over observations with a non-zero truth.

    Observations whose true value is zero are excluded, because the relative
    error is undefined there.

    Parameters
    ----------
    labels_true, labels_pred : array-like
        Paired ground-truth and predicted values. They are converted to float
        arrays so lists work and pandas Series are paired by position.

    Returns
    -------
    float
        MAPE in percent, or NaN when every true value is zero.
    """
    y_true = np.asarray(labels_true, dtype=float)
    y_pred = np.asarray(labels_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        # Nothing to average; return NaN explicitly rather than via an empty-mean warning.
        return np.nan
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [None]:
def get_reg_kfold_performance(all_x, all_y, rounds, params_def, folds=10):
    """Print K-fold cross-validated regression metrics for an XGBoost regressor.

    For each fold an ``xgb.XGBRegressor`` is fitted on the training split and
    scored on the held-out split with RMSE, MAE and MAPE (``mape_c``), each
    with a bootstrap 95% confidence interval. The mean and standard deviation
    of the per-fold scores are printed at the end.

    Parameters
    ----------
    all_x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    all_y : pandas.Series
        Target values aligned row-for-row with `all_x`.
    rounds : int
        Number of boosting rounds (``n_estimators``).
    params_def : dict
        Must contain ``'max_depth'``, ``'eta'``, ``'nthread'`` and
        ``'objective'``; other keys are ignored.
    folds : int, default 10
        Number of shuffled K-fold splits (seeded with 42).

    Returns
    -------
    None
        Results are printed only.
    """
    # Report the actual fold count instead of a hard-coded 10.
    print(f'Cross-validated regression over {folds} folds (care intensity).')
    # Named `splitter` so the module-level `cv` imported from xgboost is not shadowed.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=42)
    model = xgb.XGBRegressor(n_estimators=rounds,
                             max_depth=params_def['max_depth'],
                             learning_rate=params_def['eta'],
                             n_jobs=params_def['nthread'],
                             objective=params_def['objective'],
                             random_state=42)
    rmses, maes, mapes = [], [], []
    # tqdm wraps the splitter (not enumerate) and gets a total so the bar shows progress.
    for i, (train, test) in enumerate(tqdm(splitter.split(all_x, all_y), total=folds)):
        print(f'Evaluating fold {i+1}')
        print('-----------------------')
        model.fit(all_x.iloc[train], all_y.iloc[train])
        labels_val = np.array(all_y.iloc[test].values)
        labels_pred_val = model.predict(all_x.iloc[test])

        # Bootstrap confidence intervals on the held-out fold.
        rmse_ci = compute_ci(bootstrap_metric(labels_val, labels_pred_val, rmse))
        mae_ci = compute_ci(bootstrap_metric(labels_val, labels_pred_val, mae))
        mape_ci = compute_ci(bootstrap_metric(labels_val, labels_pred_val, mape_c))

        # Point estimates on the full held-out fold.
        rmse_val = round(rmse(labels_val, labels_pred_val), 3)
        mae_val = round(mae(labels_val, labels_pred_val), 3)
        mape_val = round(mape_c(labels_val, labels_pred_val), 3)
        print(f'RMSE: {rmse_val}, 95% CI: {rmse_ci}')
        print(f'MAE: {mae_val}, 95% CI: {mae_ci}')
        print(f'MAPE: {mape_val}, 95% CI: {mape_ci}')
        print('-----------------------')
        rmses.append(rmse_val)
        maes.append(mae_val)
        mapes.append(mape_val)

    # Summary across folds: mean and (population) standard deviation.
    mean_rmse, std_rmse = np.mean(rmses), np.std(rmses)
    mean_mae, std_mae = np.mean(maes), np.std(maes)
    mean_mape, std_mape = np.mean(mapes), np.std(mapes)
    print('Overall scores')
    print(r'Mean RMSE=%0.3f $\pm$ %0.3f'%(mean_rmse, std_rmse))
    print(r'Mean MAE=%0.3f $\pm$ %0.3f'%(mean_mae, std_mae))
    print(r'Mean MAPE=%0.3f $\pm$ %0.3f'%(mean_mape, std_mape))

In [None]:
# Run the cross-validated regression evaluation with the default number of folds.
get_reg_kfold_performance(
    all_x=all_x,
    all_y=all_y,
    rounds=rounds,
    params_def=params_def,
)

#### Validate on in-hospital death

In [None]:
#### Load features while specifying data types for memory efficiency
# Two-column (item, dtype) table; the first row of the file is skipped.
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

# Columns set aside as lookup fields; every other column becomes a model feature.
lkup_fields = [
    'ppid', 'EpisodeNumber', 'AdmissionDate', 'ED_adate_dt', 'IndexAttDate',
    'HOSP_adt', 'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype mapping — confirm
# that leaving the training file to dtype inference is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

# Sanitise column names: '<' becomes '_below_' and ',' becomes '_'.
train_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in train_data.columns]
val_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in val_data.columns]

### Shuffle all rows with a fixed seed and renumber the index from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
train_lkup_m = train_data[lkup_fields]
val_lkup_m = val_data[lkup_fields]

### GT fields (binary target for this section)
train_y = train_data['gt_m']
val_y = val_data['gt_m']

### XGBoost features: everything that is not a lookup field
train_x = train_data.drop(columns=lkup_fields)
val_x = val_data.drop(columns=lkup_fields)

# Train and validation rows are stacked so the K-fold routine can re-split them itself.
all_x = pd.concat([train_x, val_x], axis=0)
all_y = pd.concat([train_y, val_y], axis=0)
print(train_x.columns.tolist())
print(train_x.shape, val_x.shape, train_y.shape, val_y.shape)

In [None]:
early_stopping = 100
rounds = 2000
# Ratio of negative to positive training labels, computed from `train_y` as loaded for
# this section. (`train_y_m` was never defined in this notebook, so the previous code
# raised NameError on a fresh kernel.)
pos_weight = round(len(train_y[train_y==0]) / len(train_y[train_y==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
# Share of positive labels in the training target.
print(round(len(train_y[train_y==1])/ len(train_y), 2))
# XGBoost settings for the binary classifier; the commented entries are disabled options.
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
# Re-apply the 12pt, normal-weight serif font settings for the figures that follow.
plt.rcParams.update({
    'font.size': 12,
    'font.weight': 'normal',
    'font.family': 'serif',
})

In [None]:
def get_roc_kfold_performance(all_x, all_y, rounds, params_def, early_stopping, folds=10,
                              roc_title='Cross-validated ROC Curve over 10 folds (in-hospital death).',
                             colors=['#543005', '#8c510a','#bf812d', '#dfc27d', '#f6e8c3',
                                    '#c7eae5', '#80cdc1', '#35978f', '#01665e', '#003c30']):
    """Plot per-fold and mean ROC curves from stratified K-fold cross-validation.

    For each fold an ``xgb.XGBClassifier`` is fitted on the training split and
    the positive-class probability is scored on the held-out split. Each fold's
    ROC curve is drawn with its AUC and 95% confidence interval; the mean curve
    (interpolated on a common FPR grid) and a +/- 1 standard deviation band are
    added on top.

    Parameters
    ----------
    all_x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    all_y : pandas.Series
        Binary target aligned row-for-row with `all_x`; 1 is the positive class.
    rounds : int
        Number of boosting rounds (``n_estimators``).
    params_def : dict
        Must contain ``'max_depth'``, ``'eta'``, ``'nthread'``,
        ``'eval_metric'`` and ``'objective'``; other keys are ignored.
    early_stopping : int
        Accepted for call compatibility; not used by this function.
    folds : int, default 10
        Number of stratified, shuffled splits (seeded with 42).
    roc_title : str
        Figure title. The default text mentions 10 folds; pass a matching
        title when `folds` differs.
    colors : sequence of str
        Line colours for the fold curves; reused cyclically when there are
        more folds than colours.

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and fold labels are printed.
    """
    # Named `splitter` so the module-level `cv` imported from xgboost is not shadowed.
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    model = xgb.XGBClassifier(n_estimators=rounds,
                              max_depth=params_def['max_depth'],
                              learning_rate=params_def['eta'],
                              n_jobs=params_def['nthread'],
                              eval_metric=params_def['eval_metric'],
                              objective=params_def['objective'],
                              random_state=42)
    tprs, aucs = [], []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots(figsize=(6, 6))
    # Legend handles and labels are collected in lock-step so each label is bound to
    # its own artist rather than matched by draw order.
    handles, lbl_list = [], []
    for i, (train, test) in enumerate(tqdm(splitter.split(all_x, all_y), total=folds)):
        model.fit(all_x.iloc[train], all_y.iloc[train])
        labels_val = np.array(all_y.iloc[test].values)
        labels_pred_val = model.predict_proba(all_x.iloc[test])[:, 1]
        aucss, ci = roc_auc_score(labels_val, labels_pred_val, confidence_level=0.95)
        fpr_val, tpr_val, _ = roc_curve(labels_val, labels_pred_val, pos_label=1)
        label = f'Fold {i+1} (AUC={aucss:.2f}, 95% CI:[{ci[0]:.2f}, {ci[1]:.2f}])'
        print(label)
        display = RocCurveDisplay(fpr=fpr_val, tpr=tpr_val, roc_auc=aucss, estimator_name=label)
        # Cycle through the palette so more folds than colours does not raise IndexError.
        display.plot(ax=ax, color=colors[i % len(colors)])
        handles.append(display.line_)
        lbl_list.append(label)
        # Resample the fold's curve on the shared FPR grid so curves can be averaged.
        interp_tpr = np.interp(mean_fpr, fpr_val, tpr_val)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(aucss)

    # Chance diagonal; intentionally left out of the legend.
    ax.plot([0,1], [0,1], color='navy', lw=2, linestyle='--', alpha=0.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    mean_label = r'Mean ROC (AUC=%0.3f $\pm$ %0.3f)' % (mean_auc, std_auc)
    mean_line, = ax.plot(mean_fpr, mean_tpr, color='r', label=mean_label, lw=2, alpha=0.8)
    handles.append(mean_line)
    lbl_list.append(mean_label)

    # Band of +/- 1 standard deviation around the mean curve, clipped to [0, 1].
    std_tpr = np.std(tprs,axis=0)
    tprs_u = np.minimum(mean_tpr + std_tpr, 1)
    tprs_l = np.maximum(mean_tpr - std_tpr, 0)
    band_label = r'$\pm$ 1 std. dev'
    band = ax.fill_between(mean_fpr, tprs_l, tprs_u, color='grey', alpha=0.5, label=band_label)
    handles.append(band)
    lbl_list.append(band_label)

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=roc_title)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(handles=handles, labels=lbl_list, loc='lower right')
    plt.show()

In [None]:
# Cross-validated ROC for in-hospital death, using the function's default title.
get_roc_kfold_performance(
    all_x=all_x,
    all_y=all_y,
    rounds=rounds,
    params_def=params_def,
    early_stopping=early_stopping,
)

#### Validate on extended stay

In [None]:
#### Load features while specifying data types for memory efficiency
# Two-column (item, dtype) table; the first row of the file is skipped.
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

# Columns set aside as lookup fields; every other column becomes a model feature.
lkup_fields = [
    'ppid', 'EpisodeNumber', 'AdmissionDate', 'ED_adate_dt', 'IndexAttDate',
    'HOSP_adt', 'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype mapping — confirm
# that leaving the training file to dtype inference is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

# Sanitise column names: '<' becomes '_below_' and ',' becomes '_'.
train_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in train_data.columns]
val_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in val_data.columns]

### Shuffle all rows with a fixed seed and renumber the index from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
# NOTE(review): these reuse the `_m` names from the in-hospital death section and overwrite them.
train_lkup_m = train_data[lkup_fields]
val_lkup_m = val_data[lkup_fields]

### GT fields (binary target for this section)
train_y = train_data['gt_es_hosp']
val_y = val_data['gt_es_hosp']

### XGBoost features: everything that is not a lookup field
train_x = train_data.drop(columns=lkup_fields)
val_x = val_data.drop(columns=lkup_fields)

# Train and validation rows are stacked so the K-fold routine can re-split them itself.
all_x = pd.concat([train_x, val_x], axis=0)
all_y = pd.concat([train_y, val_y], axis=0)
print(train_x.columns.tolist())
print(train_x.shape, val_x.shape, train_y.shape, val_y.shape)

In [None]:
early_stopping = 100
rounds = 2000
# Ratio of negative to positive training labels, computed from `train_y` as loaded for
# this section. (`train_y_m` was never defined in this notebook, so the previous code
# raised NameError on a fresh kernel.)
pos_weight = round(len(train_y[train_y==0]) / len(train_y[train_y==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
# Share of positive labels in the training target.
print(round(len(train_y[train_y==1])/ len(train_y), 2))
# XGBoost settings for the binary classifier; the commented entries are disabled options.
params_def = {
    'max_depth': 4,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
# Cross-validated ROC for the extended-stay target.
get_roc_kfold_performance(
    all_x=all_x,
    all_y=all_y,
    rounds=rounds,
    params_def=params_def,
    early_stopping=early_stopping,
    roc_title='Cross-validated ROC Curve over 10 folds (extended stay).',
)

#### Validate on ICU/HDU admission

In [None]:
#### Load features while specifying data types for memory efficiency
# Two-column (item, dtype) table; the first row of the file is skipped.
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

# Columns set aside as lookup fields; every other column becomes a model feature.
lkup_fields = [
    'ppid', 'EpisodeNumber', 'AdmissionDate', 'ED_adate_dt', 'IndexAttDate',
    'HOSP_adt', 'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype mapping — confirm
# that leaving the training file to dtype inference is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

# Sanitise column names: '<' becomes '_below_' and ',' becomes '_'.
train_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in train_data.columns]
val_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in val_data.columns]

### Shuffle all rows with a fixed seed and renumber the index from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
# NOTE(review): these reuse the `_m` names from the in-hospital death section and overwrite them.
train_lkup_m = train_data[lkup_fields]
val_lkup_m = val_data[lkup_fields]

### GT fields (binary target for this section)
train_y = train_data['gt_cc']
val_y = val_data['gt_cc']

### XGBoost features: everything that is not a lookup field
train_x = train_data.drop(columns=lkup_fields)
val_x = val_data.drop(columns=lkup_fields)

# Train and validation rows are stacked so the K-fold routine can re-split them itself.
all_x = pd.concat([train_x, val_x], axis=0)
all_y = pd.concat([train_y, val_y], axis=0)
print(train_x.columns.tolist())
print(train_x.shape, val_x.shape, train_y.shape, val_y.shape)

In [None]:
early_stopping = 100
rounds = 2000
# Ratio of negative to positive training labels, computed from `train_y` as loaded for
# this section. (`train_y_m` was never defined in this notebook, so the previous code
# raised NameError on a fresh kernel.)
pos_weight = round(len(train_y[train_y==0]) / len(train_y[train_y==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
# Share of positive labels in the training target.
print(round(len(train_y[train_y==1])/ len(train_y), 2))
# XGBoost settings for the binary classifier; the commented entries are disabled options.
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
# Cross-validated ROC for the ICU/HDU admission target.
get_roc_kfold_performance(
    all_x=all_x,
    all_y=all_y,
    rounds=rounds,
    params_def=params_def,
    early_stopping=early_stopping,
    roc_title='Cross-validated ROC Curve over 10 folds (ICU/HDU admission).',
)

#### Validate on home discharge

In [None]:
#### Load features while specifying data types for memory efficiency
# Two-column (item, dtype) table; the first row of the file is skipped.
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

# Columns set aside as lookup fields; every other column becomes a model feature.
lkup_fields = [
    'ppid', 'EpisodeNumber', 'AdmissionDate', 'ED_adate_dt', 'IndexAttDate',
    'HOSP_adt', 'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]

base_path = ''
model_path = ''
# NOTE(review): only the validation file is read with the explicit dtype mapping — confirm
# that leaving the training file to dtype inference is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

# Sanitise column names: '<' becomes '_below_' and ',' becomes '_'.
train_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in train_data.columns]
val_data.columns = [name.replace('<', '_below_').replace(',', '_') for name in val_data.columns]

### Shuffle all rows with a fixed seed and renumber the index from 0
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
# NOTE(review): these reuse the `_m` names from the in-hospital death section and overwrite them.
train_lkup_m = train_data[lkup_fields]
val_lkup_m = val_data[lkup_fields]

### GT fields (binary target for this section)
train_y = train_data['gt_dd']
val_y = val_data['gt_dd']

### XGBoost features: everything that is not a lookup field
train_x = train_data.drop(columns=lkup_fields)
val_x = val_data.drop(columns=lkup_fields)

# Train and validation rows are stacked so the K-fold routine can re-split them itself.
all_x = pd.concat([train_x, val_x], axis=0)
all_y = pd.concat([train_y, val_y], axis=0)
print(train_x.columns.tolist())
print(train_x.shape, val_x.shape, train_y.shape, val_y.shape)

In [None]:
early_stopping = 100
rounds = 2000
# Ratio of negative to positive training labels, computed from `train_y` as loaded for
# this section. (`train_y_m` was never defined in this notebook, so the previous code
# raised NameError on a fresh kernel.)
pos_weight = round(len(train_y[train_y==0]) / len(train_y[train_y==1]), 3)
print('Weight scale parameter for imbalanced data:', pos_weight)
# Share of positive labels in the training target.
print(round(len(train_y[train_y==1])/ len(train_y), 2))
# XGBoost settings for the binary classifier; the commented entries are disabled options.
params_def = {
    'max_depth': 3,
    'objective': 'binary:logistic',
    'nthread': 25,
    'eval_metric': 'logloss',
    'eta': .01,
    #'colsample_bytree': .5,
    #'alpha': 1
    #'lambda': 2
    ### For imbalanced data
    #'scale_pos_weight': pos_weight
    #'subsample': .6
}

In [None]:
# Cross-validated ROC for the home-discharge target.
get_roc_kfold_performance(
    all_x=all_x,
    all_y=all_y,
    rounds=rounds,
    params_def=params_def,
    early_stopping=early_stopping,
    roc_title='Cross-validated ROC Curve over 10 folds (home discharge).',
)