In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
from statistics import mean, median
from matplotlib.dates import DateFormatter
from matplotlib.gridspec import GridSpec
from datetime import timedelta
from datetime import datetime
from tqdm import tqdm
from scipy.interpolate import interp1d
import gc
import os

from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc, f1_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.calibration import CalibrationDisplay, calibration_curve
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import shap
import xgboost as xgb
from xgboost import cv

#### Compare diagnostic quality across various regression models

In [None]:
#### Columns kept for lookup / ground truth; they are removed from the model features below
lkup_fields = [
    'ppid', 'EpisodeNumber',
    'AdmissionDate', 'ED_adate_dt', 'IndexAttDate', 'HOSP_adt',
    'DischargeDate', 'HOSP_ddt', 'breq_dt', 'HOSP_FCC_dt', 'HOSP_FAS_dt',
    'gt_m', 'gt_cc', 'gt_es_hosp', 'gt_dd',
    'total_count_all', 'total_count_rehab', 'total_count_all_tf',
    'total_n_disciplines', 'total_count_ooh_all', 'total_n_disciplines_gr',
    'age_gr', 'total_count_cts_gr',
]


def _clean_column_names(columns):
    """Return column names with '<' spelled out as '_below_' and ',' replaced by '_'."""
    return [name.replace('<', '_below_').replace(',', '_') for name in columns]


#### Column -> dtype mapping read from a two-column CSV (header row skipped)
dem_types = pd.read_csv('', names=['item', 'dtype'], skiprows=1)
dtype_dict = dict(zip(dem_types['item'], dem_types['dtype']))

base_path = ''
model_path = ''
# NOTE(review): only the validation file is parsed with the explicit dtype mapping;
# the training file relies on pandas' type inference — confirm this is intended.
train_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True)
val_data = pd.read_csv(os.path.join(base_path, ''), low_memory=True, dtype=dtype_dict)

train_data.columns = _clean_column_names(train_data.columns)
val_data.columns = _clean_column_names(val_data.columns)

### Shuffle rows with a fixed seed and rebuild a clean 0..n-1 index
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Lookup fields
train_lkup_cts = train_data[lkup_fields]
val_lkup_cts = val_data[lkup_fields]

### GT fields
train_y_cts = train_data['total_count_all_tf']
val_y_cts = val_data['total_count_all_tf']

### Model features: everything that is not a lookup / ground-truth column
train_x_cts = train_data.drop(columns=lkup_fields)
val_x_cts = val_data.drop(columns=lkup_fields)
print('Training features')
print(train_x_cts.columns.tolist())
print(train_x_cts.shape, val_x_cts.shape, train_y_cts.shape, val_y_cts.shape)

### XGBoost containers and the previously trained booster
train_dm_cts = xgb.DMatrix(train_x_cts, label=train_y_cts)
val_dm_cts = xgb.DMatrix(val_x_cts, label=val_y_cts)
xgb_cts = xgb.Booster()
xgb_cts.load_model('')

In [None]:
#### Create model set

# Seed for the stochastic estimators so that repeated runs of the comparison
# produce the same fitted trees / forests.
RANDOM_STATE = 42

# Display name -> estimator. 'XGBoost' is the already-trained booster loaded above;
# every other entry is an unfitted scikit-learn regressor.
regs = {
    'OLS Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Elastic Net': ElasticNet(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'XGBoost': xgb_cts
}

# Hyper-parameter grids for the grid search.
# `copy_X` is deliberately NOT searched: it does not change the fitted model, it
# only doubles the number of fits, and copy_X=False allows the estimator to
# overwrite the training matrix that the other models are fitted on afterwards.
ridge_params = {'fit_intercept': [True, False],
                'solver': ['auto']}
dtr_params = {'criterion': ['absolute_error', 'friedman_mse', 'poisson'], 'splitter': ['best', 'random'],
              'min_samples_split': [2, 3, 5, 10], 'max_features': ['sqrt', 'log2']}
rf_params = {'n_estimators': [5, 10, 15, 20],  'criterion': ['absolute_error', 'friedman_mse', 'poisson'],
             'min_samples_split': [2, 3, 5, 10],
             'max_features': ['sqrt', 'log2']}
el_params = {'fit_intercept': [True, False], 'alpha': [0.001, 0.01, 0.1, 1.0],
             'l1_ratio': [0.001, 0.1, 0.5, 1.0]}

# One hex colour per entry of `regs`
colors = ['#a6cee3', '#1f78b4', '#cab2d6', '#ff7f00', '#fb9a99', '#e31a1c']

In [None]:
# Set global matplotlib text defaults (12 pt, normal weight, serif family) for figures drawn after this cell.
plt.rcParams.update({'font.size':12, 'font.weight':'normal', 'font.family':'serif'})

In [None]:
def bootstrap_metric(labels_true, labels_pred, metric_func, n_iter=1000, random_state=None):
    """Bootstrap the sampling distribution of a metric.

    Draws `n_iter` resamples (with replacement, same size as the input) of the
    paired (true, predicted) values and evaluates `metric_func` on each one.

    Parameters
    ----------
    labels_true, labels_pred : array-like of equal length
        Ground-truth and predicted values. Both are converted to NumPy arrays so
        that resampling is always positional; a pandas Series with a
        non-default index is therefore handled correctly.
    metric_func : callable(sample_true, sample_pred) -> float
        Metric evaluated on every resample (receives NumPy arrays).
    n_iter : int, default 1000
        Number of bootstrap resamples.
    random_state : int or None, default None
        Seed for a private random generator. With None the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    numpy.ndarray of shape (n_iter,)
        Metric value for each resample.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    y_true = np.asarray(labels_true)
    y_pred = np.asarray(labels_pred)
    n = len(y_true)
    if n != len(y_pred):
        raise ValueError(
            f'labels_true and labels_pred must have the same length ({n} != {len(y_pred)})')
    if n == 0:
        raise ValueError('cannot bootstrap an empty sample')

    # Fall back to the module-level generator when no seed is given so that an
    # earlier np.random.seed(...) call keeps its effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    res = np.zeros(n_iter)
    for i in range(n_iter):
        ind = rng.randint(0, n, n)
        res[i] = metric_func(y_true[ind], y_pred[ind])
    return res

def compute_ci(bootstrap_res, ci=0.95):
    """Percentile confidence interval of a set of bootstrap results.

    Returns a two-element array `[lower, upper]`, rounded to 3 decimals, that
    cuts off (1 - ci) / 2 of the distribution in each tail.
    """
    tail = (1 - ci) / 2
    percent_bounds = [tail * 100, (1 - tail) * 100]
    return np.percentile(bootstrap_res, percent_bounds).round(3)

def rmse(labels_true, labels_pred):
    """Root mean squared error between the true and predicted values."""
    squared_error = mean_squared_error(labels_true, labels_pred)
    return np.sqrt(squared_error)

def mae(labels_true, labels_pred):
    """Mean absolute error between the true and predicted values."""
    abs_error = mean_absolute_error(y_true=labels_true, y_pred=labels_pred)
    return abs_error

def mape(labels_true, labels_pred):
    """Symmetric mean absolute percentage error (sMAPE), in percent.

    Despite the name, each term is 2 * |true - pred| / (|true| + |pred|), i.e.
    the symmetric variant of MAPE. A pair in which both the true and the
    predicted value are zero is a perfect prediction and contributes 0 instead
    of producing a 0/0 NaN that would poison the mean.

    Parameters
    ----------
    labels_true, labels_pred : array-like of equal shape
        Ground-truth and predicted values (lists, arrays or Series).

    Returns
    -------
    float
        Mean of the per-sample symmetric percentage errors, times 100.
    """
    y_true = np.asarray(labels_true, dtype=float)
    y_pred = np.asarray(labels_pred, dtype=float)
    numerator = 2 * np.abs(y_true - y_pred)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # The denominator is zero only when both values are zero, where the error is 0.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)
    return np.mean(ratio) * 100

def mape_c(labels_true, labels_pred):
    """Mean absolute percentage error over samples with a non-zero target, in percent.

    Samples whose true value is zero are excluded because their percentage
    error is undefined. Inputs are converted to NumPy arrays first, so the
    mask is applied by position; lists, arrays and pandas Series (whatever
    their index) are all accepted.

    Parameters
    ----------
    labels_true, labels_pred : array-like of equal length
        Ground-truth and predicted values.

    Returns
    -------
    float
        Mean of |true - pred| / |true| over the non-zero targets, times 100;
        NaN when every target is zero.
    """
    y_true = np.asarray(labels_true, dtype=float)
    y_pred = np.asarray(labels_pred, dtype=float)
    mask = y_true != 0
    if not mask.any():
        # Nothing to average: return NaN explicitly.
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def f1_cs(labels_true, labels_pred):
    """Macro-averaged F1 score of the predicted class labels."""
    macro_f1 = f1_score(labels_true, labels_pred, average='macro')
    return macro_f1

def kappa_cs(labels_true, labels_pred):
    """Quadratic-weighted Cohen's kappa between the true and predicted class labels."""
    # `cohen_kappa_score` is not part of the notebook's import cell, so it is
    # imported here; without this the function raised NameError when called.
    from sklearn.metrics import cohen_kappa_score

    return cohen_kappa_score(labels_true, labels_pred, weights='quadratic')

def evaluate_model(labels_val, labels_pred_val, evals_result=None,
                   task='Total health contacts', tgt='Total health contacts', tp='ED attendance'):
    """Score a set of predictions with RMSE, MAE and MAPE plus bootstrap 95% CIs.

    Parameters
    ----------
    labels_val : array-like
        Ground-truth values.
    labels_pred_val : array-like
        Predicted values, aligned with `labels_val`.
    evals_result : optional
        Unused; kept so existing callers that pass it keep working.
    task : str
        Label used in the progress message only.
    tgt : str
        Stored under the 'target' key of the result.
    tp : str
        Stored under the 'timepoint' key of the result.

    Returns
    -------
    dict
        'timepoint', 'target' and, for each metric M in RMSE / MAE / MAPE, the
        keys 'M' (point estimate, 3 decimals), 'M-upper' and 'M-lower'
        (bounds of the bootstrap percentile interval). MAPE is computed with
        `mape_c`, i.e. over samples with a non-zero target only.
    """
    print('Evaluating model for target: ' + task)
    res_dict = {'timepoint': tp, 'target': tgt}
    metrics = (('RMSE', rmse), ('MAE', mae), ('MAPE', mape_c))

    #### Get performance measures with 95% CI
    # Bootstraps run first and in this fixed order so that a seeded global
    # random state yields the same intervals as before.
    intervals = {name: compute_ci(bootstrap_metric(labels_val, labels_pred_val, func))
                 for name, func in metrics}
    points = {name: round(func(labels_val, labels_pred_val), 3)
              for name, func in metrics}

    for name, _ in metrics:
        print(f'{name}: {points[name]}, 95% CI: {intervals[name]}')

    for name, _ in metrics:
        # compute_ci returns [lower, upper]; the bounds used to be stored under
        # the opposite keys.
        lower, upper = intervals[name]
        res_dict[name] = points[name]
        res_dict[f'{name}-upper'] = upper
        res_dict[f'{name}-lower'] = lower

    print('Evaluation complete.')
    return res_dict

In [None]:
def eval_regs(train_x, train_y, val_x, val_y, regs):
    """Tune, fit and score every model in `regs` on a common validation set.

    The entry named 'XGBoost' is treated as an already-trained booster and is
    only used for prediction. Every other entry is tuned with a grid search
    (scored by negative MAE) on the training data, using the module-level
    grids `ridge_params`, `dtr_params`, `el_params` and `rf_params`; a model
    without a dedicated grid is fitted with its default settings.

    Parameters
    ----------
    train_x, train_y : training features and target.
    val_x, val_y : validation features and target.
    regs : dict
        Mapping of display name -> estimator (or booster for 'XGBoost').

    Returns
    -------
    pandas.DataFrame
        One row per model with the fields returned by `evaluate_model` plus a
        'Model' column; empty when `regs` is empty.
    """
    records = []
    res_df = pd.DataFrame()
    for reg_name, reg in regs.items():
        print(f'Evaluating {reg_name}')
        if reg_name == 'XGBoost':
            print(f'Evaluating {reg_name}...')
            preds = reg.predict(xgb.DMatrix(val_x))
        else:
            print(f'Fitting {reg_name}...')
            # Start from an empty grid for every model; otherwise a model with
            # no branch below would silently reuse the previous model's grid.
            params = {}
            if reg_name == 'Ridge Regression':
                params = ridge_params
            elif reg_name == 'Decision Tree':
                params = dtr_params
            elif reg_name == 'Elastic Net':
                params = el_params
            elif reg_name == 'Random Forest':
                params = rf_params

            gcv = GridSearchCV(reg, params, verbose=1, scoring='neg_mean_absolute_error')
            gcv.fit(train_x, train_y)
            bm = gcv.best_estimator_
            preds = bm.predict(val_x)

        record = evaluate_model(val_y, preds)
        record['Model'] = reg_name
        records.append(record)
        # Build the table from the collected rows instead of growing it with
        # concat; this also gives a 0..n-1 index and numeric metric columns.
        res_df = pd.DataFrame(records)
        print(res_df)
    print('Evaluation complete')
    return res_df

In [None]:
# Grid-search and fit each scikit-learn baseline on the training split, then score every
# entry of `regs` (including the pre-trained XGBoost booster) on the validation split.
res_df = eval_regs(train_x_cts, train_y_cts, val_x_cts, val_y_cts, regs)

In [None]:
# Save the model comparison table as CSV without the index column.
# NOTE(review): the output path is an empty string here — set a real path before running.
res_df.to_csv('', index=False)