In [None]:
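# NOTE: switch to the parent directory so that the relative imports (`utils.*`) and
# data paths (`./data/...`) used below resolve; run this cell only once per kernel
# session, since every additional run moves one more level up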
%cd ..

In [None]:
# Imports
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve,
                            auc, accuracy_score)
from sklearn.dummy import DummyClassifier

from copy import deepcopy

from utils.preproc_utils import run_preprocessing
from utils.plotting_utils import plotting_setup, CB_COLOR_CYCLE, STYLES

from tqdm import tqdm

In [None]:
# Constants
# Name of the column used as the prediction target. The three outcome columns present
# in the data (all of them are dropped from the design matrices) are:
# 'DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications'
TARGET_VARIABLE = 'DiagnosisByCriteria'
# Seed passed to Python's `random`, NumPy's global RNG and the models' `random_state`
SEED = 1799
# Fraction of the external data held out as the test set in every resample
TEST_SIZE = 0.2

In [None]:
# Utility functions
def bootstrap_resample(data, labels, rng=None):
    """Draw one bootstrap resample of a paired (data, labels) set.

    Row positions are sampled uniformly with replacement, as many times as there
    are rows in `data`, and the same positions are applied to both inputs so that
    every resampled row keeps its own label.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Observations; indexed positionally via `.iloc`.
    labels : pandas.DataFrame or pandas.Series
        Targets aligned row-by-row with `data`; indexed positionally via `.iloc`.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness exposing `.choice`. When omitted, NumPy's global
        RNG is used (i.e. the one controlled by `np.random.seed`).

    Returns
    -------
    tuple
        `(data_resampled, labels_resampled)`, each with `len(data)` rows.

    Raises
    ------
    ValueError
        If `data` and `labels` do not have the same number of rows.
    """
    n_data_points = len(data)
    if len(labels) != n_data_points:
        # Positional indexing would otherwise silently pair rows with wrong labels
        raise ValueError(
            'data and labels must have the same length, got '
            + str(n_data_points) + ' and ' + str(len(labels)))

    # `np.random` exposes the global RandomState's `choice`, so the default path
    # draws exactly as a direct `np.random.choice` call would
    sampler = np.random if rng is None else rng
    bootstrap_indices = sampler.choice(n_data_points, size=n_data_points, replace=True)
    return data.iloc[bootstrap_indices], labels.iloc[bootstrap_indices]

In [None]:
# Load the raw data
# NOTE: both files carry a `.csv` extension, so both are parsed with `pd.read_csv`
# (`pd.read_excel` expects a spreadsheet file and cannot read plain-text CSV)
app_data_regensburg = pd.read_csv('./data/app_data.csv')
app_data_dusseldorf = pd.read_csv('./data/app_data_ext.csv')

In [None]:
# Preprocess and impute the data
# NOTE: `run_preprocessing` is imported from the project's `utils.preproc_utils`; it
# receives the two raw tables and its two return values overwrite them here, so
# re-running this cell without reloading feeds already-processed data back in
app_data_regensburg, app_data_dusseldorf = run_preprocessing(app_data_regensburg, app_data_dusseldorf)

In [None]:
# Build the label vectors and the feature matrices for both data sets
# All three outcome columns are removed from the features, whichever one is the target
outcome_columns = ['DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications']

X_regensburg = app_data_regensburg.drop(columns=outcome_columns)
y_regensburg = app_data_regensburg[TARGET_VARIABLE]

X_dusseldorf = app_data_dusseldorf.drop(columns=outcome_columns)
y_dusseldorf = app_data_dusseldorf[TARGET_VARIABLE]

In [None]:
# Number of bootstrap resamples
B = 500

# Fix seed for reproducibility
random.seed(SEED)
np.random.seed(SEED)

# Performance metrics: for every model type, one entry is appended per resample
MODEL_TYPES = ['lr', 'gb', 'rf']
aurocs = {model_type: [] for model_type in MODEL_TYPES}
auprs = {model_type: [] for model_type in MODEL_TYPES}
sensitivities = {model_type: [] for model_type in MODEL_TYPES}
specificities = {model_type: [] for model_type in MODEL_TYPES}
accuracies = {model_type: [] for model_type in MODEL_TYPES}
balanced_accuracies = {model_type: [] for model_type in MODEL_TYPES}
ppvs = {model_type: [] for model_type in MODEL_TYPES}
npvs = {model_type: [] for model_type in MODEL_TYPES}
cms = {model_type: [] for model_type in MODEL_TYPES}
probas = {model_type: [] for model_type in MODEL_TYPES}
ys = {model_type: [] for model_type in MODEL_TYPES}

for b in tqdm(range(B)):
    # NOTE: we perform a stratified train-test split of the external data; no
    #       `random_state` is passed, so the split draws from the global NumPy RNG
    #       seeded above
    X_dusseldorf_1, X_dusseldorf_2, y_dusseldorf_1, y_dusseldorf_2 = train_test_split(
        X_dusseldorf, y_dusseldorf, test_size=TEST_SIZE, stratify=y_dusseldorf)

    # Make a bootstrap resample of the internal and external sets
    X_regensburg_b, y_regensburg_b = bootstrap_resample(X_regensburg, y_regensburg)
    X_dusseldorf_b_1, y_dusseldorf_b_1 = bootstrap_resample(X_dusseldorf_1, y_dusseldorf_1)
    X_dusseldorf_b_2, y_dusseldorf_b_2 = bootstrap_resample(X_dusseldorf_2, y_dusseldorf_2)

    # Predictive models (re-created for every resample)
    model_lr_b = LogisticRegression(max_iter=5000, penalty=None, random_state=SEED)
    model_gb_b = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    model_rf_b = RandomForestClassifier(n_estimators=1000, random_state=SEED)

    # Train the models
    # NOTE: we train on the internal and a portion of external data
    X_train_b = np.concatenate((X_regensburg_b, X_dusseldorf_b_1), axis=0)
    y_train_b = np.concatenate((y_regensburg_b, y_dusseldorf_b_1), axis=0)

    # `np.concatenate` yields plain arrays, so the models are fitted without feature
    # names; the test features are converted to an array as well, otherwise
    # scikit-learn warns about mismatching feature names on every prediction call
    X_test_b = X_dusseldorf_b_2.to_numpy()

    model_lr_b.fit(X_train_b, y_train_b)
    model_gb_b.fit(X_train_b, y_train_b)
    model_rf_b.fit(X_train_b, y_train_b)

    models_b = {'lr': model_lr_b, 'gb': model_gb_b, 'rf': model_rf_b}

    for model_type in MODEL_TYPES:
        model = models_b[model_type]

        # Threshold-free metrics, computed from the probability of the second class
        y_test_proba = model.predict_proba(X_test_b)[:, 1]
        auroc = roc_auc_score(y_dusseldorf_b_2, y_test_proba)
        aupr = average_precision_score(y_dusseldorf_b_2, y_test_proba)

        # Metrics derived from the confusion matrix (rows: true class, columns:
        # predicted class)
        # NOTE: the ratios are undefined (0 / 0) whenever a row or column of the
        #       matrix is empty, e.g. if a model predicts a single class only
        y_test_pred = model.predict(X_test_b)
        cm = confusion_matrix(y_dusseldorf_b_2, y_test_pred)
        sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
        specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
        accuracy = accuracy_score(y_dusseldorf_b_2, y_test_pred)
        balanced_accuracy = (sensitivity + specificity) / 2
        ppv = cm[1, 1] / (cm[1, 1] + cm[0, 1])
        npv = cm[0, 0] / (cm[0, 0] + cm[1, 0])

        # Log the metrics
        aurocs[model_type].append(auroc)
        auprs[model_type].append(aupr)
        sensitivities[model_type].append(sensitivity)
        specificities[model_type].append(specificity)
        accuracies[model_type].append(accuracy)
        balanced_accuracies[model_type].append(balanced_accuracy)
        ppvs[model_type].append(ppv)
        npvs[model_type].append(npv)
        cms[model_type].append(cm)
        probas[model_type].append(y_test_proba)
        ys[model_type].append(y_dusseldorf_b_2)

In [None]:
# Evaluate the coin flip
# The baseline "prediction" is a random permutation of the true test labels, so no
# model is trained in this cell

# Number of resamples used for the random baseline
N_RANDOM_RESAMPLES = 10000

# (Re-)initialise the 'rand' slot of every metric log, so the cell can be re-run
for metric_log in (aurocs, auprs, sensitivities, specificities, accuracies,
                   balanced_accuracies, ppvs, npvs, cms):
    metric_log['rand'] = []

# Fix seed for reproducibility
random.seed(SEED)
np.random.seed(SEED)

for b in tqdm(range(N_RANDOM_RESAMPLES)):
    # NOTE: we perform a stratified train-test split of the external data
    X_dusseldorf_1, X_dusseldorf_2, y_dusseldorf_1, y_dusseldorf_2 = train_test_split(
        X_dusseldorf, y_dusseldorf, test_size=TEST_SIZE, stratify=y_dusseldorf)

    # Make a bootstrap resample of the internal and external sets
    # NOTE: only the test resample is used below; the two training resamples are
    #       still drawn so that the sequence of random numbers consumed per
    #       iteration, and hence the reported baseline, stays unchanged
    X_regensburg_b, y_regensburg_b = bootstrap_resample(X_regensburg, y_regensburg)
    X_dusseldorf_b_1, y_dusseldorf_b_1 = bootstrap_resample(X_dusseldorf_1, y_dusseldorf_1)
    X_dusseldorf_b_2, y_dusseldorf_b_2 = bootstrap_resample(X_dusseldorf_2, y_dusseldorf_2)

    # Random guess: shuffle a copy of the true labels
    y_test_pred = np.random.permutation(deepcopy(y_dusseldorf_b_2))

    auroc = roc_auc_score(y_dusseldorf_b_2, y_test_pred)
    aupr = average_precision_score(y_dusseldorf_b_2, y_test_pred)

    # Confusion-matrix metrics (rows: true class, columns: predicted class); the
    # small constant keeps the denominators non-zero
    cm = confusion_matrix(y_dusseldorf_b_2, y_test_pred)
    sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0] + 1E-13)
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1] + 1E-13)
    accuracy = accuracy_score(y_dusseldorf_b_2, y_test_pred)
    balanced_accuracy = (sensitivity + specificity) / 2
    ppv = cm[1, 1] / (cm[1, 1] + cm[0, 1] + 1E-13)
    npv = cm[0, 0] / (cm[0, 0] + cm[1, 0] + 1E-13)

    # Log the metrics
    aurocs['rand'].append(auroc)
    auprs['rand'].append(aupr)
    sensitivities['rand'].append(sensitivity)
    specificities['rand'].append(specificity)
    accuracies['rand'].append(accuracy)
    balanced_accuracies['rand'].append(balanced_accuracy)
    ppvs['rand'].append(ppv)
    npvs['rand'].append(npv)
    cms['rand'].append(cm)

In [None]:
# Print summary statistics for AUROC and AUPR
# Each printed line has the form: model & AUROC mean$\pm$std & AUPR mean$\pm$std
# NOTE: the separator is a raw string, so that the backslash of the LaTeX `\pm` is
#       kept literally instead of starting an (invalid) escape sequence
PLUS_MINUS = r'$\pm$'

for model_type in ['rand', 'lr', 'rf', 'gb']:
    auroc_mean = str(np.round(np.mean(aurocs[model_type]), 2))
    auroc_std = str(np.round(np.std(aurocs[model_type]), 2))
    aupr_mean = str(np.round(np.mean(auprs[model_type]), 2))
    aupr_std = str(np.round(np.std(auprs[model_type]), 2))
    print(model_type + ' & ' + auroc_mean + PLUS_MINUS + auroc_std +
          ' & ' + aupr_mean + PLUS_MINUS + aupr_std)