# TSTR: Train on Synthetic, Test on Real

In [1]:
%%time
import os
import random
import numpy as np
import pandas as pd
from utils import *
from tqdm import tqdm_notebook
import IPython.display as ipd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pickle as pkl
import warnings
warnings.filterwarnings("ignore")

# --- Configuration -----------------------------------------------------------
GAP_TIME = 6
WINDOW_SIZE = 24
ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
DATA_FILEPATH = "./Dataset/all_hourly_data.h5"

# --- Load data ---------------------------------------------------------------
X = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')
statics = pd.read_hdf(DATA_FILEPATH, 'patients')

# --- Build the label frame ---------------------------------------------------
# Keep only stays longer than the observation window plus the gap.
# `.loc[...]` + `.copy()` gives an independent frame, so the column edits below
# never write into (a view of) `statics`.
Y = statics.loc[
    statics.max_hours > WINDOW_SIZE + GAP_TIME,
    ['mort_hosp', 'mort_icu', 'los_icu'],
].copy()
# Binary length-of-stay targets (thresholds presumably in days -- TODO confirm units of los_icu).
Y['los_3'] = Y['los_icu'] > 3
Y['los_7'] = Y['los_icu'] > 7
Y = Y.drop(columns=['los_icu'])
# `astype` returns a new frame; the result must be assigned or the cast is a no-op.
Y = Y.astype(float)

# Order of the label columns in Y, used to name tasks by column position.
idx2task = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']
df_X, df_Y = aggregate_data(X, Y)

# --- Subject-level train / dev / test split ----------------------------------
train_frac, dev_frac, test_frac = 0.8, 0.1, 0.1
X_subj_idx, Y_subj_idx = [df.index.get_level_values('subject_id') for df in (df_X, df_Y)]
X_subjects = set(X_subj_idx)
assert X_subjects == set(Y_subj_idx), "Subject ID pools differ!"

# Fixed seed so the subject permutation (and therefore the split) is reproducible.
np.random.seed(0)
subjects, N = np.random.permutation(list(X_subjects)), len(X_subjects)
N_train, N_dev, N_test = int(train_frac * N), int(dev_frac * N), int(test_frac * N)
train_subj = subjects[:N_train]
dev_subj   = subjects[N_train:N_train + N_dev]
# The test split takes every remaining subject, so it can exceed N_test by the
# few subjects lost to integer truncation above.
test_subj  = subjects[N_train+N_dev:]

# --- Hyperparameters ---------------------------------------------------------
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# a best_hyperparams.pkl that was produced by a trusted run.
with open("best_hyperparams.pkl", "rb") as f:
    hyperparams = pkl.load(f)

print(hyperparams)

{'n_estimators': 446, 'max_depth': 9, 'min_samples_split': 36, 'min_samples_leaf': 47}
CPU times: user 7.79 s, sys: 2.79 s, total: 10.6 s
Wall time: 9.42 s


In [2]:
# Train one random forest per task, once on real and once on synthetic training
# data, and record the train/dev/test AUROC of each run in `roc_dict`.
roc_dict = {}
idx = pd.IndexSlice

# Row-level subject ids of df_X; loop-invariant, so looked up only once.
subject_ids = df_X.index.get_level_values('subject_id')

for MODEL in ['real', 'codear']:
    print(f"# {MODEL} #")
    roc_dict[MODEL] = {task: [] for task in idx2task}

    # Synthetic sequences are written into a COPY of the features. Overwriting
    # df_X itself would leave synthetic values behind, so re-running this cell
    # would silently compute the 'real' baseline on synthetic training data.
    df_X_model = df_X
    if MODEL != 'real':
        df_X_model = df_X.copy()
        # Only training subjects are replaced; dev/test subjects stay real.
        for subj in tqdm_notebook(train_subj):
            x = np.load(f"./synthetic_dataset/{MODEL}/sequences/{subj}.npy")
            df_X_model.loc[subject_ids == subj, idx[:, 'mean']] = x

    [(df_X_train, df_X_dev, df_X_test), (df_Y_train, df_Y_dev, df_Y_test)] = [
        [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, dev_subj, test_subj)]
        for df in (df_X_model, df_Y)
    ]

    # Normalisation statistics come from the training split only and are
    # reused for dev and test.
    train_mean_values = df_X_train.loc[:, idx[:, ['mean']]].to_numpy()
    df_X_means = np.nanmean(train_mean_values, axis=0)
    df_X_stds = np.nanstd(train_mean_values, axis=0)

    df_X_train = preprocess_data(df_X_train, df_X_means, df_X_stds)
    df_X_dev = preprocess_data(df_X_dev, df_X_means, df_X_stds)
    df_X_test = preprocess_data(df_X_test, df_X_means, df_X_stds)

    # Reshape the flat hourly rows to (stay, hour, column) and keep every
    # second column starting at index 1.
    # NOTE(review): assumes each stay contributes exactly WINDOW_SIZE rows and
    # that the odd columns are the ones wanted after preprocess_data -- confirm
    # against utils.
    n_cols = df_X_train.shape[-1]
    X_train = df_X_train.to_numpy().reshape(-1, WINDOW_SIZE, n_cols)[:, :, 1::2]
    X_dev = df_X_dev.to_numpy().reshape(-1, WINDOW_SIZE, n_cols)[:, :, 1::2]
    X_test = df_X_test.to_numpy().reshape(-1, WINDOW_SIZE, n_cols)[:, :, 1::2]

    Y_train = df_Y_train.to_numpy().astype(int)
    Y_dev = df_Y_dev.to_numpy().astype(int)
    Y_test = df_Y_test.to_numpy().astype(int)

    # The random forest takes one flat feature vector per stay.
    n_flat = WINDOW_SIZE * X_train.shape[-1]
    eval_splits = [
        (X_train.reshape(-1, n_flat), Y_train),
        (X_dev.reshape(-1, n_flat), Y_dev),
        (X_test.reshape(-1, n_flat), Y_test),
    ]
    X_train_flat = eval_splits[0][0]

    n_labels = len(idx2task)
    for c in range(n_labels):
        print(idx2task[c])
        clf = RandomForestClassifier(**hyperparams, random_state=0, n_jobs=4)
        clf.fit(X_train_flat, Y_train[:, c])

        # AUROC on train, dev and test, in that order.
        roc_task = []
        for X_split, Y_split in eval_splits:
            split_roc = roc_auc_score(Y_split[:, c], clf.predict_proba(X_split)[:, 1])
            roc_task.append(split_roc)
            print(split_roc)
        # Keep the per-split names available at notebook level.
        train_roc, dev_roc, test_roc = roc_task
        roc_dict[MODEL][idx2task[c]].append(roc_task)
    print()
    print()
    print()

# Summary table: one row per training-data source, one (train, dev, test)
# AUROC triple per task, reported in percent.
iterables = [["mort_icu", "mort_hosp", "los_3", "los_7"], ["train", "dev", "test"]]
multiindex_columns = pd.MultiIndex.from_product(iterables, names=["task", "auroc"])

# Two-decimal float rendering (this is a global pandas display option).
pd.options.display.float_format = '{:.2f}'.format

rows, rocs = [], []
for model in ['real', 'codear']:
    roc_results = roc_dict[model]
    rows.append(model)

    # Concatenate the per-task triples in the same task order as the columns.
    row_rocs = []
    for target_task in ["mort_icu", "mort_hosp", "los_3", "los_7"]:
        # Each entry is a list of [train, dev, test] runs; average over runs.
        seed_results = 100 * np.asarray(roc_results[target_task])
        row_rocs += list(seed_results.mean(axis=0))
    rocs.append(row_rocs)

df = pd.DataFrame(rocs, index=rows, columns=multiindex_columns)
ipd.display(df)
print("\n")

# real #
mort_hosp
0.8951527243797889
0.868291966830414
0.8398673393513596
mort_icu
0.9169491487834793
0.8887373402666828
0.8603057355998531
los_3
0.770203083364848
0.7011973967455738
0.7096158689369912
los_7
0.8380326753671405
0.7343740841825748
0.7347281099615609



# codear #


  0%|          | 0/19155 [00:00<?, ?it/s]

mort_hosp
0.8806695420452674
0.8408225083425233
0.8163459600697702
mort_icu
0.9048915862534834
0.8767176401679624
0.8395450654274185
los_3
0.7818121097888745
0.6794365107507816
0.6790605386685107
los_7
0.8458795462643792
0.7255006893317941
0.7314820304548464





task,mort_icu,mort_icu,mort_icu,mort_hosp,mort_hosp,mort_hosp,los_3,los_3,los_3,los_7,los_7,los_7
auroc,train,dev,test,train,dev,test,train,dev,test,train,dev,test
real,91.69,88.87,86.03,89.52,86.83,83.99,77.02,70.12,70.96,83.8,73.44,73.47
codear,90.49,87.67,83.95,88.07,84.08,81.63,78.18,67.94,67.91,84.59,72.55,73.15




