In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import tempfile
import pandas as pd
import seaborn as sns; sns.set()
import sklearn
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import auc
import matplotlib.lines as mlines
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

In [0]:
!pip install -U tensorboard
!pip install keras-tuner
import kerastuner as kt

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

# Load data


In [0]:
# File names of the training / hold-out sequences and their labels.
tdata_name,tlabels_name,hdata_name,hlabels_name = "tseq_15Tr10jd.npy", 'tlabels_15Tr10jd.npy', "hseq_15Tr10jd.npy", 'hlabels_15Tr10jd.npy'
my_data_dir = 'Colab Notebooks/last3612/'
data_training = np.load("gdrive/My Drive/"+my_data_dir + tdata_name)
labels_training = np.load("gdrive/My Drive/"+my_data_dir + tlabels_name)

data_holdout = np.load("gdrive/My Drive/"+my_data_dir + hdata_name)
labels_holdout = np.load("gdrive/My Drive/"+my_data_dir + hlabels_name)

#view data
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Class counts. np.sum reduces the boolean mask in one vectorised call
# instead of iterating over it element by element with the builtin sum().
pos, neg = np.sum(labels_training != 0), np.sum(labels_training == 0)
total = len(labels_training)
print(pos, neg, total)

# Per-class mean of every feature after summing each sample over axis 1.
pos_data = data_training[labels_training != 0]
neg_data = data_training[labels_training == 0]
pos_data= np.sum(pos_data,axis=1)
neg_data= np.sum(neg_data,axis=1)
print(pd.DataFrame({'outcome':pos_data.mean(axis=0),'survival':neg_data.mean(axis=0)}))

#calculate weight
# Inverse-frequency weights, scaled by total/2 so the two classes carry the
# same total weight.
weight_for_0 = (1 / neg)*(total)/2.0
weight_for_1 = (1 / pos)*(total)/2.0

class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)

# Functions for Dataset, Metric, and Plot

In [0]:
# Create dataset
def Create_dataset(data, label, BATCH_SIZE = 128):
  """Wrap (data, label) pairs in a batched tf.data pipeline.

  The samples keep their original order (no shuffling); batches hold
  `BATCH_SIZE` samples and two batches are prefetched.
  """
  pairs = tf.data.Dataset.from_tensor_slices((data, label))
  batched = pairs.batch(BATCH_SIZE)
  return batched.prefetch(2)

# resample
#since the dataset is imbalanced, resampling is needed to improve model performance
def Resample_dataset(data, label, BATCH_SIZE = 128,BUFFER_SIZE = 10000):
  """Build an endless, class-balanced training stream.

  Samples whose label is non-zero and samples whose label is zero are put in
  two separate cached, shuffled, repeating datasets; the returned dataset
  draws from both with equal probability (fixed seed 0) and yields batches of
  `BATCH_SIZE`. Because the stream never ends, callers have to pass
  `steps_per_epoch` when fitting.
  """
  positive_mask = label != 0
  negative_mask = label == 0

  def _endless_shuffled(features, targets):
    # One class only: cache in memory, shuffle, and repeat forever.
    stream = tf.data.Dataset.from_tensor_slices((features, targets))
    return stream.cache().shuffle(BUFFER_SIZE).repeat()

  pos_ds = _endless_shuffled(data[positive_mask], label[positive_mask])
  neg_ds = _endless_shuffled(data[negative_mask], label[negative_mask])

  balanced = tf.data.experimental.sample_from_datasets(
      [pos_ds, neg_ds], weights=[0.5, 0.5], seed=0)
  return balanced.batch(BATCH_SIZE).prefetch(2)

In [0]:
# Metrics attached to every compiled model in this notebook. The training
# functions unpack `model.evaluate(...)` positionally (loss first, then these
# metrics), so the order of this list must not change.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy',threshold=0.5),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.Precision(name='precision',thresholds=0.5),
      tf.keras.metrics.Recall(name='recall',thresholds=0.5),
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'),
]

EPOCHS = 1000
BATCH_SIZE = 128
# The resampled training stream repeats forever, so an epoch is defined as
# enough batches to cover twice the number of negative samples (`neg` comes
# from the data-loading cell). Keras documents `steps_per_epoch` as an
# integer, hence the explicit int() around the NumPy float.
resampled_steps_per_epoch = int(np.ceil(2.0*neg/BATCH_SIZE))
print(resampled_steps_per_epoch)

# Stop once validation AUC has not improved for 10 epochs and roll the model
# back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [0]:
# Plots for model validation and diagnosis. 
def plot_metrics(history):
  """Draw train vs. validation curves for loss, AUC, precision and recall.

  `history` is the object returned by `model.fit`; the four metrics are laid
  out on a 2x2 grid of subplots in the current figure.
  """
  epochs = history.epoch
  for position, key in enumerate(('loss', 'auc', 'precision', 'recall'), start=1):
    plt.subplot(2, 2, position)
    plt.plot(epochs, history.history[key], label='Train')
    plt.plot(epochs, history.history['val_' + key],
             linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(key.replace("_", " ").capitalize())
    # Loss keeps its automatic upper bound; AUC is zoomed to [0.6, 1];
    # precision and recall use the full [0, 1] range.
    if key == 'loss':
      lower, upper = 0, plt.ylim()[1]
    elif key == 'auc':
      lower, upper = 0.6, 1
    else:
      lower, upper = 0, 1
    plt.ylim([lower, upper])
    plt.legend()

def plot_roc(name, labels, predictions, **kwargs):
  """Plot the ROC curve (in percent) of `predictions` against `labels`.

  `name` becomes the line label; extra keyword arguments are forwarded to
  `plt.plot`. The curve is drawn on the current axes with an equal aspect
  ratio.
  """
  false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(labels, predictions)

  plt.plot(false_pos_rate * 100, true_pos_rate * 100,
           label=name, linewidth=2, **kwargs)
  plt.xlabel('False positives [%]')
  plt.ylabel('True positives [%]')
  plt.grid(True)
  plt.gca().set_aspect('equal')

def plot_cm(labels, predictions, p=0.5):
  """Show the confusion matrix obtained by thresholding `predictions` at `p`.

  Args:
    labels: true binary labels.
    predictions: predicted scores; a sample is counted as positive when its
      score is strictly greater than `p`.
    p: decision threshold.

  Draws an annotated heat map in a new 5x5 figure and prints the four cells
  of the matrix plus the number of actual positives.
  """
  # `confusion_matrix` is not imported by name anywhere in the notebook, so
  # it is reached through the `sklearn.metrics` module (as plot_roc does for
  # roc_curve).
  cm = sklearn.metrics.confusion_matrix(labels, predictions > p)
  plt.figure(figsize=(5,5))
  sns.heatmap(cm, annot=True, fmt="d")
  plt.title('Confusion matrix @{:.2f}'.format(p))
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')

  # NOTE(review): the wording below talks about transactions/fraud although
  # the notebook's classes are labelled 'outcome'/'survival' elsewhere --
  # confirm whether these messages should be reworded.
  print('Legitimate Transactions Detected (True Negatives): ', cm[0][0])
  print('Legitimate Transactions Incorrectly Detected (False Positives): ', cm[0][1])
  print('Fraudulent Transactions Missed (False Negatives): ', cm[1][0])
  print('Fraudulent Transactions Detected (True Positives): ', cm[1][1])
  print('Total Fraudulent Transactions: ', np.sum(cm[1]))

def plot_pr_curve(name, labels, predictions, **kwargs):
  """Print and plot the precision-recall curve of `predictions`.

  The raw precision, recall and threshold arrays and the area under the
  precision-recall curve are printed; the curve itself (in percent) is drawn
  on the current axes with a legend. Extra keyword arguments are forwarded to
  `plt.plot`.
  """
  precision, recall, thresholds = precision_recall_curve(labels, predictions)
  for caption, values in (('precision', precision),
                          ('recall', recall),
                          ('thresholds', thresholds)):
    print(caption, values)
  print('PR AUC', auc(recall, precision))

  plt.plot(recall * 100, precision * 100, label=name, linewidth=2, **kwargs)
  plt.xlabel('recall [%]')
  plt.ylabel('precision [%]')
  plt.grid(True)
  axes = plt.gca()
  axes.legend()
  axes.set_aspect('equal')

def plot_calibration(name, labels, predictions, **kwargs):
  """Draw a reliability curve and a histogram of the predicted scores.

  Args:
    name: legend label for the model.
    labels: true binary labels.
    predictions: predicted probabilities in [0, 1].
    **kwargs: accepted for signature parity with the other plot helpers;
      currently unused.

  A new 10x10 figure is created: the upper two thirds show the fraction of
  positives against the mean predicted value (20 bins) together with the
  diagonal of perfect calibration, the lower third shows the score histogram.
  """
  # calibration_curve is not imported at the top of the notebook; import it
  # here so the helper works on a fresh kernel.
  from sklearn.calibration import calibration_curve

  fraction_positive, mean_predicted = calibration_curve(labels, predictions, n_bins=20)

  plt.figure(figsize=(10, 10))
  ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
  ax2 = plt.subplot2grid((3, 1), (2, 0))

  # calibration curve plus the reference diagonal
  ax1.plot(mean_predicted, fraction_positive, "s-", label="%s" % (name, ))
  ax1.add_line(mlines.Line2D([0, 1], [0, 1], color='black'))
  ax1.set_ylabel("Fraction of positives")
  ax1.set_ylim([-0.05, 1.05])
  ax1.legend(loc="lower right")
  ax1.set_title('Calibration plots  (reliability curve)')

  # distribution of the predicted scores
  ax2.hist(predictions, range=(0, 1), bins=20, label=name, histtype="step", lw=2)
  ax2.set_xlabel("Mean predicted value")
  ax2.set_ylabel("Count")
  ax2.legend(loc="upper center", ncol=2)

  # The former trailing `plt.plot(y, x, ...)` was dropped: it re-drew the
  # curve with swapped axes on top of the histogram (the current axes).
  plt.tight_layout()

# Model Configuration

## Keras Tuner
Use the `kerastuner` package to tune the hyperparameters and to find the best deep neural network structure.

In [0]:
#load dataset
# Hold out 25% of the training data as the validation set used by the tuner.
data_train, data_val, labels_train, labels_val = train_test_split(data_training, labels_training, test_size=0.25, random_state=42)
split_point=9
# Features from index `split_point` onwards are read from the first step of
# axis 1 only (the functions further down call them the Time-of-Day variable).
julian_training = data_train[:,0,split_point:]
julina_holdout = data_holdout[:,0,split_point:]
julina_val = data_val[:,0,split_point:]

data_train = data_train[:,:,:split_point]
data_val = data_val[:,:,:split_point]
# SUM
# `data_holdout` is sliced on the fly rather than overwritten: overwriting it
# made this cell non-idempotent (a second run found no columns left beyond
# `split_point`, so the hold-out time-of-day block came out empty).
data_train_point = np.sum(data_train,axis=1)
data_val_point = np.sum(data_val,axis=1)
data_test_point = np.sum(data_holdout[:,:,:split_point],axis=1)
# concat
data_train_point = np.concatenate((data_train_point, julian_training), axis=1)
data_val_point = np.concatenate((data_val_point, julina_val), axis=1)
data_test_point = np.concatenate((data_test_point, julina_holdout), axis=1)
print(data_train_point.shape,data_val_point.shape,data_test_point.shape)


valset = Create_dataset(data_val_point, labels_val)
testset = Create_dataset(data_test_point, labels_holdout)
trainset = Resample_dataset(data_train_point,labels_train)

Logistic regression
Search for best lambda for L2 regularization

In [0]:
from kerastuner import HyperModel
# Candidate L2 penalty strengths offered to the tuner.
l= [100.0,50.0,20.0,10.0,5.0,2.0,1.0,0.5,0.01,0.005,0.001]

class LRHyperModel(HyperModel):
    """Logistic regression whose L2 penalty strength is tuned.

    Args:
        input_shape: shape tuple of one input sample.
    """

    def __init__(self, input_shape):
        # Let the base class set up its own attributes before adding ours.
        super(LRHyperModel, self).__init__()
        self.input_shape = input_shape

    def build(self, hp):
        """Return a compiled single-unit sigmoid model for the trial `hp`."""
        # The hyperparameter is registered under the name 'lamba' (sic); the
        # key is kept as-is so results of earlier searches stay comparable.
        l2_strength = hp.Choice('lamba', values=l, default=0.01)

        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(
                1,
                kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
                activation='sigmoid',
                input_shape=self.input_shape
            )
        )

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

        return model

INPUT_SHAPE = (data_train_point.shape[-1],)
hypermodel = LRHyperModel(input_shape=INPUT_SHAPE)


In [0]:

HYPERBAND_MAX_EPOCHS = 40
SEED=41
# Give this search its own project folder. Without an explicit project_name
# every tuner in the notebook writes to the same default location, so a later
# search would pick up the state saved by this one.
tuner = kt.Hyperband(
    hypermodel,
    max_epochs=HYPERBAND_MAX_EPOCHS,
    objective= kt.Objective("val_auc", direction="max"),
    seed=SEED,
    hyperband_iterations=3,
    project_name='logistic_regression_l2_search'
)

In [0]:
tuner.search_space_summary()

In [0]:
# Search best model
tuner.search(trainset,
             validation_data=valset,
             epochs=30,
             steps_per_epoch=resampled_steps_per_epoch,
             callbacks=[early_stopping])

In [0]:
tuner.results_summary()

Search for best deep neural network model structure

In [0]:
from kerastuner import HyperModel


class DNNHyperModel(HyperModel):
    """Feed-forward network whose depth, layer widths and dropout are tuned.

    Args:
        input_shape: shape tuple of one input sample.
    """

    def __init__(self, input_shape):
        # Let the base class set up its own attributes before adding ours.
        super(DNNHyperModel, self).__init__()
        self.input_shape = input_shape

    def build(self, hp):
        """Return a compiled model for the hyperparameter set `hp`."""
        model = tf.keras.Sequential()
        # The input layer gets its own hyperparameter name. It used to be
        # 'num_neuron_1', which is also the name generated for the hidden
        # layer with index 1 below, so both layers were forced to share one
        # width and the search space was smaller than intended.
        model.add(
            tf.keras.layers.Dense(
                hp.Int(
                    'num_neuron_input',
                    32,
                    256,
                    step=32),
                activation='relu',
                input_shape=self.input_shape
            )
        )
        # Between two and four further hidden layers, each with its own width.
        for i in range(hp.Int('Num_layers_', 2, 4, default=3)):
            units = hp.Int('num_neuron_' + str(i), 32, 256, step=32)
            model.add(
                tf.keras.layers.Dense(
                    units,
                    activation='relu'
                )
            )
        model.add(
            tf.keras.layers.Dropout(
                hp.Float('dropout', 0, 0.5, step=0.1, default=0.2)
            )
        )
        model.add(
            tf.keras.layers.Dense(1, activation='sigmoid')
        )

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

        return model

INPUT_SHAPE = (data_train_point.shape[-1],)
hypermodel = DNNHyperModel(input_shape=INPUT_SHAPE)


In [0]:

HYPERBAND_MAX_EPOCHS = 40
SEED=41
# Give this search its own project folder. Without an explicit project_name
# it would share the default location with the logistic-regression search
# above and resume from that search's saved state instead of starting fresh.
tuner = kt.Hyperband(
    hypermodel,
    max_epochs=HYPERBAND_MAX_EPOCHS,
    objective= kt.Objective("val_auc", direction="max"),
    seed=SEED,
    hyperband_iterations=3,
    project_name='dnn_structure_search'
)

In [0]:
tuner.search_space_summary()

In [0]:
# Search best model
tuner.search(trainset,
             validation_data=valset,
             epochs=30,
             steps_per_epoch=resampled_steps_per_epoch,
             callbacks=[early_stopping])

In [0]:
tuner.results_summary()

## Logistic Regression



A logistic regression with L2 regularization.


In [0]:
# # without Time-of-Day variable
def SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=10):
  """Cross-validate an L2-regularised logistic regression without Time-of-Day.

  Both data arrays are indexed as 3-D arrays. Features at index `split_point`
  and beyond on the last axis are dropped and the remaining ones are summed
  over axis 1 (presumably the time-block axis -- TODO confirm), giving one
  feature vector per sample.

  For each of 4 shuffled folds a fresh single-unit sigmoid model (L2 factor
  20, SGD) is trained on a class-balanced resampling of the training fold,
  early-stopped on the validation fold and scored on the hold-out set.
  Per-fold and mean AUROC / recall / precision and PR-AUC are printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    split_point: number of leading features to keep.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  # remove time variable
  data_training = data_training[:,:,:split_point]
  data_holdout = data_holdout[:,:,:split_point]
  # The hold-out features are the same for every fold: aggregate them once.
  data_test_point = np.sum(data_holdout,axis=1)

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]
    # get overall count within 24 hrs
    data_train_point = np.sum(data_training[train_index],axis=1)
    data_val_point = np.sum(data_training[val_index],axis=1)

    valset = Create_dataset(data_val_point, labels_val)
    resampled_ds = Resample_dataset(data_train_point,labels_train)

    regression = tf.keras.Sequential([
        tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(20), activation='sigmoid',
            input_shape=(data_train_point.shape[-1],))
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    regression.compile(
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    regression.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = regression.predict(data_test_point, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = regression.evaluate(data_test_point, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)

    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))

  return test_predictions




In [0]:
# with Time-of-Day variable
def SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout, split_point=10):
  """Cross-validate an L2-regularised logistic regression with Time-of-Day.

  Both data arrays are indexed as 3-D arrays. The first `split_point`
  features are summed over axis 1 (presumably the time-block axis -- TODO
  confirm); the features from `split_point` onwards are read from the first
  step of axis 1 only and appended to the summed vector.

  For each of 4 shuffled folds a fresh single-unit sigmoid model (L2 factor
  20, SGD) is trained on a class-balanced resampling of the training fold,
  early-stopped on the validation fold and scored on the hold-out set.
  Per-fold and mean AUROC / recall / precision and PR-AUC are printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    split_point: index on the last axis where the Time-of-Day block starts.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  # Separate Time-of-Day variable
  julian_training = data_training[:,0,split_point:]
  julian_holdout = data_holdout[:,0,split_point:]
  data_training = data_training[:,:,:split_point]
  data_holdout = data_holdout[:,:,:split_point]
  # The hold-out features are the same for every fold: build them once.
  data_test_point = np.concatenate((np.sum(data_holdout,axis=1), julian_holdout), axis=1)

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]

    # overall count within 24hrs, concatenated with the Time-of-day variable
    data_train_point = np.concatenate(
        (np.sum(data_training[train_index],axis=1), julian_training[train_index]), axis=1)
    data_val_point = np.concatenate(
        (np.sum(data_training[val_index],axis=1), julian_training[val_index]), axis=1)
    print(data_train_point.shape,data_val_point.shape,data_test_point.shape)

    valset = Create_dataset(data_val_point, labels_val)
    resampled_ds = Resample_dataset(data_train_point,labels_train)

    #initialize model
    regression = tf.keras.Sequential([
        tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(20), activation='sigmoid',
            input_shape=(data_train_point.shape[-1],))
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    regression.compile(
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    regression.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = regression.predict(data_test_point, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = regression.evaluate(data_test_point, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)

    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))
  return test_predictions




## Deep Neural Network


The structure of this deep neural network was chosen from the Keras Tuner search results: three hidden dense layers followed by a dropout layer.

In [0]:
# # without Time-of-Day variable
def DNN(data_training,labels_training,data_holdout,labels_holdout,split_point=10):
  """Cross-validate the feed-forward network without the Time-of-Day variable.

  Both data arrays are indexed as 3-D arrays. Features at index `split_point`
  and beyond on the last axis are dropped and the remaining ones are summed
  over axis 1 (presumably the time-block axis -- TODO confirm).

  For each of 4 shuffled folds a fresh network (Dense 128 -> 32 -> 160,
  dropout 0.1, sigmoid output) is trained on a class-balanced resampling of
  the training fold, early-stopped on the validation fold and scored on the
  hold-out set. Per-fold and mean AUROC / recall / precision and PR-AUC are
  printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    split_point: number of leading features to keep.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  #remove Time-of-day variable
  data_training = data_training[:,:,:split_point]
  data_holdout = data_holdout[:,:,:split_point]
  # The hold-out features are the same for every fold: aggregate them once.
  data_test_point = np.sum(data_holdout,axis=1)

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]
    data_train_point = np.sum(data_training[train_index],axis=1)
    data_val_point = np.sum(data_training[val_index],axis=1)

    valset = Create_dataset(data_val_point, labels_val)
    resampled_ds = Resample_dataset(data_train_point,labels_train)

    # Initialize model
    FFN = tf.keras.Sequential([
        tf.keras.layers.Dense(
            128, activation='relu',
            input_shape=(data_train_point.shape[-1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(160, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # NOTE(review): this variant trains with SGD(1e-3) while DNN_j uses
    # Adam(3e-4) -- confirm the difference is intended, since it confounds
    # the with/without Time-of-Day comparison.
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    FFN.compile(
          optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    FFN.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = FFN.predict(data_test_point, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = FFN.evaluate(data_test_point, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)
    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))
  return test_predictions




In [0]:
# # with Time-of-Day variable
def DNN_j(data_training,labels_training,data_holdout,labels_holdout,split_point=10):
  """Cross-validate the feed-forward network with the Time-of-Day variable.

  Both data arrays are indexed as 3-D arrays. The first `split_point`
  features are summed over axis 1 (presumably the time-block axis -- TODO
  confirm); the features from `split_point` onwards are read from the first
  step of axis 1 only and appended to the summed vector.

  For each of 4 shuffled folds a fresh network (Dense 128 -> 32 -> 160,
  dropout 0.1, sigmoid output, Adam) is trained on a class-balanced
  resampling of the training fold, early-stopped on the validation fold and
  scored on the hold-out set. Per-fold and mean AUROC / recall / precision
  and PR-AUC are printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    split_point: index on the last axis where the Time-of-Day block starts.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  #Separate Time-of_day variable
  julian_training = data_training[:,0,split_point:]
  julian_holdout = data_holdout[:,0,split_point:]
  data_training = data_training[:,:,:split_point]
  data_holdout = data_holdout[:,:,:split_point]
  # The hold-out features are the same for every fold: build them once.
  data_test_point = np.concatenate((np.sum(data_holdout,axis=1), julian_holdout), axis=1)

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]

    # overall count within 24hrs, concatenated with the Time-of-day variable
    data_train_point = np.concatenate(
        (np.sum(data_training[train_index],axis=1), julian_training[train_index]), axis=1)
    data_val_point = np.concatenate(
        (np.sum(data_training[val_index],axis=1), julian_training[val_index]), axis=1)
    print(data_train_point.shape,data_val_point.shape,data_test_point.shape)

    valset = Create_dataset(data_val_point, labels_val)
    resampled_ds = Resample_dataset(data_train_point,labels_train)

    # Initialize model
    FFN = tf.keras.Sequential([
        tf.keras.layers.Dense(
            128, activation='relu',
            input_shape=(data_train_point.shape[-1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(160, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    FFN.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    FFN.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = FFN.predict(data_test_point, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = FFN.evaluate(data_test_point, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)
    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))

  return test_predictions


## Long Short Term Memory

Single layer LSTM with one dropout layer

In [0]:
def LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=10):
  """Cross-validate a single-layer LSTM (32 units) followed by dropout 0.3.

  The data arrays are fed to the recurrent layer as they are (sample axis
  first, the last two axes form the per-sample input shape). When `jdummies`
  is false, features at index `split_point` and beyond on the last axis are
  dropped first; when it is true all features are kept.

  For each of 4 shuffled folds a fresh model is trained on a class-balanced
  resampling of the training fold, early-stopped on the validation fold and
  scored on the hold-out set. Per-fold and mean AUROC / recall / precision
  and PR-AUC are printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    jdummies: keep the features beyond `split_point` (the callers in this
      notebook use it to include the Time-of-Day variables).
    split_point: number of leading features kept when `jdummies` is false.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  if not jdummies:
    data_training = data_training[:,:,:split_point]
    data_holdout = data_holdout[:,:,:split_point]

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    data_train, data_val = data_training[train_index], data_training[val_index]
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]

    valset = Create_dataset(data_val, labels_val)
    resampled_ds = Resample_dataset(data_train,labels_train)

    # Initiate model. The local is called `model` so that it does not rebind
    # the name of this function inside its own body.
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(32, input_shape=data_train.shape[-2:]),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    model.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = model.predict(data_holdout, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = model.evaluate(data_holdout, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)
    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))
  return test_predictions




## Gated Recurrent Unit


Single layer GRU with one dropout layer

In [0]:
def GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=10):
  """Cross-validate a single-layer GRU (32 units) followed by dropout 0.3.

  The data arrays are fed to the recurrent layer as they are (sample axis
  first, the last two axes form the per-sample input shape). When `jdummies`
  is false, features at index `split_point` and beyond on the last axis are
  dropped first; when it is true all features are kept.

  For each of 4 shuffled folds a fresh model is trained on a class-balanced
  resampling of the training fold, early-stopped on the validation fold and
  scored on the hold-out set. Per-fold and mean AUROC / recall / precision
  and PR-AUC are printed.

  Args:
    data_training: training samples.
    labels_training: labels of `data_training`.
    data_holdout: hold-out samples.
    labels_holdout: labels of `data_holdout`.
    jdummies: keep the features beyond `split_point` (the callers in this
      notebook use it to include the Time-of-Day variables).
    split_point: number of leading features kept when `jdummies` is false.

  Returns:
    Hold-out predictions of the model trained on the last fold only.
  """
  aucs, recalls, precisions, auprcs = [], [], [], []

  if not jdummies:
    data_training = data_training[:,:,:split_point]
    data_holdout = data_holdout[:,:,:split_point]

  # cross validation
  folds = KFold(n_splits=4, random_state=41, shuffle=True).split(data_training)
  for train_index, val_index in tqdm_notebook(folds):
    data_train, data_val = data_training[train_index], data_training[val_index]
    labels_train, labels_val = labels_training[train_index], labels_training[val_index]

    valset = Create_dataset(data_val, labels_val)
    resampled_ds = Resample_dataset(data_train,labels_train)

    # Initiate model. The local is called `model` so that it does not rebind
    # the name of this function inside its own body.
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(32, input_shape=data_train.shape[-2:]),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
          loss=tf.keras.losses.BinaryCrossentropy(),
          metrics=METRICS)

    # training (the resampled stream is endless, hence steps_per_epoch)
    model.fit(
      resampled_ds,
      epochs=EPOCHS,
      steps_per_epoch=resampled_steps_per_epoch,
      callbacks = [early_stopping],
      validation_data=valset,
      verbose=0)

    # hold-out evaluation
    test_predictions = model.predict(data_holdout, batch_size=BATCH_SIZE)

    # evaluate() returns the loss followed by the METRICS list in order.
    _,_,auroc,precision,recall,_,_,_,_ = model.evaluate(data_holdout, labels_holdout,
                                             batch_size=BATCH_SIZE, verbose=0)
    aucs.append(auroc)
    recalls.append(recall)
    precisions.append(precision)
    print(auroc, recall, precision)

    pr_precision, pr_recall, _ = precision_recall_curve(labels_holdout, test_predictions)
    auprc = auc(pr_recall, pr_precision)
    auprcs.append(auprc)
    print('PR AUC',auprc)

  auc_ave = np.array(aucs).mean()
  recall_ave = np.array(recalls).mean()
  precisions_ave = np.array(precisions).mean()
  auprc_ave = np.array(auprcs).mean()
  print("auc:{}, recall:{}, precision:{}".format(auc_ave, recall_ave, precisions_ave))
  print('AUPRC:{}'.format(auprc_ave))
  return test_predictions




# Model Development and Validation

## 15 minute time block with 5 features
* vital sign measurements

In [0]:
# Training / hold-out arrays for the 5-feature data set.
tdata_name, tlabels_name = "tseq_15Tr5.npy", 'tlabels_15Tr5.npy'
hdata_name, hlabels_name = "hseq_15Tr5.npy", 'hlabels_15Tr5.npy'
d = 'last3612/'
my_data_dir = 'Colab Notebooks/'+d

# Load the four arrays in the order: training data, training labels,
# hold-out data, hold-out labels.
data_training, labels_training, data_holdout, labels_holdout = (
    np.load("gdrive/My Drive/" + my_data_dir + file_name)
    for file_name in (tdata_name, tlabels_name, hdata_name, hlabels_name)
)

#view data
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class mean of every feature after summing each sample over axis 1.
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({'outcome':pos_data.mean(axis=0),'survival':neg_data.mean(axis=0)}))



In [0]:
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

In [0]:
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

## 15 minute time block with 9 features
† vital sign measurements, flowsheets comments 

In [0]:
# Training / hold-out arrays for the 9-feature data set.
tdata_name, tlabels_name = "tseq_15Tr10jd.npy", 'tlabels_15Tr10jd.npy'
hdata_name, hlabels_name = "hseq_15Tr10jd.npy", 'hlabels_15Tr10jd.npy'

# Load the four arrays in the order: training data, training labels,
# hold-out data, hold-out labels (`my_data_dir` is set in an earlier cell).
data_training, labels_training, data_holdout, labels_holdout = (
    np.load("gdrive/My Drive/" + my_data_dir + file_name)
    for file_name in (tdata_name, tlabels_name, hdata_name, hlabels_name)
)

#view data
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class mean of every feature after summing each sample over axis 1.
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({'outcome':pos_data.mean(axis=0),'survival':neg_data.mean(axis=0)}))



In [0]:
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
reg_pred = SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
dnn_pred = DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

In [0]:
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

## 15 minute time block with 15 features
‡ vital sign measurements, flowsheets comments, order entry, nursing notes 

In [0]:
# Training / hold-out arrays for the 15-feature data set.
tdata_name, tlabels_name = "tseq_15Tr15jd.npy", 'tlabels_15Tr15jd.npy'
hdata_name, hlabels_name = "hseq_15Tr15jd.npy", 'hlabels_15Tr15jd.npy'

# Load the four arrays in the order: training data, training labels,
# hold-out data, hold-out labels (`my_data_dir` is set in an earlier cell).
data_training, labels_training, data_holdout, labels_holdout = (
    np.load("gdrive/My Drive/" + my_data_dir + file_name)
    for file_name in (tdata_name, tlabels_name, hdata_name, hlabels_name)
)

#view data
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class mean of every feature after summing each sample over axis 1.
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({'outcome':pos_data.mean(axis=0),'survival':neg_data.mean(axis=0)}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=15 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=15)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

## 15 minute time block with 62 features
$ vital sign measurements, flowsheets comments, order entry, nursing notes, occurrence of entities extracted from nursing notes

In [0]:
# 15-minute blocks, 62 features ("all"): file names of the training ("t...")
# and hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_15Tr_all.npy", 'tlabels_15Tr_all.npy'
hdata_name, hlabels_name = "hseq_15Tr_all.npy", 'hlabels_15Tr_all.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=62 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=62)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)

## 30 minute time block with 5 features
\* vital sign measurements

In [0]:
# 30-minute blocks, 5 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
# NOTE(review): these names carry no "jd" suffix, unlike the 60-minute
# 5-feature files ("..._60Tr5jd.npy"), yet later cells in this section pass
# jdummies=True -- confirm these are the intended files.
tdata_name, tlabels_name = "tseq_30Tr5.npy", 'tlabels_30Tr5.npy'
hdata_name, hlabels_name = "hseq_30Tr5.npy", 'hlabels_30Tr5.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=5 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

## 30 minute time block with 9 features
† vital sign measurements, flowsheets comments 

In [0]:
# 30-minute blocks, 9 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
# NOTE(review): the file names say "10" while the section heading and the
# split_point used below say 9 -- confirm they describe the same dataset.
tdata_name, tlabels_name = "tseq_30Tr10jd.npy", 'tlabels_30Tr10jd.npy'
hdata_name, hlabels_name = "hseq_30Tr10jd.npy", 'hlabels_30Tr10jd.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=9 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

## 30 minute time block with 15 features
‡ vital sign measurements, flowsheets comments, order entry, nursing notes 

In [0]:
# 30-minute blocks, 15 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_30Tr15jd.npy", 'tlabels_30Tr15jd.npy'
hdata_name, hlabels_name = "hseq_30Tr15jd.npy", 'hlabels_30Tr15jd.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=15 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=15)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

## 30 minute time block with 62 features
$ vital sign measurements, flowsheets comments, order entry, nursing notes, occurrence of entities extracted from nursing notes 

In [0]:
# 30-minute blocks, 62 features ("all"): file names of the training ("t...")
# and hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_30Tr_all.npy", 'tlabels_30Tr_all.npy'
hdata_name, hlabels_name = "hseq_30Tr_all.npy", 'hlabels_30Tr_all.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=62 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=62)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)

## 60 minute time block with 5 features
\* vital sign measurements

In [0]:
# 60-minute blocks, 5 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_60Tr5jd.npy", 'tlabels_60Tr5jd.npy'
hdata_name, hlabels_name = "hseq_60Tr5jd.npy", 'hlabels_60Tr5jd.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=5 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=5)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=5)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=5)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=5)

## 60 minute time block with 9 features
† vital sign measurements, flowsheets comments 

In [0]:
# 60-minute blocks, 9 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
# NOTE(review): the file names say "10" while the section heading and the
# split_point used below say 9 -- confirm they describe the same dataset.
tdata_name, tlabels_name = "tseq_60Tr10jd.npy", 'tlabels_60Tr10jd.npy'
hdata_name, hlabels_name = "hseq_60Tr10jd.npy", 'hlabels_60Tr10jd.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=9 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=9)

In [0]:
# "_j" variant of SimpleRegression; the return value is kept in reg_pred
# because the precision-recall plotting cell below reads it.
reg_pred = SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=9)

In [0]:
# "_j" variant of DNN; kept in dnn_pred for the same plotting cell.
dnn_pred = DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=9)

In [0]:
# LSTM with jdummies=True; kept in lstm_pred for the plotting cell.
lstm_pred = LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

In [0]:
# GRU with jdummies=True; kept in gru_pred for the plotting cell.
gru_pred = GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=9)

### Plot Precision-Recall Curve

In [0]:
# Precision-recall curves for the four predictions stored by the cells above
# (reg_pred, dnn_pred, lstm_pred, gru_pred), each passed together with
# labels_holdout and given its own line style.
# NOTE(review): presumably all four curves land on one shared figure --
# confirm in plot_pr_curve's definition.
plot_pr_curve('Logistic Regression',labels_holdout, reg_pred, linestyle=':')
plot_pr_curve('Deep Neural Network',labels_holdout, dnn_pred, linestyle='-.')
plot_pr_curve('LSTM',labels_holdout, lstm_pred, linestyle='--')
plot_pr_curve('GRU',labels_holdout, gru_pred, linestyle='-')


## 60 minute time block with 15 features
‡ vital sign measurements, flowsheets comments, order entry, nursing notes 

In [0]:
# 60-minute blocks, 15 features: file names of the training ("t...") and
# hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_60Tr15jd.npy", 'tlabels_60Tr15jd.npy'
hdata_name, hlabels_name = "hseq_60Tr15jd.npy", 'hlabels_60Tr15jd.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=15 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=15)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=15)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=15)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=15)

## 60 minute time block with 62 features
$ vital sign measurements, flowsheets comments, order entry, nursing notes, occurrence of entities extracted from nursing notes 

In [0]:
# 60-minute blocks, 62 features ("all"): file names of the training ("t...")
# and hold-out ("h...") sequence / label arrays.
tdata_name, tlabels_name = "tseq_60Tr_all.npy", 'tlabels_60Tr_all.npy'
hdata_name, hlabels_name = "hseq_60Tr_all.npy", 'hlabels_60Tr_all.npy'

# All four arrays sit in the same Drive folder, so build the prefix once.
npy_prefix = "gdrive/My Drive/" + my_data_dir
data_training = np.load(npy_prefix + tdata_name)
labels_training = np.load(npy_prefix + tlabels_name)
data_holdout = np.load(npy_prefix + hdata_name)
labels_holdout = np.load(npy_prefix + hlabels_name)

# Sanity check: label means and array shapes for both splits.
print(labels_training.mean(), labels_holdout.mean())
print(data_training.shape, data_holdout.shape)

# Per-class summary: split the training rows on label != 0 / label == 0,
# sum every row over axis 1, then average over the rows (axis 0).
pos_data = data_training[labels_training != 0].sum(axis=1)
neg_data = data_training[labels_training == 0].sum(axis=1)
print(pd.DataFrame({
    'outcome': pos_data.mean(axis=0),
    'survival': neg_data.mean(axis=0),
}))



In [0]:
# Run SimpleRegression on the arrays loaded above; split_point=62 equals the
# feature count named in this section's heading. The result is not stored.
SimpleRegression(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# DNN on the same four arrays and the same split_point.
DNN(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM with jdummies=False.
# NOTE(review): presumably this leaves the "j" dummy inputs out -- confirm in LSTM's definition.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# GRU counterpart of the LSTM run above (jdummies=False).
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= False, split_point=62)

In [0]:
# "_j" variant of SimpleRegression (presumably the jdummies=True counterpart -- confirm).
SimpleRegression_j(data_training,labels_training,data_holdout,labels_holdout,split_point=62)

In [0]:
# "_j" variant of DNN.
DNN_j(data_training,labels_training,data_holdout,labels_holdout, split_point=62)

In [0]:
# LSTM again, this time with jdummies=True.
LSTM(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)

In [0]:
# GRU again, this time with jdummies=True.
GRU(data_training,labels_training,data_holdout,labels_holdout, jdummies= True, split_point=62)