# Evaluation
Evaluating ES1 variables

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import brier_score_loss, roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
np.random.seed(1234)
tf.set_random_seed(1234)

In [2]:
def get_prob_sklearn(clf, train_df, test_df, s):
    # split out features from outcome
    
    ## train
    x_train = train_df.drop('STATUS_DISCHARGE', axis=1)
    y_train = train_df['STATUS_DISCHARGE']
    
    ## test
    x_test = test_df.drop('STATUS_DISCHARGE', axis=1)
    y_test = test_df['STATUS_DISCHARGE']
    
    # train model
    clf.fit(x_train, y_train)
    
    # calculate probabilities for test data
    y = clf.predict_proba(x_test)[:, 1]
    
    # check for missing rows
    assert len(y) == len(x_test)
    
    # set row index
    y = pd.DataFrame(y, index=x_test.index, columns=[s])

    # check for missing rows
    assert x_test.shape[0] == y.shape[0]
    
    return y

In [3]:
def fit_cv_sklearn(clf, train):   
    # split out features from outcome
    
    ## train
    x_train = train.drop('STATUS_DISCHARGE', axis=1)
    y_train = train['STATUS_DISCHARGE']
    
    # train model using k=5 CV
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc', n_jobs=2)
    
    print("ROC AUC: avg {}, sd {}".format(np.round(np.mean(scores), 3), np.round(np.std(scores), 3)))
    return scores

In [4]:
def tf_grid_search(params, train, numeric_features, means, stds):
    scores = []
    # iterate over parameters
    for hp in params:
        print("testing model: {}".format(hp))
    
        # define model
        model = tf.keras.Sequential()
    
        # add hidden layer
        for layer in hp:
            model.add(tf.keras.layers.Dense(layer, activation='relu'))
    
        # add output layer
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
        # train and test model
        scores.append(fit_cv_tf(model, train, numeric_features, means, stds))
    
    return(scores)

In [5]:
def fit_cv_tf(model, train, numeric_features, means, stds):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

    ## split data into kfolds and train model
    kf = KFold(n_splits=5, random_state=0)
    scores = []
    for train_index, test_index in kf.split(train):
        ## apply standardisation to df & make tf dataset
        x_train, y_train = process_data(train.iloc[train_index], numeric_features, means, stds)
        x_test, y_test = process_data(train.iloc[test_index], numeric_features, means, stds)
    
        ## fit
        model.fit(x_train, y_train, epochs=15, verbose=0)
    
        # predict test probabilities
        y_test_pred = model.predict(x_test)
    
        # calculate roc auc metric
        fpr, tpr, thresholds = roc_curve(y_test, y_test_pred)
        scores.append(auc(fpr, tpr))

    print("ROC AUC: avg {}, sd {}".format(np.round(np.mean(scores), 3), np.round(np.std(scores), 3)))
    return scores

In [6]:
def process_data(df, numeric_features, means, stds):
    
    # convert bool to int
    bool_columns = df.select_dtypes(['bool']).columns
    df[bool_columns] = df[bool_columns].astype('int')
    
    # normalise continuous variables
    for i, f in enumerate(numeric_features):
        df[f] = (df[f] - means[i]) / stds[i]
    
    # split x and y
    target = df.pop('STATUS_DISCHARGE')
    
    return df.values, target.values

## Read in data

In [7]:
# load preprocessed data (see feature selection notebook) from csv
train = pd.read_csv('train.esi.features.17.02.20.csv', index_col=0)
test = pd.read_csv('test.esi.features.17.02.20.csv', index_col=0)

# load ES1 & 2 scores
esi_prob = pd.read_csv('test.esi.score.17.02.20.csv', index_col=0)
esii_prob = pd.read_csv('test.esii.score.17.02.20.csv', index_col=0)
probs = pd.merge(esi_prob, esii_prob, how='left', left_index=True, right_index=True)

# merge outcome on probs
probs = pd.merge(probs, test['STATUS_DISCHARGE'], how='left', left_index=True, right_index=True)

# convert bool to int
bool_columns = train.select_dtypes(['bool']).columns
train[bool_columns] = train[bool_columns].astype('int')
bool_columns = test.select_dtypes(['bool']).columns
test[bool_columns] = test[bool_columns].astype('int')

# get mean and SD for **training** dataset to standardise variables
numeric_features = ['Age (continuous)']
desc = train[numeric_features].describe()
means = np.array(desc.T['mean'])
stds = np.array(desc.T['std'])

## Model development

### Logistic regression

In [8]:
clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=50000)
scores = fit_cv_sklearn(clf, train)

ROC AUC: avg 0.822, sd 0.016


### Neural network

#### One hidden layer

In [10]:
# define hyperparameters
params = [
    [18],
    [36],
    [54],
    [72],
    [90],
    [108],
    [126],
    [144]
]
scores = tf_grid_search(params, train, numeric_features, means, stds)

W0218 12:18:59.212293  2700 deprecation.py:506] From C:\Users\LyonMat\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


testing model: [18]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
W0218 12:18:59.660533  2700 deprecation.py:323] From C:\Users\LyonMat\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


ROC AUC: avg 0.826, sd 0.014
testing model: [36]
ROC AUC: avg 0.826, sd 0.008
testing model: [54]
ROC AUC: avg 0.828, sd 0.005
testing model: [72]
ROC AUC: avg 0.828, sd 0.008
testing model: [90]
ROC AUC: avg 0.835, sd 0.008
testing model: [108]
ROC AUC: avg 0.83, sd 0.006
testing model: [126]
ROC AUC: avg 0.833, sd 0.002
testing model: [144]
ROC AUC: avg 0.835, sd 0.007


#### Two hidden layers

In [11]:
# define hyperparameters
params = [
    [18, 18],
    [36, 18],
    [54, 18],
    [72, 18],
    [90, 18],
    [108, 18],
    [126, 18],
    [144, 18],
    [72, 36],
    [72, 54],
    [72, 72],
    [72, 90],
    [72, 108],
    [72, 126],
    [72, 144],
    [90, 36],
    [90, 54],
    [90, 72], # best
    [90, 90],
    [90, 108],
    [90, 126],
    [90, 144]
]
scores = tf_grid_search(params, train, numeric_features, means, stds)

testing model: [18, 18]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


ROC AUC: avg 0.825, sd 0.016
testing model: [36, 18]
ROC AUC: avg 0.831, sd 0.005
testing model: [54, 18]
ROC AUC: avg 0.834, sd 0.005
testing model: [72, 18]
ROC AUC: avg 0.834, sd 0.005
testing model: [90, 18]
ROC AUC: avg 0.823, sd 0.012
testing model: [108, 18]
ROC AUC: avg 0.825, sd 0.011
testing model: [126, 18]
ROC AUC: avg 0.829, sd 0.009
testing model: [144, 18]
ROC AUC: avg 0.834, sd 0.009
testing model: [72, 36]
ROC AUC: avg 0.829, sd 0.008
testing model: [72, 54]
ROC AUC: avg 0.834, sd 0.009
testing model: [72, 72]
ROC AUC: avg 0.836, sd 0.011
testing model: [72, 90]
ROC AUC: avg 0.839, sd 0.009
testing model: [72, 108]
ROC AUC: avg 0.828, sd 0.008
testing model: [72, 126]
ROC AUC: avg 0.834, sd 0.02
testing model: [72, 144]
ROC AUC: avg 0.822, sd 0.021
testing model: [90, 36]
ROC AUC: avg 0.842, sd 0.008
testing model: [90, 54]
ROC AUC: avg 0.832, sd 0.017
testing model: [90, 72]
ROC AUC: avg 0.834, sd 0.006
testing model: [90, 90]
ROC AUC: avg 0.834, sd 0.014
testing mode

#### Three hidden layers

In [None]:
# define hyperparameters
params = [
    [90, 36, 9],
    [90, 36, 18],
    [90, 36, 27],
    [90, 36, 36],
    [90, 18, 18],
    [90, 27, 18],
]
scores = tf_grid_search(params, train, numeric_features, means, stds)

testing model: [90, 36, 9]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


ROC AUC: avg 0.823, sd 0.01
testing model: [90, 36, 18]
ROC AUC: avg 0.832, sd 0.012
testing model: [90, 36, 27]


### Random forest

In [17]:
clf = RandomForestClassifier(random_state=0)
scores = fit_cv_sklearn(clf, train)

ROC AUC: avg 0.656, sd 0.027


#### Estimators

In [18]:
for p in [200, 300, 400, 500, 600, 700, 800]:
    clf = RandomForestClassifier(random_state=0, n_estimators=p)
    scores = fit_cv_sklearn(clf, train)

ROC AUC: avg 0.725, sd 0.021
ROC AUC: avg 0.731, sd 0.024
ROC AUC: avg 0.735, sd 0.023
ROC AUC: avg 0.738, sd 0.024
ROC AUC: avg 0.741, sd 0.023
ROC AUC: avg 0.743, sd 0.022
ROC AUC: avg 0.745, sd 0.024


#### Max depth

In [None]:
for p in [1, 5, 10, 20, 50]:
    clf = RandomForestClassifier(random_state=0, n_estimators=700, max_depth=p)
    scores = fit_cv_sklearn(clf, train)

#### Min samples split

In [None]:
for p in [2, 5, 10, 20]:
    clf = RandomForestClassifier(random_state=0, n_estimators=700, max_depth=10, min_samples_split=p)
    scores = fit_cv_sklearn(clf, train)

#### Min samples leaf

In [None]:
for p in [1, 2, 5, 10, 20, 30]:
    clf = RandomForestClassifier(random_state=0, n_estimators=700, max_depth=10, min_samples_split=5, min_samples_leaf=p)
    scores = fit_cv_sklearn(clf, train)

### Naive Bayes

In [19]:
clf = GaussianNB()
scores = fit_cv_sklearn(clf, train)

ROC AUC: avg 0.779, sd 0.024


### Final models

In [8]:
# LR
clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=50000)
prob = get_prob_sklearn(clf, train, test, "LR")
probs = pd.merge(probs, prob, how='left', left_index=True, right_index=True)

# RF
clf = RandomForestClassifier(random_state=0, n_estimators=700, max_depth=10, min_samples_split=5, min_samples_leaf=20)
prob = get_prob_sklearn(clf, train, test, "RF")
probs = pd.merge(probs, prob, how='left', left_index=True, right_index=True)

# NB
clf = GaussianNB()
prob = get_prob_sklearn(clf, train, test, "NB")
probs = pd.merge(probs, prob, how='left', left_index=True, right_index=True)

# NN
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense('90', activation='relu'))
model.add(tf.keras.layers.Dense('36', activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
x_train, y_train = process_data(train, numeric_features, means, stds)
x_test, y_test = process_data(test, numeric_features, means, stds)
model.fit(x_train, y_train, epochs=15, verbose=0)
y = model.predict(x_test)
y = pd.DataFrame(y, index=test.index, columns=['NN'])
probs = pd.merge(probs, y, how='left', left_index=True, right_index=True)

# write out final probabilities
probs.to_csv("probs.19.02.20.csv")

W0219 13:05:26.099060  5328 deprecation.py:506] From C:\Users\LyonMat\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0219 13:05:26.496416  5328 deprecation.py:323] From C:\Users\LyonMat\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
