In [1]:
import pandas as pd
import numpy as np
#from importlib import reload
from sklearn.cross_validation import train_test_split
import json 
from collections import Counter 

from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import pickle
import sys 
sys.path.append("../../src/data/")
import make_dataset
sys.path.append("../../src/models/train_model")
import NN_VE_model
import attention_layer
sys.path.append("../../src/features")
import build_features, vital_signs_features, age_features, RFV_features, RFV_text_vectorizing
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Turn off pandas' chained-assignment check for the whole notebook (pandas default: 'warn').
pd.set_option('mode.chained_assignment', None)

## Reading CDC File

In [3]:
# Parse the project-level JSON config (two folders above this notebook) into `fileConfig`.
with open('../../fileConfig.json') as cfg_handle:
    fileConfig = json.load(cfg_handle)

In [4]:
# Build the processed-data folder path from the two config entries (plain concatenation,
# so the entries are expected to carry their own path separators), then load the CSV.
processedDirectory = ''.join([fileConfig['dataDirectory'], fileConfig['processedDirectory']])
cdc_input = pd.read_csv(processedDirectory + 'ED_TOTAL_2009_2009.csv')

## Preprocessing

In [5]:
# Re-import build_features so the current on-disk version of the module is used.
reload(build_features)
# Derive the model inputs from the raw frame. The three flags ask for the
# embedding-text features and for both targets, which come back as `target` and `rss`.
predictors, target, rss = build_features.get_features(
    cdc_input,
    with_features_for_Embedding=True,
    with_target=True,
    with_rss_target=True,
)

Creating text for embeddings


## Vectorizing text for Embeddings

In [6]:
# Vectorize the RFV text held in `predictors`. Note that `predictors` is rebound to the
# returned frame, so this cell should be run exactly once after the preprocessing cell.
# The meaning of the other return values is inferred from their names (sequence length,
# vocabulary size, fitted tokenizer) -- confirm in RFV_text_vectorizing.
(predictors,
 max_seq_length,
 MAX_VOCAB,
 tokenizer) = RFV_text_vectorizing.vectorize_RFV_text(predictors, debug=False)

Vocabulary size: 1603
Average text length: 12.6051971547
Max text length: 122


In [9]:
# Show the column labels of the predictor frame.
list(predictors.columns)

['Temp_Baseline',
 'Pulse_Baseline',
 'Sys_BP_Baseline',
 'Resp_Rate_Baseline',
 'Oxygen_Sat_Baseline',
 'Reason_Chest_Pain',
 'Reason_Abdominal_Pain',
 'Reason_Headache',
 'Reason_Shortness_of_Breath',
 'Reason_Back_Pain',
 'Reason_Cough',
 'Reason_Nausea_Vomiting',
 'Reason_Fever_Chills',
 'Reason_Syncope',
 'Reason_Dizziness',
 'Reason_Psychiatric_Complaint',
 'Reason_Nervous_System',
 'Reason_Cardiovascular_Other',
 'Reason_Ears_Eyes_Complaint',
 'Reason_Respiratory_Other',
 'Reason_Gastrointestinal_Other',
 'Reason_Genitourinary_Other',
 'Reason_Skin_Hair_Nails_Complaint',
 'Reason_Musculoskeletal_Other',
 'Reason_Injury_Poisoning',
 'Reason_Other',
 'Hypothermia',
 'Hyperthermia',
 'Bradycardia',
 'Mild_Tachycardia',
 'Moderate_Tachycardia',
 'Severe_Tachycardia',
 'Hypotension',
 'Hypertension',
 'Bradypnea',
 'Moderate_Tachypnea',
 'Severe_Tachypnea',
 'Mild_Hypoxia',
 'Severe_Hypoxia',
 'Age_18_30',
 'Age_31_40',
 'Age_41_50',
 'Age_51_60',
 'Age_61_70',
 'Age_71_80',
 'Age_81

## NN model

In [15]:
# Build a model purely to print its layer summary. The sizes used here
# (n_units=100, embedding_nh=100) differ from the 50/50 used by the training cells.
nn_model = NN_VE_model.create_model(
    l2=0.0001,
    n_units=100,
    apply_attention=True,
    embedding_nh=100,
    input_text_length=max_seq_length,
    vocab_size=MAX_VOCAB,
)
nn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_

  context_vector =merge([att_weights, inputs], mode='dot', dot_axes=(1,1), name='context_vector_c'+i)
  name=name)


## Train Model

In [16]:
# Hold out 10% of the rows as a dev set. The split is seeded explicitly so the same rows
# are held out on every run; without random_state the dev set changed between runs.
SPLIT_SEED = 0
X_train, X_dev, y_train, y_dev = train_test_split(
    predictors, rss, test_size=0.1, random_state=SPLIT_SEED)

# Convert each frame into the input form the NN_VE_model training functions are given
# (presumably one array per model input -- confirm in NN_VE_model.get_x_list).
X_train_list = NN_VE_model.get_x_list(X_train)
X_dev_list = NN_VE_model.get_x_list(X_dev)

In [17]:
# Re-import NN_VE_model so the current on-disk version is used, then train on the
# train split with the dev split passed alongside it. The call returns a ROC-AUC
# result and the fitted model (per the names they are bound to).
reload(NN_VE_model)
roc_auc, cdc_model = NN_VE_model.train_RSS_model(
    X_train_list, y_train,
    X_dev_list, y_dev,
    num_epochs=40,
    l2=0.0001,
    n_units=50,
    apply_attention=True,
    embedding_nh=50,
    n_layers=3,
    att_l2=0.0001,
    input_text_length=max_seq_length,
    vocab_size=MAX_VOCAB,
    verbose=False,
)

AUROC[0]: 81.16%
AUROC[1]: 75.54%
AUROC[2]: 82.47%
Mean AUROC: 79.72%


## Cross Validation

In [10]:
from sklearn.model_selection import StratifiedKFold

In [11]:

# 10-fold stratified cross-validation of the RSS model.

# Inputs and hyper-parameters shared by every fold.
pp = predictors
nepochs = 40
target = rss  # NOTE: rebinds `target` from the preprocessing cell to the RSS target
units_n = 50
input_text_length = max_seq_length
vocab_size = MAX_VOCAB
att_l2 = 0.0001
l2 = 0.0001
apply_attention = True
embedding_nh = 50
n_layers = 3

# np.random.seed() returns None, so `seed = np.random.seed(0)` would leave `seed` as None
# and the splitter would get no explicit random_state. Keep the integer and use it for
# both the global NumPy RNG and the fold splitter.
seed = 0
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# One entry per fold; each entry is whatever train_RSS_model returns as its first value.
# The summary cells below unpack it as a 4-tuple (presumably the three per-class AUROCs
# followed by their mean), so do not average `cvscores` directly with np.mean.
cvscores = []
for train, test in kfold.split(pp, target):
    # `train` / `test` are positional row indices for this fold.
    X_train = NN_VE_model.get_x_list(pp.iloc[train])
    X_dev = NN_VE_model.get_x_list(pp.iloc[test])
    y_train = target.iloc[train]
    y_dev = target.iloc[test]
    roc, model = NN_VE_model.train_RSS_model(
        X_train, y_train, X_dev, y_dev,
        num_epochs=nepochs, l2=l2, n_units=units_n,
        apply_attention=apply_attention,
        embedding_nh=embedding_nh, n_layers=n_layers,
        input_text_length=input_text_length, vocab_size=vocab_size, att_l2=att_l2,
        verbose=False)
    cvscores.append(roc)


  context_vector =merge([att_weights, inputs], mode='dot', dot_axes=(1,1), name='context_vector_c'+i)
  name=name)


AUROC[0]: 80.68%
AUROC[1]: 72.50%
AUROC[2]: 82.42%
Mean AUROC: 78.53%
AUROC[0]: 80.51%
AUROC[1]: 74.79%
AUROC[2]: 82.68%
Mean AUROC: 79.33%
AUROC[0]: 80.90%
AUROC[1]: 74.69%
AUROC[2]: 83.15%
Mean AUROC: 79.58%
AUROC[0]: 80.89%
AUROC[1]: 73.53%
AUROC[2]: 83.19%
Mean AUROC: 79.20%
AUROC[0]: 80.80%
AUROC[1]: 72.87%
AUROC[2]: 83.24%
Mean AUROC: 78.97%
AUROC[0]: 80.94%
AUROC[1]: 73.57%
AUROC[2]: 83.25%
Mean AUROC: 79.26%
AUROC[0]: 79.20%
AUROC[1]: 73.28%
AUROC[2]: 81.76%
Mean AUROC: 78.08%
AUROC[0]: 80.00%
AUROC[1]: 72.94%
AUROC[2]: 81.89%
Mean AUROC: 78.28%
AUROC[0]: 79.96%
AUROC[1]: 73.21%
AUROC[2]: 81.14%
Mean AUROC: 78.10%
AUROC[0]: 81.77%
AUROC[1]: 73.16%
AUROC[2]: 83.19%
Mean AUROC: 79.37%


In [12]:
# no resources: cross-validated mean of the first score in each fold's result (AUROC[0]).
# np.mean replaces the builtin reduce(), which exists only on Python 2.
np.mean([fold_scores[0] for fold_scores in cvscores])

0.80564322554280776

In [13]:
# 1 resource: cross-validated mean of the second score in each fold's result (AUROC[1]).
# np.mean replaces the builtin reduce(), which exists only on Python 2.
np.mean([fold_scores[1] for fold_scores in cvscores])

0.73453661475581666

In [14]:
# more than 2 resources: cross-validated mean of the third score in each fold's result
# (AUROC[2]). NOTE(review): the class label is kept as originally written; it may mean
# "2 or more" -- confirm against how build_features encodes the RSS target.
# np.mean replaces the builtin reduce(), which exists only on Python 2.
np.mean([fold_scores[2] for fold_scores in cvscores])

0.82591773568405247

Original results from the w210 repository, for comparison:

```
Cross validated:
    AUC ROC[0]:0.7989092408302745
    AUC ROC[1]:0.7466151138245335
    AUC ROC[2]:0.8163632162013006
    AUC Avg   :0.7872958569520362
```

## Train Model with all data

In [15]:
# Convert the complete predictor frame (no train/dev split) with get_x_list.
# NOTE: this rebinds X_train_list, which earlier held only the 90% training split.
X_train_list = NN_VE_model.get_x_list(predictors)

In [21]:
# Re-import NN_VE_model so the current on-disk version is used, then train the final
# model on every row. The hyper-parameters are the same values passed to train_RSS_model
# in the earlier training and cross-validation cells.
reload(NN_VE_model)
rss_model = NN_VE_model.train_full_RSS_model(
    X_train_list, rss,
    num_epochs=40,
    l2=0.0001,
    n_units=50,
    apply_attention=True,
    embedding_nh=50,
    n_layers=3,
    att_l2=0.0001,
    input_text_length=max_seq_length,
    vocab_size=MAX_VOCAB,
    verbose=False,
)

In [22]:
# Write the model trained on all data to the project's models folder. The file name,
# including its upper-case '.H5' suffix, is intentionally left exactly as it was.
rss_model.save('../../models/cdc_rss_2009_nn_att_text_embedding.H5')