# FNN_TE:  FNN with Embeddings for RFV text

This is a feed-forward neural network (FNN) that uses the same features as the FNN model, plus
embeddings of the Reason for Visit (RFV) textual descriptions.



In [1]:
%c inline
import pandas as pd
import numpy as np
import pickle
import json 
from sklearn.cross_validation import train_test_split
#from importlib import reload
from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

import sys 
sys.path.append("../../src/models/train_model")
import NN_Embeddings_model
sys.path.append("../../src/features")
import build_features, vital_signs_features, age_features, RFV_features
import RFV_text_vectorizing

%matplotlib inline

ERROR:root:Line magic function `%c` not found.
Using TensorFlow backend.


In [2]:
# Switch off pandas' chained-assignment check for this notebook (default='warn')
pd.set_option('mode.chained_assignment', None)

## Model Training

In [3]:
# Load the project configuration; later cells read the data directories from it.
with open('../../fileConfig.json') as config_file:
    fileConfig = json.load(config_file)

In [19]:
# Re-import the module so that edits made to it on disk are picked up without restarting the kernel.
# NOTE(review): bare reload() is a builtin only on Python 2; on Python 3 it has to come from importlib.
reload(NN_Embeddings_model)
# One-call training run, given the loaded configuration and the processed CSV file name.
# The recorded output below shows ten AUROC lines followed by a mean (+/- spread) summary --
# presumably one line per cross-validation fold; confirm in NN_Embeddings_model.
NN_Embeddings_model.FNN_TE_model_training(fileConfig, 'ED_TOTAL_2009_2009.csv')

Creating text for embeddings
Vocabulary size: 1603
Average text length: 12.6051971547
Max text length: 122
AUROC: 83.55%
AUROC: 83.89%
AUROC: 84.34%
AUROC: 87.75%
AUROC: 87.66%
AUROC: 85.67%
AUROC: 87.37%
AUROC: 86.41%
AUROC: 85.81%
AUROC: 83.80%
ROC AUC: 85.6242% (+/- 1.57%)


## Model Training, step by step

### Reading CDC File

In [4]:
# Processed-data location = base data directory + processed sub-directory, both taken from the config
processedDirectory = fileConfig['dataDirectory'] + fileConfig['processedDirectory']
# Load the 2009 emergency-department extract
cdc_input = pd.read_csv(processedDirectory + 'ED_TOTAL_2009_2009.csv')

### Feature Engineering

In [5]:
# Build predictors and target from the raw CDC frame. with_features_for_Embedding=True
# also keeps the textual descriptions of the RFV (Reason for Visit) codes.
predictors, target = build_features.get_all_features(
    cdc_input,
    normalize=True,
    with_features_for_Embedding=True)

Creating text for embeddings


In [6]:
# Names of all predictor columns
list(predictors.columns)

['Temp_Baseline',
 'Pulse_Baseline',
 'Sys_BP_Baseline',
 'Resp_Rate_Baseline',
 'Oxygen_Sat_Baseline',
 'Reason_Chest_Pain',
 'Reason_Abdominal_Pain',
 'Reason_Headache',
 'Reason_Shortness_of_Breath',
 'Reason_Back_Pain',
 'Reason_Cough',
 'Reason_Nausea_Vomiting',
 'Reason_Fever_Chills',
 'Reason_Syncope',
 'Reason_Dizziness',
 'Reason_Psychiatric_Complaint',
 'Reason_Nervous_System',
 'Reason_Cardiovascular_Other',
 'Reason_Ears_Eyes_Complaint',
 'Reason_Respiratory_Other',
 'Reason_Gastrointestinal_Other',
 'Reason_Genitourinary_Other',
 'Reason_Skin_Hair_Nails_Complaint',
 'Reason_Musculoskeletal_Other',
 'Reason_Injury_Poisoning',
 'Reason_Other',
 'Hypothermia',
 'Hyperthermia',
 'Bradycardia',
 'Mild_Tachycardia',
 'Moderate_Tachycardia',
 'Severe_Tachycardia',
 'Hypotension',
 'Hypertension',
 'Bradypnea',
 'Moderate_Tachypnea',
 'Severe_Tachypnea',
 'Mild_Hypoxia',
 'Severe_Hypoxia',
 'Age_18_30',
 'Age_31_40',
 'Age_41_50',
 'Age_51_60',
 'Age_61_70',
 'Age_71_80',
 'Age_81

### Pre-Processing RFV text for model Embeddings

In [39]:
# Displaying results for the first 5 records

In [7]:
# Lift the column-width limit so the RFV descriptions are not cut off.
# NOTE(review): -1 is the legacy "no limit" value; newer pandas expects None -- confirm the pandas version in use.
pd.set_option('display.max_colwidth', -1)
# First five records of the three RFV text columns
predictors[['RFV1_text', 'RFV2_text', 'RFV3_text']].head(5)

Unnamed: 0,RFV1_text,RFV2_text,RFV3_text
0,neck pain ache soreness discomfort,low back pain ache soreness discomfort,blank entry
1,leg pain ache soreness discomfort,blank entry,blank entry
2,stomach and abdominal pain cramps and spasms gastric pain,blank entry,blank entry
3,other,blank entry,blank entry
4,blank entry,blank entry,blank entry


In [8]:
# Turn the RFV text columns into fixed-length integer sequences:
#   1. append all RFVn_text columns into one text per record
#   2. vectorize: assign a number id to each word (the tokenizer holds the dictionary)
#   3. pad every vectorized text with zeroes to the same length
# Also returned: VOCAB_LENGTH (length of the dictionary) and max_text_length (maximum text length)
(predictors, rfv_data, max_text_length, VOCAB_LENGTH, tokenizer) = RFV_text_vectorizing.vectorize_RFV_text(
    predictors, debug=False, in_predictors=False)

Vocabulary size: 1603
Average text length: 12.6051971547
Max text length: 122


###  NN model with Embeddings 

In [9]:
# Hold out 10% of the records as a dev set. The engineered features, the vectorized RFV
# text and the target are split in one call so their rows stay aligned.
# random_state pins the shuffle so the same records land in train/dev on every run
# (the value itself is arbitrary).
X_train, X_dev, rfv_train, rfv_dev, y_train, y_dev = train_test_split(
    predictors, rfv_data, target, test_size=0.1, random_state=42)

In [58]:
# Pick up any on-disk changes to the model module before building the network
reload(NN_Embeddings_model)

# Network definition: per-record feature shape plus the text-embedding settings
model_cdc = NN_Embeddings_model.create_e_model(
    X_train.shape[1:],
    input_text_length=max_text_length,
    vocab_size=VOCAB_LENGTH,
    embedding_size=30,
    l2=0.001,
    units=100)

# Fit on the training split and evaluate on the dev split; the result is kept in roc_auc
roc_auc = NN_Embeddings_model.train_cdc_e_model(
    X_train, rfv_train, y_train,
    X_dev, rfv_dev, y_dev,
    num_epochs=10,
    verbose_flag=True,
    network=model_cdc)

Train on 21888 samples, validate on 2433 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
AUROC: 86.34%


In [32]:
# Pick up any on-disk changes to the model module before running cross-validation
reload(NN_Embeddings_model)
# max_text_length and VOCAB_LENGTH are the names the vectorizing cell assigns. The former
# max_seq_length / MAX_VOCAB are not defined anywhere in this notebook and raise NameError
# on a fresh kernel.
# NOTE(review): the first argument is 20 but the recorded output lists 10 AUROC values --
# confirm what this argument controls and re-run to refresh the output.
NN_Embeddings_model.cross_e_Validation(20, predictors, rfv_data, target, max_text_length, VOCAB_LENGTH,
                                       l2=0.005, embedding_size=30, units=100)

AUROC: 83.54%
AUROC: 83.87%
AUROC: 84.30%
AUROC: 87.77%
AUROC: 87.65%
AUROC: 85.69%
AUROC: 87.40%
AUROC: 86.41%
AUROC: 85.76%
AUROC: 83.77%
ROC AUC: 85.6168% (+/- 1.58%)


Original run from the w210 repository, for reference:
```
NN_Embeddings_model.cross_e_Validation (10, predictors, rfv_data, target,max_seq_length,MAX_VOCAB,l2 =0.005,
                                        embedding_size=30, units =100)
AUROC: 84.31%
AUROC: 84.32%
AUROC: 84.43%
AUROC: 88.06%
AUROC: 86.51%
AUROC: 85.52%
AUROC: 87.15%
AUROC: 86.58%
AUROC: 85.52%
AUROC: 83.71%
ROC AUC: 85.6107% (+/- 1.36%)
```