## Note:
this is a workbook notebook for testing the baseline, lstm and cnn model... the final notebook will have much more examples and will have visualization on how the data looks

In [1]:
%matplotlib inline
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random
from collections import Counter, defaultdict
from operator import itemgetter
import matplotlib.pyplot as plt


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.models import load_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model
import lstm_model


Using TensorFlow backend.


In [2]:
#reading file
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])


In [3]:
full_df.shape

(52696, 5)

In [4]:
# taking just a subset of the records for developing models
df = full_df.sample(frac=0.1).reset_index(drop=True)
#df = full_df
print df.shape
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,156934,22577,2120-01-28 00:00:00,41071 42821 4280 5990 41401 3051 25000 2724 4019,Admission Date: [**2120-1-22**] ...
1,165217,85327,2128-04-10 00:00:00,5849 07044 2761 78959 99812 2851 2869 514 8070...,Admission Date: [**2128-4-2**] D...
2,198570,30581,2116-08-26 00:00:00,41401 4111 412 4019 2724 25000 4439 2449 79439...,"Name: [**Known lastname **],[**Known firstnam..."
3,156818,26179,2168-12-23 00:00:00,486 5119 5109 2851 2761 27652 79902 45829 5191...,Admission Date: [**2168-12-10**] ...
4,184002,15207,2101-08-31 00:00:00,41401 4111 9971 42731 20410 9992 2720 49390,Admission Date: [**2101-8-24**] Dischar...
5,101153,24089,2129-04-27 00:00:00,0389 5990 59080 591 42731 40391 4280 5849 2859,Admission Date: [**2129-4-21**] Discharge...
6,104448,17709,2197-11-24 00:00:00,39891 3962 42731 20280 4400 4019 25000 2749 27...,Admission Date: [**2197-11-10**] ...
7,157907,45,2129-06-20 00:00:00,8249 9984 8730 87344 9110 9130 3051,Admission Date: [**2129-6-10**] Dischar...
8,104434,62476,2173-04-13 00:00:00,41401 9971 4139 42731 30000 4019 2724 71500,Admission Date: [**2173-4-9**] D...
9,145129,28163,2173-03-30 00:00:00,0380 5722 2760 5990 51881 5070 430 99592 5716 ...,Admission Date: [**2173-3-18**] ...


## Pre processing ICD 9 codes

In [5]:
# instead of finding out the top 20 leave icd-9 codes and filter records based on that
# we will use all records and replace the leave icd-9 codes with its grandparents code in the first level of the hierarchy
#N_TOP = 20 
#full_df, top_codes = database_selection.filter_top_codes(df, 'ICD9', N_TOP, filter_empty = True)
#df = full_df.head(1000)

In [6]:
ICD9_FIRST_LEVEL = [
    '001-139','140-239','240-279','290-319', '320-389', '390-459','460-519', '520-579', '580-629', 
    '630-679', '680-709','710-739', '760-779', '780-789', '790-796', '797', '798', '799', '800-999' ]
N_TOP = len(ICD9_FIRST_LEVEL)
# replacing leave ICD9 codes with the grandparents
df['ICD9'] = df['ICD9'].apply(lambda x: helpers.replace_with_grandparent_codes(x,ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,156934,22577,2120-01-28 00:00:00,580-629 240-279 390-459 290-319,Admission Date: [**2120-1-22**] ...
1,165217,85327,2128-04-10 00:00:00,240-279 001-139 390-459 290-319 460-519 520-57...,Admission Date: [**2128-4-2**] D...
2,198570,30581,2116-08-26 00:00:00,240-279 390-459 290-319 799 790-796,"Name: [**Known lastname **],[**Known firstnam..."
3,156818,26179,2168-12-23 00:00:00,240-279 001-139 390-459 290-319 460-519 520-57...,Admission Date: [**2168-12-10**] ...
4,184002,15207,2101-08-31 00:00:00,240-279 390-459 140-239 460-519 800-999,Admission Date: [**2101-8-24**] Dischar...
5,101153,24089,2129-04-27 00:00:00,001-139 580-629 390-459 290-319,Admission Date: [**2129-4-21**] Discharge...
6,104448,17709,2197-11-24 00:00:00,240-279 390-459 140-239 520-579,Admission Date: [**2197-11-10**] ...
7,157907,45,2129-06-20 00:00:00,290-319 800-999,Admission Date: [**2129-6-10**] Dischar...
8,104434,62476,2173-04-13 00:00:00,240-279 390-459 710-739 290-319 800-999,Admission Date: [**2173-4-9**] D...
9,145129,28163,2173-03-30 00:00:00,240-279 001-139 390-459 290-319 460-519 520-57...,Admission Date: [**2173-3-18**] ...


In [7]:
#counts by icd9_codes
icd9_codes = Counter()
for label in df['ICD9']:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
number_icd9_first_level = len (icd9_codes)
print icd9_codes
print 'Number of icd9 codes in the first level that have notes in the dataset:', number_icd9_first_level

Counter({'390-459': 4094, '240-279': 3411, '290-319': 2781, '460-519': 2451, '800-999': 2107, '520-579': 2104, '580-629': 2100, '780-789': 1572, '320-389': 1566, '001-139': 1361, '710-739': 964, '140-239': 869, '680-709': 586, '760-779': 565, '790-796': 380, '799': 159, '630-679': 18})
Number of icd9 codes in the first level that have notes in the dataset: 17


In [9]:
#preprocess icd9 codes to vectors 
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)
print 'sample of vectorized icd9 labels: ', labels[0]


sample of vectorized icd9 labels:  [0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]


## Pre process Notes

In [10]:
#preprocess notes
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 5000 # to limit length of word sequence (None if no limit)
df.TEXT = vectorization.clean_notes(df, 'TEXT')
data_vectorized, dictionary, MAX_VOCAB = vectorization.vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data_vectorized, MAX_SEQ_LENGTH)

print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 45221
Average note length: 1640.62903226
Max note length: 8185
Final Vocabulary: 45221
Final Max Sequence Length: 5000


In [11]:
EMBEDDING_DIM = 100 # given the glove that we chose
EMBEDDING_MATRIX= []

In [12]:
#creating glove embeddings
EMBEDDING_LOC = '../data/notes.100.txt' # location of embedding
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True, sigma=True)


('Vocabulary in notes:', 45221)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 19967)


## Split Files

In [14]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 5000), (3688, 19))
('Validation: ', (1054, 5000), (1054, 19))
('Test: ', (528, 5000), (528, 19))


In [15]:
# Delete temporary variables to free some memory
del df, data, labels

## CNN and attention

In [16]:
import icd9_cnn_att

In [50]:
reload(icd9_cnn_att)
#### build model
cnn_att_model = icd9_cnn_att.build_icd9_cnn_model (input_seq_length=MAX_SEQ_LENGTH, max_vocab = MAX_VOCAB,
                             external_embeddings = True,
                             embedding_dim=EMBEDDING_DIM,embedding_matrix=EMBEDDING_MATRIX,
                             num_filters = 100, filter_sizes=[2,3,4,5],
                             training_dropout=0.5,
                             num_classes=N_TOP )

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_9 (Embedding)          (None, 5000, 100)     4522200     input_9[0][0]                    
____________________________________________________________________________________________________
conv1d_33 (Conv1D)               (None, 4999, 100)     20100       embedding_9[0][0]                
____________________________________________________________________________________________________
conv1d_34 (Conv1D)               (None, 4998, 100)     30100       embedding_9[0][0]                
___________________________________________________________________________________________

In [51]:
# 5 epochs
cnn_att_model.fit(X_train, y_train, batch_size=50, epochs=5, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f72103b4990>

In [52]:
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.530      0.526
0.030:      0.545      0.540
0.040:      0.560      0.555
0.050:      0.574      0.569
0.055:      0.581      0.577
0.058:      0.585      0.580
0.060:      0.587      0.582
0.080:      0.608      0.603
0.100:      0.625      0.620
0.200:      0.689      0.675
0.300:      0.734      0.717
0.400:      0.753      0.728
0.500:      0.741      0.717
0.600:      0.705      0.683
0.700:      0.645      0.618


In [53]:
cnn_att_model.save('models/cnn_att_5_epochs_5k.h5')

In [54]:
# 5 more epochs
cnn_att_model.fit(X_train, y_train, batch_size=50, epochs=5, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f721039df90>

In [55]:
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.585      0.566
0.030:      0.609      0.587
0.040:      0.629      0.604
0.050:      0.646      0.617
0.055:      0.655      0.621
0.058:      0.659      0.625
0.060:      0.662      0.627
0.080:      0.687      0.643
0.100:      0.709      0.656
0.200:      0.788      0.704
0.300:      0.830      0.727
0.400:      0.845      0.729
0.500:      0.839      0.720
0.600:      0.813      0.695
0.700:      0.763      0.651


In [56]:
cnn_att_model.save('models/cnn_att_10_epochs_5k.h5')

In [57]:
cnn_att_model.fit(X_train, y_train, batch_size=50, epochs=5, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6eb1b74e90>

### Note: overfitting, the model overtis 5k records

In [58]:
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.668      0.613
0.030:      0.697      0.630
0.040:      0.719      0.645
0.050:      0.738      0.654
0.055:      0.747      0.660
0.058:      0.751      0.663
0.060:      0.754      0.664
0.080:      0.779      0.676
0.100:      0.801      0.686
0.200:      0.865      0.713
0.300:      0.898      0.720
0.400:      0.913      0.719
0.500:      0.913      0.714
0.600:      0.903      0.696
0.700:      0.876      0.671


In [59]:
cnn_att_model.save('models/cnn_att_15_epochs_5k.h5')