In [1]:
%load_ext autoreload
%autoreload 2
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import sys 

#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
sys.path.append("../pipeline")
import icd9_cnn_model
import database_selection
import vectorization
import helpers

Using TensorFlow backend.


In [2]:
# Load the discharge-note extract. Column names are supplied here via `names`
# rather than taken from the file.
df = pd.read_csv(
    '../data/disch_notes_all_icd9.csv',
    names=['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT'])

In [3]:
# Restrict the label space to N_TOP codes (presumably the most frequent ones --
# confirm in database_selection.filter_top_codes).
N_TOP = 20
full_df, top_codes = database_selection.filter_top_codes(
    df, 'ICD9', N_TOP, filter_empty=True)

# Work on the full filtered frame. For a quick smoke run, use
# full_df.head(1000) here instead.
df = full_df

In [4]:
# Preprocess ICD9 codes: build the label array from each note's 'ICD9' column,
# restricted to `top_codes`.
# NOTE(review): presumably one multi-hot row per note with one column per code
# in top_codes (the split shapes printed further down show 20 label columns).
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)


In [5]:
# Preprocess notes: clean the text, map it to word-id sequences, then pad the
# sequences to a common length.
MAX_VOCAB = None  # requested vocabulary cap (None = no limit)
MAX_SEQ_LENGTH = 5000  # requested cap on words per note (None = no limit)

df['TEXT'] = vectorization.clean_notes(df, 'TEXT')
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(
    df['TEXT'], MAX_VOCAB, verbose=True)
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)

# Both constants were just overwritten with the values returned by the helpers.
print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 130488
Average note length: 1728.09244863
Max note length: 10924
Final Vocabulary: 130488
Final Max Sequence Length: 5000


In [6]:
# Split into train / validation / test sets (validation 20%, test 10%), with a
# fixed random_state so the split is repeatable.
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)

# Use %-formatting (as the vocabulary prints above do) so each line is plain
# text. With several arguments, `print(a, b, c)` is a print *statement* applied
# to a tuple under Python 2 and shows the tuple repr, e.g. "('Train: ', (...), (...))".
print("Train: %s %s" % (X_train.shape, y_train.shape))
print("Validation: %s %s" % (X_val.shape, y_val.shape))
print("Test: %s %s" % (X_test.shape, y_test.shape))

('Train: ', (30794, 5000), (30794, 20))
('Validation: ', (8798, 5000), (8798, 20))
('Test: ', (4400, 5000), (4400, 20))


In [7]:
# Delete temporary variables to free some memory.
# `full_df` is included because `df = full_df` bound both names to the same
# DataFrame; deleting only `df` would leave the whole frame referenced.
del df, full_df, data, labels

In [8]:
# Create the embedding matrix for the note vocabulary.
# The vectors are pre-trained with all MIMIC notes; the GloVe file
# '../data/glove.6B.100d.txt' is kept here as the alternative source.
EMBEDDING_LOC = '../data/notes.100.txt'  # embedding file to load
EMBEDDING_DIM = 100  # vector size; presumably must match the file chosen above -- confirm
embedding_matrix, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC, dictionary, EMBEDDING_DIM, verbose=True)


('Vocabulary in notes:', 130488)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 20620)


## CNN for text classification

Based on the following papers and links:
* "Convolutional Neural Networks for Sentence Classification"   
* "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification"
* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras/blob/master/sentiment_cnn.py

In [9]:
# Build the CNN: pre-trained embeddings, 100 filters for each of the filter
# sizes 2-5, and one output per top ICD9 code.
#
# There is no explicit reload(icd9_cnn_model) here: `%autoreload 2` in the first
# cell already re-imports edited modules before each cell runs, and the bare
# `reload` builtin only exists on Python 2.
model = icd9_cnn_model.build_icd9_cnn_model(
    input_seq_length=MAX_SEQ_LENGTH,
    max_vocab=MAX_VOCAB,
    external_embeddings=True,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=embedding_matrix,
    num_filters=100,
    filter_sizes=[2, 3, 4, 5],
    training_dropout_keep_prob=0.5,
    num_classes=N_TOP)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 5000, 100)     13048900    input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 4999, 100)     20100       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 4998, 100)     30100       embedding_1[0][0]                
___________________________________________________________________________________________

In [10]:
# First 5 epochs. verbose=2 prints one summary line per epoch; the returned
# History object is left as the cell's displayed value.
model.fit(
    X_train, y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=2)

Train on 30794 samples, validate on 8798 samples
Epoch 1/5
1008s - loss: 0.4447 - acc: 0.8289 - val_loss: 0.3207 - val_acc: 0.8677
Epoch 2/5
984s - loss: 0.3245 - acc: 0.8698 - val_loss: 0.2738 - val_acc: 0.8868
Epoch 3/5
981s - loss: 0.2889 - acc: 0.8835 - val_loss: 0.2522 - val_acc: 0.8978
Epoch 4/5
980s - loss: 0.2708 - acc: 0.8915 - val_loss: 0.2422 - val_acc: 0.9047
Epoch 5/5
977s - loss: 0.2605 - acc: 0.8965 - val_loss: 0.2391 - val_acc: 0.9050


<keras.callbacks.History at 0x7f87e163d310>

In [11]:
# Checkpoint the model after the first 5 epochs.
# NOTE(review): the path is relative to the notebook's working directory;
# presumably the 'models/' directory must already exist -- confirm.
model.save('models/cnn_5_epochs_allr.h5')

In [12]:
# Score the training and validation ("dev") splits with the 5-epoch model, then
# print the F1 score for each at a range of decision thresholds.
pred_train = model.predict(X_train, batch_size=200)
pred_dev = model.predict(X_val, batch_size=200)
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.358      0.353
0.030:      0.407      0.400
0.040:      0.456      0.446
0.050:      0.502      0.488
0.055:      0.523      0.508
0.058:      0.534      0.518
0.060:      0.541      0.525
0.080:      0.602      0.582
0.100:      0.645      0.621
0.200:      0.732      0.704
0.300:      0.747      0.717
0.400:      0.738      0.707
0.500:      0.712      0.679
0.600:      0.668      0.631
0.700:      0.594      0.558


In [13]:
# 2 more epochs on the same model (7 in total after the first 5).
model.fit(
    X_train, y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=2)

Train on 30794 samples, validate on 8798 samples
Epoch 1/2
834s - loss: 0.2518 - acc: 0.9010 - val_loss: 0.2371 - val_acc: 0.9076
Epoch 2/2
837s - loss: 0.2437 - acc: 0.9041 - val_loss: 0.2367 - val_acc: 0.9075


<keras.callbacks.History at 0x7f87a57c4310>

In [14]:
# Checkpoint the model after 7 epochs (5 + 2).
model.save('models/cnn_7_epochs_allr.h5')

In [15]:
# Re-score both splits with the 7-epoch model and print the F1-by-threshold
# table again for comparison with the 5-epoch one.
pred_train = model.predict(X_train, batch_size=200)
pred_dev = model.predict(X_val, batch_size=200)
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.373      0.366
0.030:      0.424      0.412
0.040:      0.472      0.455
0.050:      0.515      0.494
0.055:      0.536      0.512
0.058:      0.548      0.522
0.060:      0.555      0.528
0.080:      0.622      0.587
0.100:      0.669      0.629
0.200:      0.759      0.713
0.300:      0.774      0.724
0.400:      0.767      0.714
0.500:      0.746      0.691
0.600:      0.708      0.651
0.700:      0.644      0.584


## early termination at this point

In [16]:
# Two further epochs on the same model (on top of the 5 + 2 above), to see what
# happens when training continues beyond the 7-epoch checkpoint.
model.fit(
    X_train, y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=2)

Train on 30794 samples, validate on 8798 samples
Epoch 1/2
949s - loss: 0.2371 - acc: 0.9068 - val_loss: 0.2388 - val_acc: 0.9073
Epoch 2/2
942s - loss: 0.2281 - acc: 0.9105 - val_loss: 0.2427 - val_acc: 0.9044


<keras.callbacks.History at 0x7f87a5d1e610>

In [17]:
# Checkpoint the model after the extra epochs.
# NOTE(review): the fit cells above add up to 5 + 2 + 2 = 9 epochs, so the
# "8_epochs" in this file name looks off by one. Confirm before renaming, since
# other code may load this exact path.
model.save('models/cnn_8_epochs_allr.h5')

In [18]:
# Re-score both splits after the extra epochs and print the F1-by-threshold
# table, to compare against the 7-epoch results.
pred_train = model.predict(X_train, batch_size=200)
pred_dev = model.predict(X_val, batch_size=200)
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.406      0.393
0.030:      0.470      0.447
0.040:      0.526      0.494
0.050:      0.574      0.534
0.055:      0.596      0.551
0.058:      0.607      0.561
0.060:      0.614      0.567
0.080:      0.674      0.617
0.100:      0.714      0.650
0.200:      0.793      0.712
0.300:      0.804      0.716
0.400:      0.792      0.700
0.500:      0.761      0.670
0.600:      0.713      0.627
0.700:      0.642      0.564


## Previous run with dropout = 0.9

Too much, right? The best dev F1 in this run is about 0.59 (threshold 0.3), versus about 0.72 for the run above.

In [20]:
# 10-epoch training run kept from a previous session (see the heading above).
# NOTE(review): the execution counts jump from 18 to 20 and the output below was
# not produced by the model built earlier in this notebook. Re-running this cell
# as-is would continue training that model instead of reproducing this run.
model.fit(
    X_train, y_train,
    batch_size=50,
    epochs=10,
    validation_data=(X_val, y_val),
    verbose=2)

Train on 30794 samples, validate on 8798 samples
Epoch 1/10
921s - loss: 0.6316 - acc: 0.7930 - val_loss: 0.4130 - val_acc: 0.8373
Epoch 2/10
921s - loss: 0.4279 - acc: 0.8351 - val_loss: 0.3831 - val_acc: 0.8482
Epoch 3/10
919s - loss: 0.4002 - acc: 0.8457 - val_loss: 0.3557 - val_acc: 0.8612
Epoch 4/10
913s - loss: 0.3839 - acc: 0.8532 - val_loss: 0.3416 - val_acc: 0.8663
Epoch 5/10
906s - loss: 0.3739 - acc: 0.8571 - val_loss: 0.3316 - val_acc: 0.8677
Epoch 6/10
909s - loss: 0.3677 - acc: 0.8595 - val_loss: 0.3270 - val_acc: 0.8684
Epoch 7/10
913s - loss: 0.3621 - acc: 0.8616 - val_loss: 0.3195 - val_acc: 0.8727
Epoch 8/10
909s - loss: 0.3571 - acc: 0.8635 - val_loss: 0.3174 - val_acc: 0.8727
Epoch 9/10
906s - loss: 0.3525 - acc: 0.8654 - val_loss: 0.3120 - val_acc: 0.8747
Epoch 10/10
908s - loss: 0.3502 - acc: 0.8660 - val_loss: 0.3148 - val_acc: 0.8727


<keras.callbacks.History at 0x7fe7ddd6f910>

In [22]:
# Score the training and validation ("dev") splits with the model from the
# 10-epoch run; the predictions are consumed by the F1 cell that follows.
pred_train = model.predict(X_train, batch_size=200)
pred_dev = model.predict(X_val, batch_size=200)

In [23]:
# Print the F1-by-threshold table for the 10-epoch run, using pred_train and
# pred_dev computed in the preceding cell.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.303      0.301
0.030:      0.313      0.311
0.040:      0.327      0.323
0.050:      0.342      0.338
0.055:      0.351      0.346
0.058:      0.357      0.351
0.060:      0.360      0.354
0.080:      0.403      0.394
0.100:      0.448      0.437
0.200:      0.610      0.589
0.300:      0.611      0.591
0.400:      0.559      0.538
0.500:      0.482      0.462
0.600:      0.393      0.376
0.700:      0.319      0.306


In [24]:
# Checkpoint the model from the 10-epoch run.
# NOTE(review): this belongs to the earlier run described in the heading above.
# If the notebook is executed top to bottom, `model` here would instead be the
# model trained in the preceding cells -- confirm which one this file should hold.
model.save('models/cnn_10_epochs_allr.h5')

### Results with external embeddings = True, no additional training, top 20
```
F1 scores
threshold | training | dev  
0.020:      0.337      0.329
0.030:      0.360      0.353
0.040:      0.365      0.374
0.050:      0.372      0.375
0.055:      0.370      0.377
0.058:      0.369      0.375
0.060:      0.368      0.375
0.080:      0.348      0.361
0.100:      0.309      0.319
0.200:      0.198      0.208
0.300:      0.157      0.138
0.500:      0.000      0.000
```

### Results with external embeddings = False, top 20
```
F1 scores
threshold | training | dev  
0.020:      0.288      0.300
0.030:      0.327      0.322
0.040:      0.371      0.363
0.050:      0.380      0.391
0.055:      0.412      0.383
0.058:      0.403      0.394
0.060:      0.394      0.389
0.080:      0.385      0.390
0.100:      0.229      0.225
0.200:      0.000      0.000
0.300:      0.000      0.000
0.500:      0.000      0.000
```

### Results with external embeddings and training them, top 20
```
F1 scores
threshold | training | dev  
0.020:      0.334      0.333
0.030:      0.362      0.360
0.040:      0.366      0.374
0.050:      0.373      0.380
0.055:      0.374      0.382
0.058:      0.376      0.376
0.060:      0.376      0.378
0.080:      0.387      0.371
0.100:      0.366      0.350
0.200:      0.179      0.171
0.300:      0.020      0.020
0.500:      0.000      0.000

```

### Results with external embeddings = False, top 10
We can compare this setup with the LSTM published in the paper "Applying Deep Learning to ICD-9 Multi-label Classification from Medical Records": they report an F1-score of about 0.4168, while we are getting 0.447.

``` 
F1 scores
threshold | training | dev  
0.020:      0.399      0.407
0.030:      0.399      0.407
0.040:      0.399      0.407
0.050:      0.408      0.413
0.055:      0.433      0.420
0.058:      0.437      0.430
0.060:      0.432      0.427
0.080:      0.501      0.463
0.100:      0.446      0.463
0.200:      0.206      0.066
0.300:      0.000      0.000
0.500:      0.000      0.000
```



## Notes:


(1) There is an LSTM model in the paper "Applying Deep Learning to ICD-9 Multi-label Classification from Medical Records" (https://cs224d.stanford.edu/reports/priyanka.pdf) that achieved a 42% F1-score, but it only uses the top 10 ICD9 codes. We are getting 46% (running with just 1000 notes so far).


(2) The study "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping" (https://arxiv.org/abs/1703.08705) got a 70% F1-score, but it does not use the ICD9 labels; it uses phenotype labels that the authors annotated themselves (via a group of medical professionals). There were ONLY 10 phenotypes.

The discharge summaries are labeled with ICD9 codes that are leaves in the ICD9 hierarchy (which has hundreds of ICD9 codes), so these leaf nodes may be too specific and difficult to predict. One experiment would be to replace all the ICD9 codes with their parent at the second or third level of the hierarchy and see whether predictions work better that way.

(3) Our baseline with the top 20 codes had an F1-score of 35% (assigning the top 4 ICD9 codes to all notes). Using a CNN with no external embeddings gets about 40% F1-score, a little better than the baseline.