## Note:
This is a working notebook that runs the CNN-with-Attention model on the full dataset.

In [1]:
%matplotlib inline
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random
from collections import Counter, defaultdict
from operator import itemgetter
import matplotlib.pyplot as plt


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.models import load_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model
import lstm_model


Using TensorFlow backend.


In [2]:
# Load every discharge note together with its ICD-9 codes.
# Column names are passed explicitly, so pandas reads the first CSV line as data.
full_df = pd.read_csv(
    '../data/disch_notes_all_icd9.csv',
    names=['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT'],
)


In [3]:
# Work on `df` from here on. This binds a second name to the same DataFrame
# (no copy is made), so in-place edits made through `df` are also visible
# through `full_df`.
df = full_df
# Single-argument print in parentheses gives the same output on Python 2 and 3.
print(df.shape)

(52696, 5)


## Preprocessing ICD-9 codes

In [4]:
# First-level ICD-9 code ranges; these are the classes the model predicts
# (N_TOP is later passed as the number of output classes).
# NOTE(review): the ranges 280-289 and 740-759 do not appear in this list --
# confirm that the omission is intentional.
ICD9_FIRST_LEVEL = [
    '001-139', '140-239', '240-279', '290-319', '320-389',
    '390-459', '460-519', '520-579', '580-629', '630-679',
    '680-709', '710-739', '760-779', '780-789', '790-796',
    '797', '798', '799', '800-999',
]
N_TOP = len(ICD9_FIRST_LEVEL)

# Replace the leaf ICD-9 codes of every note with their grandparent
# (first-level) codes; the mapping itself lives in the helpers module.
df['ICD9'] = df['ICD9'].apply(
    lambda codes: helpers.replace_with_grandparent_codes(codes, ICD9_FIRST_LEVEL)
)


In [5]:
#counts by icd9_codes
icd9_codes = Counter()
for label in df['ICD9']:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
number_icd9_first_level = len (icd9_codes)
print icd9_codes
print 'Number of icd9 codes in the first level that have notes in the dataset:', number_icd9_first_level

Counter({'390-459': 41335, '240-279': 34600, '290-319': 27838, '460-519': 24577, '800-999': 21865, '580-629': 21253, '520-579': 20430, '780-789': 16270, '320-389': 15377, '001-139': 14212, '710-739': 9808, '140-239': 8579, '680-709': 5924, '760-779': 5454, '790-796': 4510, '799': 1624, '630-679': 156, '798': 1})
Number of icd9 codes in the first level that have notes in the dataset: 18


In [6]:
# Convert the ICD9 column into label vectors via the project's vectorization helper.
# Judging by the sample printed below, each note becomes a 0/1 vector with one
# position per entry of ICD9_FIRST_LEVEL (19 positions).
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)
# Show the first label vector as a sanity check.
print 'sample of vectorized icd9 labels: ', labels[0]


sample of vectorized icd9 labels:  [0 0 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0]


## Preprocess notes

In [7]:
# Preprocess the note text: clean it, vectorize it, then pad it.
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 5000 # to limit length of word sequence (None if no limit)
# Overwrites the TEXT column of df with the cleaned notes.
df.TEXT = vectorization.clean_notes(df, 'TEXT')
# MAX_VOCAB is rebound to the third value returned by the helper
# (the printed "Final Vocabulary" below), replacing the limit set above.
data_vectorized, dictionary, MAX_VOCAB = vectorization.vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)
# Likewise MAX_SEQ_LENGTH is rebound to the length reported back by pad_notes.
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data_vectorized, MAX_SEQ_LENGTH)

# Report the values that the model-building cell will use.
print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 139074
Average note length: 1634.982845
Max note length: 10924
Final Vocabulary: 139074
Final Max Sequence Length: 5000


In [8]:
# Width of each word vector; chosen to match the 100-d embedding file used later.
EMBEDDING_DIM = 100
# Empty placeholder so the name exists before the embedding matrix is built;
# the embedding-loading cell rebinds it.
EMBEDDING_MATRIX = list()

In [9]:
# Build the embedding matrix for the note vocabulary from the embedding file.
# NOTE(review): per the cell output only 20640 of the 139074 note words are
# present in the embedding file; how the remaining rows are initialised is up to
# the helper (possibly controlled by `sigma`) -- confirm.
EMBEDDING_LOC = '../data/notes.100.txt' # location of embedding
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True, sigma=True)


('Vocabulary in notes:', 139074)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 20640)


## Split data into train / validation / test sets

In [10]:
# Split the padded notes and label vectors into train / validation / test sets.
# With val_size=0.2 and test_size=0.1 the printed shapes below correspond to a
# 70% / 20% / 10% split of the 52696 notes. random_state is passed as a fixed
# value (presumably the shuffle seed -- confirm in helpers).
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)
# Under Python 2 these parenthesised multi-argument prints output a tuple,
# which is why the output below shows ('Train: ', (...), (...)).
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (36887, 5000), (36887, 19))
('Validation: ', (10539, 5000), (10539, 19))
('Test: ', (5270, 5000), (5270, 19))


In [11]:
# Delete temporary variables to free some memory now that the split arrays exist.
# `full_df` is a second name for the same DataFrame as `df` (bound by
# `df = full_df` earlier), so deleting `df` alone would leave the frame referenced.
# `data_vectorized` (the unpadded notes) is an intermediate that no later cell uses.
del full_df, df, data_vectorized, data, labels

## CNN and attention

In [12]:
import icd9_cnn_att

In [48]:
# Re-import the model module so that edits made to it since the first import
# take effect, then build a fresh CNN-with-attention model.
reload(icd9_cnn_att)

cnn_att_model = icd9_cnn_att.build_icd9_cnn_model(
    input_seq_length=MAX_SEQ_LENGTH,
    max_vocab=MAX_VOCAB,
    external_embeddings=True,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=EMBEDDING_MATRIX,
    num_filters=100,
    filter_sizes=[2, 3, 4, 5],
    training_dropout=0.5,
    num_classes=N_TOP,
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 5000, 100)     13907500    input_2[0][0]                    
____________________________________________________________________________________________________
conv1d_5 (Conv1D)                (None, 4999, 100)     20100       embedding_2[0][0]                
____________________________________________________________________________________________________
conv1d_6 (Conv1D)                (None, 4998, 100)     30100       embedding_2[0][0]                
___________________________________________________________________________________________

In [None]:
# NOTE(review): scratch expression with no bearing on the analysis -- candidate for deletion.
1 +1

In [None]:
# Train for up to 15 epochs in mini-batches of 50, evaluating on the validation
# split at the end of every epoch.
# NOTE(review): the saved output stops after "Epoch 6/15", so this run appears
# to have been interrupted.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=15,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

### Running with the new dropout and regularization settings
* dropout in the output layer


In [14]:
# Five training epochs with mini-batches of 50; the validation split is
# evaluated at the end of each epoch.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6c2da355d0>

In [15]:
# Checkpoint the model after the 5-epoch run above.
# NOTE(review): later sections of this notebook save to this same path, so the
# file on disk is whichever experiment wrote it last.
cnn_att_model.save('models/cnn_att_5_epochs_50k.h5')

In [16]:
# Score the training and validation splits with the current weights ...
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# ... and print the F1 table (training vs dev) across decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.526      0.528
0.030:      0.547      0.548
0.040:      0.564      0.565
0.050:      0.579      0.580
0.055:      0.585      0.587
0.058:      0.589      0.591
0.060:      0.592      0.593
0.080:      0.614      0.615
0.100:      0.634      0.634
0.200:      0.710      0.707
0.300:      0.759      0.753
0.400:      0.778      0.772
0.500:      0.774      0.765
0.600:      0.747      0.739
0.700:      0.697      0.690


In [37]:
# Replace the in-memory model with the 5-epoch checkpoint read back from disk.
cnn_att_model = load_model('models/cnn_att_5_epochs_50k.h5')

In [38]:
# Two further epochs on top of the reloaded 5-epoch checkpoint (7 in total).
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f678edc3d50>

In [40]:
# Checkpoint after 7 cumulative epochs (5 + 2).
# NOTE(review): the L2-regularization section further down saves to this same path.
cnn_att_model.save('models/cnn_att_7_epochs_50k.h5')

In [41]:
# Predictions of the 7-epoch model on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.530      0.531
0.030:      0.550      0.550
0.040:      0.566      0.566
0.050:      0.580      0.581
0.055:      0.587      0.587
0.058:      0.591      0.591
0.060:      0.593      0.593
0.080:      0.617      0.616
0.100:      0.637      0.635
0.200:      0.710      0.705
0.300:      0.763      0.755
0.400:      0.786      0.775
0.500:      0.785      0.772
0.600:      0.764      0.751
0.700:      0.721      0.711


In [42]:
# Two more epochs, bringing the cumulative total to 9.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f678edc3310>

In [43]:
# Checkpoint after 9 cumulative epochs (5 + 2 + 2).
cnn_att_model.save('models/cnn_att_9_epochs_50k.h5')

In [44]:
# Predictions of the 9-epoch model on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.524      0.525
0.030:      0.540      0.541
0.040:      0.557      0.556
0.050:      0.572      0.570
0.055:      0.579      0.577
0.058:      0.583      0.582
0.060:      0.586      0.584
0.080:      0.612      0.608
0.100:      0.633      0.629
0.200:      0.711      0.701
0.300:      0.764      0.749
0.400:      0.792      0.775
0.500:      0.798      0.779
0.600:      0.782      0.763
0.700:      0.741      0.724


In [None]:
# Next: train epochs 10 to 14 (5 more epochs on top of the 9-epoch checkpoint)

In [45]:
# Five more epochs on the 9-epoch model (cumulative epochs 10 through 14).
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6c28115b10>

In [46]:
# Checkpoint after 14 cumulative epochs (9 + 5).
cnn_att_model.save('models/cnn_att_14_epochs_50k.h5')

In [47]:
# Predictions of the 14-epoch model on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.551      0.546
0.030:      0.568      0.563
0.040:      0.583      0.576
0.050:      0.597      0.589
0.055:      0.604      0.595
0.058:      0.607      0.598
0.060:      0.610      0.601
0.080:      0.634      0.622
0.100:      0.656      0.641
0.200:      0.736      0.711
0.300:      0.788      0.751
0.400:      0.817      0.771
0.500:      0.826      0.775
0.600:      0.816      0.766
0.700:      0.789      0.742


### Restart from the 5-epoch checkpoint

In [25]:
# Go back to the saved 5-epoch checkpoint, discarding the in-memory model.
cnn_att_model = load_model('models/cnn_att_5_epochs_50k.h5')

  return cls(**config)


In [26]:
# Continue the reloaded 5-epoch checkpoint for another 5 epochs in a single run.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6b2c106f90>

In [27]:
# Predictions after the restarted run, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.539      0.538
0.030:      0.559      0.557
0.040:      0.577      0.574
0.050:      0.592      0.589
0.055:      0.599      0.596
0.058:      0.603      0.600
0.060:      0.606      0.602
0.080:      0.630      0.624
0.100:      0.650      0.643
0.200:      0.719      0.707
0.300:      0.766      0.748
0.400:      0.797      0.774
0.500:      0.807      0.782
0.600:      0.795      0.769
0.700:      0.758      0.735


### With L2 regularization and additional dropout

In [14]:
# First 5 epochs for this experiment; validation is evaluated after each epoch.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f93b2083e90>

In [15]:
# Checkpoint this experiment after its first 5 epochs.
# NOTE(review): this path is also written by the other experiment sections of
# this notebook, so saving here overwrites any earlier 5-epoch checkpoint.
cnn_att_model.save('models/cnn_att_5_epochs_50k.h5')

In [16]:
# Predictions after 5 epochs of this experiment, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.537      0.538
0.030:      0.564      0.565
0.040:      0.588      0.589
0.050:      0.607      0.608
0.055:      0.616      0.616
0.058:      0.620      0.620
0.060:      0.623      0.623
0.080:      0.648      0.648
0.100:      0.669      0.668
0.200:      0.744      0.739
0.300:      0.779      0.772
0.400:      0.789      0.780
0.500:      0.780      0.769
0.600:      0.750      0.741
0.700:      0.698      0.688


In [17]:
# Two further epochs for this experiment (7 in total).
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f93b3746590>

In [22]:
# Predictions after 7 epochs of this experiment, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.534      0.535
0.030:      0.558      0.558
0.040:      0.578      0.577
0.050:      0.594      0.594
0.055:      0.601      0.601
0.058:      0.605      0.605
0.060:      0.608      0.607
0.080:      0.631      0.629
0.100:      0.652      0.649
0.200:      0.727      0.717
0.300:      0.770      0.754
0.400:      0.796      0.775
0.500:      0.804      0.782
0.600:      0.793      0.772
0.700:      0.762      0.741


In [18]:
# Checkpoint this experiment after 7 cumulative epochs (5 + 2).
# NOTE(review): an earlier section of this notebook saved to this same path;
# this call overwrites that file.
cnn_att_model.save('models/cnn_att_7_epochs_50k.h5')

In [19]:
# Load the saved 5-epoch checkpoint into a separate variable, leaving
# `cnn_att_model` (now at 7 epochs) untouched.
model5 = load_model('models/cnn_att_5_epochs_50k.h5')

  return cls(**config)


In [20]:
# Train the reloaded 5-epoch checkpoint for a single additional epoch.
model5.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=1,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/1


<keras.callbacks.History at 0x7f93b37a8810>

In [21]:
# Predictions of `model5` (5-epoch checkpoint plus one epoch) on both splits.
pred_train, pred_dev = (
    model5.predict(X_train, batch_size=100),
    model5.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.539      0.541
0.030:      0.563      0.564
0.040:      0.581      0.582
0.050:      0.597      0.597
0.055:      0.604      0.604
0.058:      0.608      0.608
0.060:      0.611      0.611
0.080:      0.636      0.635
0.100:      0.658      0.656
0.200:      0.729      0.723
0.300:      0.770      0.760
0.400:      0.792      0.779
0.500:      0.797      0.782
0.600:      0.782      0.767
0.700:      0.745      0.731


### With an additional ReLU-activated dense layer and a dimension of 50

In [17]:
# First 5 epochs for this experiment; validation is evaluated after each epoch.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa4f6a6e7d0>

In [19]:
# Predictions after 5 epochs of this experiment, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.603      0.596
0.030:      0.628      0.619
0.040:      0.648      0.636
0.050:      0.665      0.650
0.055:      0.673      0.656
0.058:      0.677      0.660
0.060:      0.680      0.662
0.080:      0.705      0.684
0.100:      0.726      0.701
0.200:      0.794      0.757
0.300:      0.826      0.780
0.400:      0.839      0.788
0.500:      0.833      0.780
0.600:      0.808      0.756
0.700:      0.759      0.712


In [20]:
# Checkpoint this experiment after its first 5 epochs.
# NOTE(review): this path is also written by the earlier experiment sections of
# this notebook, so saving here overwrites their 5-epoch checkpoint.
cnn_att_model.save('models/cnn_att_5_epochs_50k.h5')

In [21]:
# Three additional epochs for this experiment (8 in total).
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=3,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 36887 samples, validate on 10539 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa53471ded0>

In [22]:
# Predictions after 8 epochs of this experiment, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.696      0.650
0.030:      0.722      0.668
0.040:      0.742      0.682
0.050:      0.758      0.693
0.055:      0.765      0.698
0.058:      0.769      0.701
0.060:      0.772      0.703
0.080:      0.794      0.717
0.100:      0.813      0.729
0.200:      0.866      0.760
0.300:      0.891      0.770
0.400:      0.899      0.770
0.500:      0.896      0.762
0.600:      0.884      0.747
0.700:      0.858      0.724


In [None]:
# Checkpoint this experiment after 8 cumulative epochs (5 + 3).
# The suffix is `_50k`, matching every other checkpoint written by this notebook
# (which trains on the full ~52k-note dataset); the previous `_5k` suffix
# mislabelled the file.
cnn_att_model.save('models/cnn_att_8_epochs_50k.h5')

In [None]:
# Two additional epochs for this experiment (10 in total).
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=1,
)

In [None]:
# Predictions after 10 epochs of this experiment, on the training and validation splits.
pred_train, pred_dev = (
    cnn_att_model.predict(X_train, batch_size=100),
    cnn_att_model.predict(X_val, batch_size=100),
)
# Print F1 (training vs dev) for a range of decision thresholds.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

In [56]:
# Checkpoint this experiment after 10 cumulative epochs (8 + 2).
# The suffix is `_50k`, matching every other checkpoint written by this notebook
# (which trains on the full ~52k-note dataset); the previous `_5k` suffix
# mislabelled the file.
cnn_att_model.save('models/cnn_att_10_epochs_50k.h5')