In [1]:
## just a workbook, we will add more comments in the notebook for submission

In [2]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model
import hatt_model

Using TensorFlow backend.


In [3]:
# Read the discharge-notes CSV; the five column names are supplied explicitly via `names`.
# NOTE(review): this presumes the file has no header row of its own -- confirm.
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [4]:
# taking just a subset (10%) of the records for developing models
# random_state pins the sample so that a rerun of the notebook works on the same rows
# (101 is the same seed value the train/val/test split cell uses)
df = full_df.sample(frac=0.1, random_state=101).reset_index(drop=True)
print(df.shape)
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,144384,8846,2181-12-25 00:00:00,4241 99702 41401 4019 60000,Admission Date: [**2181-12-17**] ...
1,164649,78870,2132-08-27 00:00:00,2630 5130 4821 51881 5849 5184 0389 99592 0318...,Admission Date: [**2132-8-7**] D...
2,114614,8546,2164-07-12 00:00:00,0382 42099 5119 42731 2859,Admission Date: [**2164-7-3**] Discharge ...
3,148970,29739,2128-11-25 00:00:00,4372 73300 27549 71233 56400,Admission Date: [**2128-11-19**] ...
4,111512,99864,2128-05-25 00:00:00,5780 43820 78791 7873 7921 9916 29040 4370 438...,Admission Date: [**2128-5-24**] ...
5,137229,59986,2185-08-10 00:00:00,9663 34839 34541 5569 3694 49390 4019 2449 272...,Admission Date: [**2185-8-4**] D...
6,172712,28291,2132-07-03 00:00:00,41071 42741 5849 78820 42821 4280 5990 99672 4...,Admission Date: [**2132-6-17**] ...
7,131384,21674,2112-04-04 00:00:00,76518 76527 7742,Admission Date: [**2112-3-27**] Dischar...
8,107708,14585,2150-08-12 00:00:00,042 2848 5845 261 1120 00845 0785 5781 03843 9...,Admission Date: [**2150-7-29**] ...
9,119244,8360,2119-11-09 00:00:00,76517 76527,Unit No: [**Numeric Identifier 62648**] Admis...


## Pre processing ICD 9 codes

In [5]:
# Code ranges that every ICD9 code in a record is mapped onto; these are the labels of the model.
# NOTE(review): the ranges '280-289' and '740-759' do not appear in this list -- confirm
# that the omission is intentional.
ICD9_FIRST_LEVEL = [
    '001-139','140-239','240-279','290-319', '320-389', '390-459','460-519', '520-579', '580-629', 
    '630-679', '680-709','710-739', '760-779', '780-789', '790-796', '797', '798', '799', '800-999' ]
# number of label classes (passed to the model as num_classes)
N_TOP = len(ICD9_FIRST_LEVEL)
# replacing leaf ICD9 codes with their grandparent ranges
# NOTE: this overwrites df['ICD9'] in place, so running the cell a second time feeds the
# already-replaced values back into the helper.
df['ICD9'] = df['ICD9'].apply(lambda x: helpers.replace_with_grandparent_codes(x,ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,144384,8846,2181-12-25 00:00:00,580-629 390-459 800-999,Admission Date: [**2181-12-17**] ...
1,164649,78870,2132-08-27 00:00:00,240-279 001-139 390-459 290-319 460-519 520-57...,Admission Date: [**2132-8-7**] D...
2,114614,8546,2164-07-12 00:00:00,001-139 390-459 290-319 460-519,Admission Date: [**2164-7-3**] Discharge ...
3,148970,29739,2128-11-25 00:00:00,240-279 390-459 710-739 520-579,Admission Date: [**2128-11-19**] ...
4,111512,99864,2128-05-25 00:00:00,240-279 390-459 290-319 520-579 800-999 780-78...,Admission Date: [**2128-5-24**] ...
5,137229,59986,2185-08-10 00:00:00,240-279 390-459 290-319 460-519 520-579 800-99...,Admission Date: [**2185-8-4**] D...
6,172712,28291,2132-07-03 00:00:00,240-279 390-459 460-519 520-579 800-999 780-78...,Admission Date: [**2132-6-17**] ...
7,131384,21674,2112-04-04 00:00:00,760-779,Admission Date: [**2112-3-27**] Dischar...
8,107708,14585,2150-08-12 00:00:00,240-279 001-139 290-319 520-579 800-999 780-78...,Admission Date: [**2150-7-29**] ...
9,119244,8360,2119-11-09 00:00:00,760-779,Unit No: [**Numeric Identifier 62648**] Admis...


In [6]:
# Turn the ICD9 column into the label array used for training.
# NOTE(review): presumably one indicator column per entry of `top_codes` (the split cell
# reports 19 label columns) -- confirm in vectorization.py.
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessing here is a little different, since we want to keep periods and other punctuation so that the notes can be split into sentences.

In [7]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [8]:
def clean_str(string):
    """
    Light-weight cleaning of a note before sentence splitting.

    Backslashes, single quotes and double quotes are removed; the result is
    stripped of leading/trailing whitespace and lower-cased. Every other
    character (periods included) is left untouched.
    """
    # a single character class covers the three characters that are removed
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

In [9]:
# Clean every note once (characters that cannot be encoded as ASCII are dropped first),
# then split each cleaned note into its sentences.
notes = [clean_str(raw_note.encode('ascii', 'ignore')) for raw_note in df["TEXT"]]
note_sentences = [tokenize.sent_tokenize(note) for note in notes]



In [10]:
# Peek at the first five sentences of the first note.
note_sentences[0][0:5]

['admission date:  [**2181-12-17**]              discharge date:   [**2181-12-25**] date of birth:  [**2107-5-29**]             sex:   m service: cardiothoracic allergies: patient recorded as having no known allergies to drugs attending:[**first name3 (lf) 1283**] chief complaint: mr. [**known lastname 59944**] is a 76 yo gentleman who has a 1 year h/o worsening sob and exertional chest discomfort.',
 'major surgical or invasive procedure: s/p cabg/avr [**12-17**]   lima-lad, svg-om, svg-ramus, svg-pda   avr-23mm ce perimount magna pericardial history of present illness: mr. [**known lastname 59944**] has a 1 year h/o dyspnea on exertion and shortness of breath that has been gettting progressively worse.',
 'he underwent an ett which was positive for fatigue/dyspnea and st depression.',
 'past medical history: htn djd bph cad/aortic stenosis s/p turp social history: remote smoker llives with wife in [**name2 (ni) **] drinks 1 glass of wine a day family history: sister-cabg at 72 pertin

### what will be the MAX_SENTS and MAX_SENT_LENGTH ?

In [11]:
# Number of sentences found in each note
note_sentences_length = list(map(len, note_sentences))
# single-argument print(...) produces the same text as the two-item print statement
print("%s %s" % ("Average number of sentences in a note: ", np.mean(note_sentences_length)))
print("%s %s" % ("Max number of sentences in a note: ", max(note_sentences_length)))

Average number of sentences in a note:  105.068121442
Max number of sentences in a note:  546


In [12]:
# How many notes have more than 150 sentences?
sum(np.array(note_sentences_length) > 150)

925

In [13]:
# Flatten the per-note sentence lists into one list holding every sentence
sentence_flat_list = []
for sentences_in_note in note_sentences:
    sentence_flat_list.extend(sentences_in_note)
# Number of word tokens in each sentence
sentence_words_length = [len(text_to_word_sequence(one_sentence)) for one_sentence in sentence_flat_list]
print("%s %s" % ("Average number of words in a sentence: ", np.mean(sentence_words_length)))
print("%s %s" % ("Max number of words in a sentence: ", max(sentence_words_length)))

Average number of words in a sentence:  16.9376242756
Max number of words in a sentence:  3658


In [14]:
# How many sentences have more than 250 words?
sum(np.array(sentence_words_length) > 250)

1591

In [15]:
# Dimensions of the (note, sentence, word) matrix: notes are cut to MAX_SENTS sentences
# and sentences to MAX_SENT_LENGTH words.
MAX_NB_WORDS = None  # no cap on the tokenizer vocabulary
MAX_SENTS = 150
MAX_SENT_LENGTH = 250

MAX_VOCAB = None # to limit original number of words (None if no limit)
# `num_words` is the Keras 2 name of the argument that used to be called `nb_words`
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(notes)




In [16]:
# word -> integer index mapping built by the tokenizer from the notes
dictionary = tokenizer.word_index

In [17]:
# Keras word indices run from 1 to len(word_index); 0 is left for padding.
MAX_NB_WORDS = len(tokenizer.word_index)  #vocabulary length
# note_matrix[i, j, k] holds the index of the k-th kept word of sentence j of note i
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

word_index = tokenizer.word_index
for i, one_note_sentences in enumerate(note_sentences):
    # sentences past MAX_SENTS are dropped
    for j, sentence in enumerate(one_note_sentences[:MAX_SENTS]):
        # words missing from the index map to 0 and are filtered out below
        # instead of raising KeyError
        word_ids = [word_index.get(word, 0) for word in text_to_word_sequence(sentence)]
        # `<=` (not `<`): the word with the highest index is part of the vocabulary too
        word_ids = [word_id for word_id in word_ids if 0 < word_id <= MAX_NB_WORDS]
        # words past MAX_SENT_LENGTH are dropped; the rest of the row stays zero-padded
        word_ids = word_ids[:MAX_SENT_LENGTH]
        if word_ids:
            note_matrix[i, j, :len(word_ids)] = word_ids

In [18]:
# Word indices of the first sentence of the first note (unused positions stay 0).
note_matrix[0][0]

array([   54,    56,  1423,    42,   138,    31,    56,  1423,    42,
          82,    56,     4,   257,  1479,    15,   161,   284,   156,
         120,  1274,   180,    20,   834,    33,   489,    14,   113,
         180,     3,   619,   330,    66,   304,   256,  5665,   360,
         346,   417,   113,   259, 22903,    19,     8,  1396,   865,
        2268,   216,    72,     8,     6,   186,   376,    95,   616,
         806,     2,  3133,    79,  1878,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Split Files

In [19]:
#creating embeddings
# Builds an embedding matrix for the notes' vocabulary (`dictionary`) from the GloVe file.
# NOTE(review): the model-building call in this notebook does not receive EMBEDDING_MATRIX --
# confirm whether the pre-trained vectors are meant to be used.
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 66594)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 25356)


In [20]:
#split sets
# The note matrix and the labels are split together into train / validation / test parts;
# val_size, test_size and random_state are handed to helpers.train_val_test_split.
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    note_matrix, labels, val_size=0.2, test_size=0.1, random_state=101)
# report the shapes of each split
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 150, 250), (3688, 19))
('Validation: ', (1054, 150, 250), (1054, 19))
('Test: ', (528, 150, 250), (528, 19))


## Hierarchical Attention NN
Based on the paper: Hierarchical Attention Networks for Document Classification (Yang et al., 2016)

In [47]:
# Re-import hatt_model explicitly before building the model.
# NOTE(review): `reload` is a builtin only on Python 2, and `%autoreload 2` is already enabled
# in the imports cell -- this call looks redundant; confirm and remove.
reload(hatt_model)
# Build the hierarchical attention model for notes of MAX_SENTS sentences x MAX_SENT_LENGTH words,
# with one output per first-level ICD9 code (N_TOP).
h_att_model = hatt_model.build_hierarhical_att_model(MAX_SENTS, MAX_SENT_LENGTH, num_filters = 100, filter_sizes=[2,3,4,5],
                         max_vocab=MAX_NB_WORDS, embedding_dim=EMBEDDING_DIM , 
                         num_classes=N_TOP)

  output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_16 (InputLayer)            (None, 250)           0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 250, 100)      6659500     input_16[0][0]                   
____________________________________________________________________________________________________
bidirectional_11 (Bidirectional) (None, 250, 100)      45300       embedding[0][0]                  
____________________________________________________________________________________________________
permute_11 (Permute)             (None, 100, 250)      0           bidirectional_11[0][0]           
___________________________________________________________________________________________

In [None]:
# Train the model for 5 epochs in batches of 50, reporting validation metrics after each epoch
h_att_model.fit(X_train, y_train, batch_size=50, epochs=5, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f682e63bfd0>

In [None]:
# Model outputs for the training and validation sets (`pred_dev` refers to the validation split)
pred_train = h_att_model.predict(X_train, batch_size=200)
pred_dev = h_att_model.predict(X_val, batch_size=200)

## Performance Evaluation

In [42]:
# Compare training and validation predictions against the true labels.
# NOTE(review): judging by the printed table, the helper reports F1 at a range of
# decision thresholds -- confirm in helpers.py.
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.514      0.508
0.030:      0.525      0.518
0.040:      0.534      0.526
0.050:      0.542      0.534
0.055:      0.546      0.538
0.058:      0.548      0.541
0.060:      0.549      0.542
0.080:      0.565      0.557
0.100:      0.579      0.571
0.200:      0.634      0.624
0.300:      0.661      0.648
0.400:      0.660      0.643
0.500:      0.619      0.603
0.600:      0.539      0.521
0.700:      0.429      0.417


In [51]:
# Persist the trained model to disk.
# NOTE(review): the file name says "20_epochs" while the fit call in this notebook uses
# epochs=5 -- confirm which run this file is meant to hold.
h_att_model.save('models/h_att_model_20_epochs_5k_B.h5')
#model = load_model('models/cnn_20_epochs.h5')

## Notes:
* This is looking good! With just ONE epoch we get an F1 score of 60%, almost the same as 5 epochs with the CNN model (F1 = 63%).
* It was super slow, though, which is why I didn't run the 5 epochs: it took about 2 hours to run 1 epoch. I'll work on improving that (hopefully it can be improved).

### First run with default GRU(100) and Dense(200)

```
Epoch 1/1
3688/3688 [==============================] - 3534s - loss: 0.7549 - acc: 0.7228 - val_loss: 0.7396 - val_acc: 0.7249

F1 scores
threshold | training | dev  
0.020:      0.556      0.551
0.030:      0.579      0.574
0.040:      0.585      0.579
0.050:      0.597      0.591
0.055:      0.601      0.593
0.058:      0.602      0.594
0.060:      0.608      0.602
0.080:      0.563      0.558
0.100:      0.529      0.522
0.200:      0.000      0.000
0.300:      0.000      0.000
0.500:      0.000      0.000
```
5 epochs, internal embeddings, sigmoid, 5,000 records, word GRU = 5, Adam optimizer
```


Epoch 1/5
325s - loss: 0.6031 - acc: 0.7653 - val_loss: 0.4463 - val_acc: 0.7737
Epoch 2/5
323s - loss: 0.4461 - acc: 0.7737 - val_loss: 0.4429 - val_acc: 0.7761
Epoch 3/5
323s - loss: 0.4393 - acc: 0.7779 - val_loss: 0.4366 - val_acc: 0.7836
Epoch 4/5
322s - loss: 0.4272 - acc: 0.7885 - val_loss: 0.4249 - val_acc: 0.7924
Epoch 5/5
324s - loss: 0.4132 - acc: 0.8013 - val_loss: 0.4095 - val_acc: 0.8038

F1 scores
threshold | training | dev  
0.020:      0.514      0.508
0.030:      0.525      0.518
0.040:      0.534      0.526
0.050:      0.542      0.534
0.055:      0.546      0.538
0.058:      0.548      0.541
0.060:      0.549      0.542
0.080:      0.565      0.557
0.100:      0.579      0.571
0.200:      0.634      0.624
0.300:      0.661      0.648
0.400:      0.660      0.643
0.500:      0.619      0.603
0.600:      0.539      0.521
0.700:      0.429      0.417

```

## 5 epochs with GRU =50, no embedding, changed optimization
```
Train on 3688 samples, validate on 1054 samples
Epoch 1/5
3688/3688 [==============================] - 1705s - loss: 0.4686 - acc: 0.7746 - val_loss: 0.4364 - val_acc: 0.7819
Epoch 2/5
3688/3688 [==============================] - 1711s - loss: 0.4299 - acc: 0.7885 - val_loss: 0.4206 - val_acc: 0.7956
Epoch 3/5
3688/3688 [==============================] - 1719s - loss: 0.4078 - acc: 0.8036 - val_loss: 0.4098 - val_acc: 0.8014
Epoch 4/5
3688/3688 [==============================] - 1709s - loss: 0.3962 - acc: 0.8124 - val_loss: 0.4040 - val_acc: 0.8075
Epoch 5/5
3688/3688 [==============================] - 1718s - loss: 0.3905 - acc: 0.8180 - val_loss: 0.4001 - val_acc: 0.8118
```
