In [1]:
## just a workbook, we will add more comments in the notebook for submission

In [1]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model
import hatt_model

Using TensorFlow backend.


In [2]:
#reading file
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [3]:
# taking just a subset of the records for developing models
df = full_df.sample(frac=0.1).reset_index(drop=True)
print df.shape
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,166312,73058,2191-05-08 00:00:00,2386 3363 73313 5121 5180 9971 42789 42731,Admission Date: [**2191-4-28**] ...
1,122462,24709,2193-07-14 00:00:00,76514 769 76525 7766 77081 7716,Admission Date: [**2193-5-21**] Dischar...
2,100881,89934,2122-03-23 00:00:00,5602 42732 5609 4019 60000 55090 45981 42731 4...,Admission Date: [**2122-3-17**] ...
3,104552,5178,2183-11-02 00:00:00,77081 7765,Admission Date: [**2183-10-26**] Discha...
4,181099,12091,2195-07-07 00:00:00,4589 28529 51881 42731 55010 5921 5920 52510 2594,Admission Date: [**2195-7-4**] D...
5,170423,11026,2115-04-19 00:00:00,41011 78551 4275 5849 486 9975 4160 4241 40391,Admission Date: [**2115-4-14**] Dischar...
6,149717,48006,2193-09-10 00:00:00,431 5990 43811 43889 72989 42731 4019 7291 4280,Admission Date: [**2193-9-3**] D...
7,111432,31989,2159-01-09 00:00:00,41011 78551 34510 4260 4275 4280 5990 2851 998...,Admission Date: [**2158-12-18**] ...
8,142926,43254,2169-04-05 00:00:00,5750 5849 486 5119 42732 7907 42731 25000 0413...,Admission Date: [**2169-3-24**] ...
9,177031,29592,2201-07-02 00:00:00,85200 486 51881 1970 1985 87342 3320,Admission Date: [**2201-6-26**] ...


## Pre processing ICD 9 codes

In [4]:
ICD9_FIRST_LEVEL = [
    '001-139','140-239','240-279','290-319', '320-389', '390-459','460-519', '520-579', '580-629', 
    '630-679', '680-709','710-739', '760-779', '780-789', '790-796', '797', '798', '799', '800-999' ]
N_TOP = len(ICD9_FIRST_LEVEL)
# replacing leave ICD9 codes with the grandparents
df['ICD9'] = df['ICD9'].apply(lambda x: helpers.replace_with_grandparent_codes(x,ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,166312,73058,2191-05-08 00:00:00,390-459 460-519 800-999 140-239 320-389 710-739,Admission Date: [**2191-4-28**] ...
1,122462,24709,2193-07-14 00:00:00,760-779,Admission Date: [**2193-5-21**] Dischar...
2,100881,89934,2122-03-23 00:00:00,580-629 240-279 390-459 520-579,Admission Date: [**2122-3-17**] ...
3,104552,5178,2183-11-02 00:00:00,760-779,Admission Date: [**2183-10-26**] Discha...
4,181099,12091,2195-07-07 00:00:00,240-279 390-459 290-319 460-519 520-579 580-629,Admission Date: [**2195-7-4**] D...
5,170423,11026,2115-04-19 00:00:00,580-629 780-789 390-459 460-519 800-999,Admission Date: [**2115-4-14**] Dischar...
6,149717,48006,2193-09-10 00:00:00,580-629 390-459 710-739,Admission Date: [**2193-9-3**] D...
7,111432,31989,2159-01-09 00:00:00,390-459 290-319 800-999 780-789 320-389 580-62...,Admission Date: [**2158-12-18**] ...
8,142926,43254,2169-04-05 00:00:00,240-279 001-139 390-459 460-519 520-579 320-38...,Admission Date: [**2169-3-24**] ...
9,177031,29592,2201-07-02 00:00:00,140-239 320-389 460-519 800-999,Admission Date: [**2201-6-26**] ...


In [5]:
#preprocess icd9 codes
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessin here is a little different sice we want to keep dots and other characters to be able to split the notes into sentences

In [6]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [8]:
note_sentences = []
notes = []

for idx in range(df.shape[0]):
    # for every note
    text = clean_str(df["TEXT"][idx].encode('ascii','ignore'))
    notes.append(text)
    sentences = tokenize.sent_tokenize(text)
    note_sentences.append(sentences)   



In [9]:
note_sentences[0][0:5]

['admission date:  [**2191-4-28**]              discharge date:   [**2191-5-8**] date of birth:  [**2132-3-19**]             sex:   m service: neurosurgery allergies: patient recorded as having no known allergies to drugs attending:[**first name3 (lf) 2724**] chief complaint: 59 yo m with newly discovered t6 mass infectious vs malignant now with le sensory paresthesias and l1 mass major surgical or invasive procedure: l thoracotomy; t6 vertebrectomy; ant cage/plate history of present illness: 59 year old man who has been undergoing evaluation for worsening mid-back pain over about 6 weeks.',
 'over the past couple of weeks, he has developed numbness and tingling throughout his legs which has extended cranially and now involves the entirety of both legs, his groin, and the lower part of his abdomen.',
 'he describes feeling his legs give way at times when trying to climb the stairs.',
 'a lumbar spine mri at [**hospital3 2568**] revealed a mass in his l1 vertebral body thought to be eit

### what will be the MAX_SENTS and MAX_SENT_LENGTH ?

In [10]:
note_sentences_length =[len(x) for x in note_sentences]
print "Average number of sentences in a note: ", np.mean(note_sentences_length)  
print "Max number of sentences in a note: ", max(note_sentences_length)

Average number of sentences in a note:  104.133965844
Max number of sentences in a note:  468


In [11]:
sum(np.array(note_sentences_length) > 150)

892

In [12]:
sentence_flat_list = [sentence for note_sentence in note_sentences for sentence in note_sentence]
sentence_words_length =[len(text_to_word_sequence(sentence)) for sentence in sentence_flat_list]
print "Average number of words in a sentence: ", np.mean(sentence_words_length)  
print "Max number of words in a sentence: ", max(sentence_words_length)

Average number of words in a sentence:  16.920389733
Max number of words in a sentence:  2916


In [13]:
sum(np.array(sentence_words_length) > 250)

1620

In [14]:
MAX_NB_WORDS = None
MAX_SENTS = 150
MAX_SENT_LENGTH  = 250

MAX_VOCAB = None # to limit original number of words (None if no limit)
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(notes)




In [15]:
dictionary = tokenizer.word_index

In [16]:
MAX_NB_WORDS = len(tokenizer.word_index)  #vocabulary length
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, one_note_sentences in enumerate(note_sentences):
    for j, sentence in enumerate(one_note_sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sentence)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    note_matrix[i,j,k] = tokenizer.word_index[word]
                    k+=1

In [17]:
note_matrix[0][0]

array([   53,    57,  1520,    17,   161,    31,    57,  1520,    15,
          29,    57,     3,   252,  1053,    13,   180,   279,   169,
         118,  1283,   176,    20,   834,    33,   481,    14,   108,
         176,     4,   627,   326,    66,   302,   258,  9866,   355,
         330,  1405,   881,   169,     7,  4070,  4649,  5952,   425,
         696,   489,  2657,   433,     7,  1258,  1792,  6782,     2,
        3772,   425,   333,   196,    32,   341,   199,   274,  4288,
        5952, 10216,  6425,  9148,  5783,    44,     3,   150,   259,
        1405,   183,   271,  1083,   214,    69,   137,  4289,   473,
          12,   632,   650,   310,    58,   265,   732,    22,   152,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Split Files

In [18]:
#creating embeddings
#EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_LOC = '../data/notes.100.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 66340)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 19916)


In [19]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    note_matrix, labels, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 150, 250), (3688, 19))
('Validation: ', (1054, 150, 250), (1054, 19))
('Test: ', (528, 150, 250), (528, 19))


## Hierarchical Attention NN
based on paper: Hierarchical Attention networks for document classification

In [27]:
reload(hatt_model)
h_att_model = hatt_model.build_hierarhical_att_model(MAX_SENTS, MAX_SENT_LENGTH, num_filters = 100, filter_sizes=[2,3,4,5],
                         max_vocab=MAX_NB_WORDS, embedding_dim=EMBEDDING_DIM , embedding_matrix=EMBEDDING_MATRIX ,
                         num_classes=N_TOP)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 250)           0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 250, 100)      6634100     input_8[0][0]                    
____________________________________________________________________________________________________
bidirectional_8 (Bidirectional)  (None, 250, 100)      45300       embedding_2[0][0]                
____________________________________________________________________________________________________
time_distributed_16 (TimeDistrib (None, 250, 50)       5050        bidirectional_8[0][0]            
___________________________________________________________________________________________

In [28]:
# Train the model
h_att_model.fit(X_train, y_train, batch_size=50, epochs=1, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/1


<keras.callbacks.History at 0x7fd162a0e050>

In [29]:
h_att_model.fit(X_train, y_train, batch_size=50, epochs=4, validation_data=(X_val, y_val), verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fd161114c90>

In [30]:
pred_train = h_att_model.predict(X_train, batch_size=200)
pred_dev = h_att_model.predict(X_val, batch_size=200)

## Performance Evaluation

In [31]:
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.514      0.512
0.030:      0.535      0.533
0.040:      0.551      0.549
0.050:      0.564      0.563
0.055:      0.570      0.568
0.058:      0.573      0.571
0.060:      0.575      0.572
0.080:      0.593      0.591
0.100:      0.605      0.603
0.200:      0.658      0.655
0.300:      0.683      0.675
0.400:      0.680      0.672
0.500:      0.653      0.639
0.600:      0.598      0.585
0.700:      0.524      0.510


In [32]:
h_att_model.save('models/h_att_model_20_epochs_5k_B.h5')
#model = load_model('models/cnn_20_epochs.h5')

## Notes:
* This is looking good! with just ONE Epoch, we get a F1score of 60%, almost the same than 5 Epochs with the CNN mocdel (f1=63%)
* It was super slow though, that is why I didn't run the 5 epcchs.. it took like 2 hours to run 1 epoch.. I'll work on improving that (hopefully it can be improved)

### first run with defuatl GRU(100) and Dense (200)

```
Epoch 1/1
3688/3688 [==============================] - 3534s - loss: 0.7549 - acc: 0.7228 - val_loss: 0.7396 - val_acc: 0.7249

F1 scores
threshold | training | dev  
0.020:      0.556      0.551
0.030:      0.579      0.574
0.040:      0.585      0.579
0.050:      0.597      0.591
0.055:      0.601      0.593
0.058:      0.602      0.594
0.060:      0.608      0.602
0.080:      0.563      0.558
0.100:      0.529      0.522
0.200:      0.000      0.000
0.300:      0.000      0.000
0.500:      0.000      0.000
```
5 epochs ,internal embeddings, sigmoid,  5,000 records.. word GRU = 5,adam optimizer 
```


Epoch 1/5
325s - loss: 0.6031 - acc: 0.7653 - val_loss: 0.4463 - val_acc: 0.7737
Epoch 2/5
323s - loss: 0.4461 - acc: 0.7737 - val_loss: 0.4429 - val_acc: 0.7761
Epoch 3/5
323s - loss: 0.4393 - acc: 0.7779 - val_loss: 0.4366 - val_acc: 0.7836
Epoch 4/5
322s - loss: 0.4272 - acc: 0.7885 - val_loss: 0.4249 - val_acc: 0.7924
Epoch 5/5
324s - loss: 0.4132 - acc: 0.8013 - val_loss: 0.4095 - val_acc: 0.8038

F1 scores
threshold | training | dev  
0.020:      0.514      0.508
0.030:      0.525      0.518
0.040:      0.534      0.526
0.050:      0.542      0.534
0.055:      0.546      0.538
0.058:      0.548      0.541
0.060:      0.549      0.542
0.080:      0.565      0.557
0.100:      0.579      0.571
0.200:      0.634      0.624
0.300:      0.661      0.648
0.400:      0.660      0.643
0.500:      0.619      0.603
0.600:      0.539      0.521
0.700:      0.429      0.417

```

## 5 epochs with GRU =50, no embedding, changed optimization
```
Train on 3688 samples, validate on 1054 samples
Epoch 1/5
3688/3688 [==============================] - 1705s - loss: 0.4686 - acc: 0.7746 - val_loss: 0.4364 - val_acc: 0.7819
Epoch 2/5
3688/3688 [==============================] - 1711s - loss: 0.4299 - acc: 0.7885 - val_loss: 0.4206 - val_acc: 0.7956
Epoch 3/5
3688/3688 [==============================] - 1719s - loss: 0.4078 - acc: 0.8036 - val_loss: 0.4098 - val_acc: 0.8014
Epoch 4/5
3688/3688 [==============================] - 1709s - loss: 0.3962 - acc: 0.8124 - val_loss: 0.4040 - val_acc: 0.8075
Epoch 5/5
3688/3688 [==============================] - 1718s - loss: 0.3905 - acc: 0.8180 - val_loss: 0.4001 - val_acc: 0.8118

F1 scores
threshold | training | dev  
0.020:      0.520      0.512
0.030:      0.536      0.528
0.040:      0.552      0.543
0.050:      0.564      0.556
0.055:      0.570      0.560
0.058:      0.572      0.563
0.060:      0.574      0.564
0.080:      0.593      0.582
0.100:      0.608      0.599
0.200:      0.668      0.650
0.300:      0.683      0.661
0.400:      0.675      0.649
0.500:      0.632      0.603
0.600:      0.550      0.521
0.700:      0.439      0.410
```
