In [1]:
## Just a workbook; we will add more comments to the notebook before submission.

In [2]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model

Using TensorFlow backend.


In [3]:
# Load the discharge notes together with their ICD-9 codes.
# Column names are supplied explicitly via `names`.
note_columns = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT']
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv', names=note_columns)

In [4]:
# taking just a subset of the records for developing models
df = full_df.sample(frac=0.1).reset_index(drop=True)
print df.shape
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,156816,2251,2189-06-27 00:00:00,4241 4019 53081 2449,Admission Date: [**2189-6-22**] Dischar...
1,190759,32680,2152-05-26 00:00:00,34120 5990 99812 2800 2869 4019 2720 3682 92232,Admission Date: [**2152-5-10**] ...
2,181175,27162,2171-09-03 00:00:00,431 5856 3484 48283 40391 75313 34292 00845 41...,Admission Date: [**2171-8-15**] ...
3,160651,72541,2192-06-23 00:00:00,2724 81002 81109 80130 3485 80718 9515 9514 78...,Admission Date: [**2192-6-16**] ...
4,176849,6596,2107-02-26 00:00:00,769 76519,Admission Date: [**2107-2-24**] Dischar...
5,133975,31160,2184-10-12 00:00:00,486 0389 42823 78552 78551 5849 4271 99731 428...,Admission Date: [**2184-9-27**] ...
6,180297,32452,2111-12-25 00:00:00,74685 74190 4430 4139,Admission Date: [**2111-12-15**] ...
7,112656,16732,2137-01-08 00:00:00,44024 9971 4280 496 70715 45829 2930 25060 3572,Admission Date: [**2136-12-30**] Discha...
8,177462,28621,2125-09-10 00:00:00,9654 570 29590 5845 2760 2967,Admission Date: [**2125-8-29**] ...
9,144109,27710,2127-11-10 00:00:00,1983 1970 3314,Admission Date: [**2127-11-1**] ...


## Pre processing ICD 9 codes

In [5]:
# First-level ICD-9 groups used as the prediction targets.
# NOTE(review): the ranges 280-289 and 740-759 are not in this list. In the
# displayed sample, notes carrying 28xx / 74xx codes appear to end up in a
# neighbouring group instead -- confirm that this is the intended behaviour of
# helpers.replace_with_grandparent_codes.
ICD9_FIRST_LEVEL = [
    '001-139', '140-239', '240-279', '290-319', '320-389',
    '390-459', '460-519', '520-579', '580-629', '630-679',
    '680-709', '710-739', '760-779', '780-789', '790-796',
    '797', '798', '799', '800-999',
]
N_TOP = len(ICD9_FIRST_LEVEL)

# Replace the leaf ICD-9 codes of every note with their first-level groups.
# NOTE: this overwrites df['ICD9'] in place, so the cell is meant to be run
# once per freshly sampled `df`.
df['ICD9'] = df['ICD9'].apply(
    lambda codes: helpers.replace_with_grandparent_codes(codes, ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,156816,2251,2189-06-27 00:00:00,240-279 390-459 520-579,Admission Date: [**2189-6-22**] Dischar...
1,190759,32680,2152-05-26 00:00:00,240-279 390-459 290-319 800-999 320-389 580-629,Admission Date: [**2152-5-10**] ...
2,181175,27162,2171-09-03 00:00:00,240-279 760-779 001-139 390-459 290-319 460-51...,Admission Date: [**2171-8-15**] ...
3,160651,72541,2192-06-23 00:00:00,240-279 390-459 460-519 800-999 780-789 320-389,Admission Date: [**2192-6-16**] ...
4,176849,6596,2107-02-26 00:00:00,760-779,Admission Date: [**2107-2-24**] Dischar...
5,133975,31160,2184-10-12 00:00:00,240-279 001-139 390-459 460-519 800-999 780-78...,Admission Date: [**2184-9-27**] ...
6,180297,32452,2111-12-25 00:00:00,390-459 760-779,Admission Date: [**2111-12-15**] ...
7,112656,16732,2137-01-08 00:00:00,240-279 680-709 390-459 290-319 460-519 800-99...,Admission Date: [**2136-12-30**] Discha...
8,177462,28621,2125-09-10 00:00:00,580-629 240-279 290-319 520-579 800-999,Admission Date: [**2125-8-29**] ...
9,144109,27710,2127-11-10 00:00:00,140-239 320-389,Admission Date: [**2127-11-1**] ...


In [6]:
# Turn the ICD9 column into the label array used for training.
# presumably one indicator column per entry of top_codes (the split output
# further down shows 19 label columns) -- verify in vectorization.py
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessing here is a little different, since we want to keep dots and other characters to be able to split the notes into sentences.

In [7]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [8]:
def clean_str(string):
    """
    Light-weight cleaning of a note before sentence splitting.

    Removes every backslash, single quote and double quote, trims leading and
    trailing whitespace, and lower-cases the result. All other characters
    (including periods) are kept.
    """
    # One pass over the text: the character class matches \ ' and "
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

In [9]:
notes = []
note_sentences = []

# Clean every note, keep the cleaned text, and split it into sentences.
# Iterating the column directly visits the rows in the same order as the
# positional index, because `df` was re-indexed from 0 when it was sampled.
for raw_text in df["TEXT"]:
    text = clean_str(raw_text.encode('ascii', 'ignore'))
    notes.append(text)
    note_sentences.append(tokenize.sent_tokenize(text))



In [10]:
# Peek at the first five sentences of the first note.
note_sentences[0][:5]

['admission date:  [**2189-6-22**]       discharge date:  [**2189-6-27**] date of birth:   [**2112-4-17**]       sex:  f service:  cardiothoracic service chief complaint:  ms [**known lastname 35085**] is a direct admit into the operating room where she will undergo minimally invasive aortic valve replacement.',
 'her chief complaint is a worsening heart murmur.',
 'hi[**last name (stitle) 2710**]of present illness:  this is a pleasant, asymptomatic, 77 year old woman with a history of heart murmur since her 40s without symptoms who has been followed by her primary care provider and cardiologist over the years with serial echocardiograms.',
 'the patients echocardiogram last winter appeared worse than the previous so the patient was referred to dr. [**last name (prefixes) **] for aortic valve surgery.',
 'she had cardiac catheterization done in [**2188-8-17**] which showed normal coronaries with severe aortic stenosis following a successful aortic valvuloplasty which increased her aort

### What should MAX_SENTS and MAX_SENT_LENGTH be?

In [11]:
note_sentences_length =[len(x) for x in note_sentences]
print "Average number of sentences in a note: ", np.mean(note_sentences_length)  
print "Max number of sentences in a note: ", max(note_sentences_length)

Average number of sentences in a note:  104.830550285
Max number of sentences in a note:  488


In [27]:
# How many notes have more than 150 sentences.
(np.array(note_sentences_length) > 150).sum()

913

In [13]:
sentence_flat_list = [sentence for note_sentence in note_sentences for sentence in note_sentence]
sentence_words_length =[len(text_to_word_sequence(sentence)) for sentence in sentence_flat_list]
print "Average number of words in a sentence: ", np.mean(sentence_words_length)  
print "Max number of words in a sentence: ", max(sentence_words_length)

Average number of words in a sentence:  16.969297158
Max number of words in a sentence:  3841


In [14]:
# How many sentences have more than 250 words.
(np.array(sentence_words_length) > 250).sum()

1678

In [28]:
# Sizes of the (note, sentence, word) grid the notes are encoded into.
MAX_NB_WORDS = None     # vocabulary cap given to the Tokenizer (None = no cap)
MAX_SENTS = 150         # sentences kept per note
MAX_SENT_LENGTH = 250   # words kept per sentence

MAX_VOCAB = None # to limit original number of words (None if no limit)

# `num_words` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_words` is only accepted as a deprecated alias. The rest of the notebook
# already relies on the Keras 2 API (e.g. `epochs=` in fit).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(notes)


In [29]:
# Word -> integer id mapping learned by the tokenizer; handed to the
# embedding-matrix builder further down.
dictionary = tokenizer.word_index

In [30]:
# Encode every note as a (MAX_SENTS, MAX_SENT_LENGTH) grid of word ids.
# Grid positions that are never filled keep the initial value 0.
MAX_NB_WORDS = len(tokenizer.word_index)  # vocabulary length
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

word_to_id = tokenizer.word_index
for i, one_note_sentences in enumerate(note_sentences):
    # Sentences beyond the first MAX_SENTS of a note are dropped.
    for j, sentence in enumerate(one_note_sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sentence):
            if k >= MAX_SENT_LENGTH:
                # The row for this sentence is full; remaining words are cut.
                break
            word_id = word_to_id[word]
            # `<=`, not `<`: MAX_NB_WORDS equals the number of entries in
            # word_index, so with 1-based ids the largest id is exactly
            # MAX_NB_WORDS. The strict comparison silently skipped that word.
            if word_id <= MAX_NB_WORDS:
                note_matrix[i, j, k] = word_id
                k += 1

In [31]:
# Word ids of the first sentence of the first note (zero-filled on the right).
note_matrix[0, 0]

array([   54,    56,  1146,    22,   177,    31,    56,  1146,    22,
         179,    56,     3,   251,  1124,    17,   144,   280,   350,
         121,  1296,   121,   353,   337,   777,   107,   235, 35016,
          19,     8,  2191,  2801,   665,     1,   962,   333,   514,
          30,   126,  3304,  2219,   340,   135,   138,   915,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Split Files

In [32]:
# Build the embedding matrix for the notes vocabulary from pre-trained GloVe vectors.
EMBEDDING_DIM = 100  # given the glove that we chose
EMBEDDING_LOC = '../data/glove.6B.100d.txt'  # location of embedding
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC, dictionary, EMBEDDING_DIM, verbose=True)


('Vocabulary in notes:', 66712)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 25296)


In [33]:
# Split the encoded notes and their labels into train / validation / test sets.
split_sets = helpers.train_val_test_split(
    note_matrix,
    labels,
    val_size=0.2,
    test_size=0.1,
    random_state=101)
X_train, X_val, X_test, y_train, y_val, y_test = split_sets

# Report the shape of every split.
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 150, 250), (3688, 19))
('Validation: ', (1054, 150, 250), (1054, 19))
('Test: ', (528, 150, 250), (528, 19))


## Hierarchical Attention NN
Based on the paper: Hierarchical Attention Networks for Document Classification (Yang et al., 2016)

In [21]:
import hatt_model

In [37]:
# Re-import the model module so that the latest edits to hatt_model are used,
# then build the hierarchical attention model for the configured grid size.
reload(hatt_model)
h_att_model = hatt_model.build_gru_att_model(
    MAX_SENTS,
    MAX_SENT_LENGTH,
    max_vocab=MAX_NB_WORDS,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=EMBEDDING_MATRIX,
    num_classes=N_TOP)

  output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')


(?, 250, 200)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 250)           0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 250, 100)      6671300     input_5[0][0]                    
____________________________________________________________________________________________________
bidirectional_5 (Bidirectional)  (None, 250, 200)      120600      embedding_3[0][0]                
____________________________________________________________________________________________________
time_distributed_7 (TimeDistribu (None, 250, 200)      40200       bidirectional_5[0][0]            
_____________________________________________________________________________

In [38]:
# Fit for a single epoch, evaluating on the validation split after the epoch.
h_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=1,
    validation_data=(X_val, y_val),
    verbose=1)

Train on 3688 samples, validate on 1054 samples
Epoch 1/1


<keras.callbacks.History at 0x7f5612c87f50>

In [39]:
# Score the training and validation notes with the fitted model.
predict_batch_size = 50
pred_train = h_att_model.predict(X_train, batch_size=predict_batch_size)
pred_dev = h_att_model.predict(X_val, batch_size=predict_batch_size)

## Performance Evaluation

In [42]:
# Print F1 on the training and dev predictions for a range of decision
# thresholds (see the table printed below this cell).
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.556      0.551
0.030:      0.579      0.574
0.040:      0.585      0.579
0.050:      0.597      0.591
0.055:      0.601      0.593
0.058:      0.602      0.594
0.060:      0.608      0.602
0.080:      0.563      0.558
0.100:      0.529      0.522
0.200:      0.000      0.000
0.300:      0.000      0.000
0.500:      0.000      0.000


## Notes:
* This is looking good! With just ONE epoch we get an F1 score of 60%, almost the same as 5 epochs with the CNN model (F1 = 63%).
* It was super slow, though, which is why I didn't run 5 epochs: it took about 2 hours to run 1 epoch. I'll work on improving that (hopefully it can be improved).