In [1]:
## just a workbook, we will add more comments in the notebook for submission

In [1]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model

Using TensorFlow backend.


In [2]:
# Load every discharge note together with its ICD-9 codes.
# `names` is passed without `header`, so pandas treats the first line of the
# file as data rather than as a header row.
NOTE_COLUMNS = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT']
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv', names=NOTE_COLUMNS)

In [3]:
# Work on a 10% subset of the records while developing models.
# The sample is seeded so that a "Restart & Run All" draws the same rows every
# time; 101 is the same seed the train/validation/test split uses further down.
df = full_df.sample(frac=0.1, random_state=101).reset_index(drop=True)
print(df.shape)
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,134273,25127,2179-03-26 00:00:00,8080 80704 8058 83501 8020 42789 87340 87342,Admission Date: [**2179-3-21**] Dischar...
1,120698,40729,2115-03-28 00:00:00,6826 5849 2760 42732 0389 99592 78552 5180 438...,Admission Date: [**2115-3-24**] ...
2,157454,19851,2125-05-15 00:00:00,5070 40391 5856 99662 6822 07032 42832 4280 25...,Admission Date: [**2125-5-7**] D...
3,109748,73473,2128-05-10 00:00:00,5070 51881 5849 5609 42731 2851 5781 185 2689 ...,Admission Date: [**2128-5-7**] D...
4,128866,67042,2122-07-21 00:00:00,2761 2930 34690 2768 27541 2753,Admission Date: [**2122-7-19**] ...
5,153552,11460,2198-07-31 00:00:00,9582 4280 5990 5121 80502 4538 8088 70703 2948...,Admission Date: [**2198-7-26**] ...
6,157346,9718,2106-04-24 00:00:00,431 4010 3315 496 34590,Admission Date: [**2106-4-23**] ...
7,195031,32028,2191-03-01 00:00:00,48241 2536 4941 515 4019 53081 2859 34700 501,Admission Date: [**2191-2-17**] ...
8,196730,28705,2169-05-24 00:00:00,769 7750,Admission Date: [**2169-5-22**] Dischar...
9,120849,32649,2169-02-12 00:00:00,03842 5770 5070 42823 4280 57420 4359 2930 252...,Admission Date: [**2169-1-27**] ...


## Preprocessing ICD-9 codes

In [4]:
# First-level ICD-9 code ranges; these are the classes the model predicts.
# NOTE(review): the ranges 280-289 and 740-759 are not in this list -- confirm
# that leaving them out is intentional.
ICD9_FIRST_LEVEL = [
    '001-139', '140-239', '240-279', '290-319', '320-389',
    '390-459', '460-519', '520-579', '580-629', '630-679',
    '680-709', '710-739', '760-779', '780-789', '790-796',
    '797', '798', '799', '800-999',
]
N_TOP = len(ICD9_FIRST_LEVEL)

# Replace the leaf ICD-9 codes of every record with their first-level ranges.
# NOTE(review): this overwrites df['ICD9'], so re-running the cell feeds the
# already-replaced values back into the helper; re-create df first if this
# cell has to be run again.
df['ICD9'] = df['ICD9'].apply(helpers.replace_with_grandparent_codes,
                              args=(ICD9_FIRST_LEVEL,))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,134273,25127,2179-03-26 00:00:00,390-459 800-999,Admission Date: [**2179-3-21**] Dischar...
1,120698,40729,2115-03-28 00:00:00,240-279 001-139 290-319 390-459 680-709 460-51...,Admission Date: [**2115-3-24**] ...
2,157454,19851,2125-05-15 00:00:00,240-279 001-139 290-319 390-459 680-709 460-51...,Admission Date: [**2125-5-7**] D...
3,109748,73473,2128-05-10 00:00:00,240-279 390-459 290-319 460-519 520-579 140-23...,Admission Date: [**2128-5-7**] D...
4,128866,67042,2122-07-21 00:00:00,240-279 290-319 320-389,Admission Date: [**2122-7-19**] ...
5,153552,11460,2198-07-31 00:00:00,240-279 680-709 390-459 290-319 460-519 800-99...,Admission Date: [**2198-7-26**] ...
6,157346,9718,2106-04-24 00:00:00,390-459 460-519 320-389,Admission Date: [**2106-4-23**] ...
7,195031,32028,2191-03-01 00:00:00,240-279 390-459 290-319 460-519 520-579 320-389,Admission Date: [**2191-2-17**] ...
8,196730,28705,2169-05-24 00:00:00,760-779,Admission Date: [**2169-5-22**] Dischar...
9,120849,32649,2169-02-12 00:00:00,240-279 001-139 390-459 290-319 460-519 520-579,Admission Date: [**2169-1-27**] ...


In [5]:
#preprocess icd9 codes
# The label set is the list of first-level ICD-9 ranges.
top_codes = ICD9_FIRST_LEVEL
# NOTE(review): presumably yields one indicator column per entry of top_codes
# (the split output further down shows 19 label columns) -- confirm in
# vectorization.vectorize_icd_column.
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessing here is a little different, since we want to keep periods and other punctuation so that the notes can be split into sentences.

In [6]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [7]:
def clean_str(string):
    """
    Light cleaning of a note before sentence splitting.

    Removes every backslash, single quote and double quote, strips leading
    and trailing whitespace, and lower-cases the result. All other
    punctuation is left in place.
    """
    # One character class covers the three characters that are dropped.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

In [8]:
# Clean every note and split it into sentences.
# The column values are iterated directly instead of being looked up by label
# (df["TEXT"][idx]), so the result does not depend on df having a 0..n-1 index.
# encode('ascii', 'ignore') drops the non-ASCII characters of each note.
notes = [clean_str(raw_note.encode('ascii', 'ignore')) for raw_note in df["TEXT"]]

# note_sentences[i] is the list of sentences of notes[i].
note_sentences = [tokenize.sent_tokenize(note) for note in notes]



In [9]:
# First five sentences of the first note.
note_sentences[0][:5]

['admission date:  [**2179-3-21**]       discharge date:  [**2179-3-25**] date of birth:   [**2131-5-1**]       sex:  m service:  trauma surgery history of present illness:  the patient is a 47-year-old man involved in a motor vehicle crash, car versus tree.',
 'the motor vehicle collision was at a high speed estimated at 65 mph with the tree broken in half.',
 'at the scene, the car was described as in a accordion.',
 'there was unknown loss of consciousness with the accident.',
 'after the accident, ems was activated, and the patient was taken to an outside medical center for evaluation.']

### What should MAX_SENTS and MAX_SENT_LENGTH be?

In [10]:
# Number of sentences in each note.
note_sentences_length = [len(sentences) for sentences in note_sentences]
print("Average number of sentences in a note:  %s" % np.mean(note_sentences_length))
print("Max number of sentences in a note:  %s" % max(note_sentences_length))

Average number of sentences in a note:  105.629981025
Max number of sentences in a note:  586


In [13]:
# How many notes have more than 300 sentences?
(np.array(note_sentences_length) > 300).sum()

44

In [16]:
# Flatten the per-note sentence lists into one list of sentences.
sentence_flat_list = []
for sentences_of_note in note_sentences:
    sentence_flat_list.extend(sentences_of_note)

# Number of word tokens in each sentence.
sentence_words_length = [len(text_to_word_sequence(one_sentence))
                         for one_sentence in sentence_flat_list]
print("Average number of words in a sentence:  %s" % np.mean(sentence_words_length))
print("Max number of words in a sentence:  %s" % max(sentence_words_length))

Average number of words in a sentence:  16.9302998186
Max number of words in a sentence:  5456


In [17]:
# How many sentences have more than 250 words?
(np.array(sentence_words_length) > 250).sum()

1633

In [18]:
# No cap on the vocabulary while the tokenizer is fitted.
MAX_NB_WORDS = None
# Notes are truncated/padded to MAX_SENTS sentences and sentences to
# MAX_SENT_LENGTH words; the counts above show how many notes and sentences
# exceed these limits.
MAX_SENTS = 300
MAX_SENT_LENGTH = 250

MAX_VOCAB = None  # to limit original number of words (None if no limit)

# `num_words` is the Keras 2 name of this argument; `nb_words` is the
# deprecated Keras 1 spelling.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(notes)




In [19]:
# Word -> integer index mapping learned by the tokenizer; it is passed to
# vectorization.embedding_matrix when the embeddings are created.
dictionary = tokenizer.word_index

In [20]:
# Vocabulary size. Keras word indices start at 1 (0 is reserved), so the valid
# indices are 1..MAX_NB_WORDS inclusive and 0 is free to act as padding.
MAX_NB_WORDS = len(tokenizer.word_index)  #vocabulary length

# note_matrix[i, j, k] holds the index of the k-th kept word of sentence j of
# note i; positions that are never written stay 0.
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

word_index = tokenizer.word_index
for i, one_note_sentences in enumerate(note_sentences):
    # Sentences beyond MAX_SENTS are dropped, so there is no need to visit them.
    for j, sentence in enumerate(one_note_sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sentence):
            if k >= MAX_SENT_LENGTH:
                # The row is full; the remaining words of the sentence are dropped.
                break
            token_id = word_index.get(word)
            # Skip words the tokenizer does not know instead of raising KeyError.
            # `<=` keeps the word with the highest index, which a strict `<`
            # would silently discard.
            if token_id is not None and token_id <= MAX_NB_WORDS:
                note_matrix[i, j, k] = token_id
                k += 1

In [21]:
# Word indices of the first sentence of the first note (zero-padded on the right).
note_matrix[0, 0]

array([  55,   57, 1365,   13,  186,   32,   57, 1365,   13,   83,   57,
          3,  257, 1668,   15,    6,  280,  164,  119, 1057,  203,   44,
          3,  154,  260,    1,   20,   19,    8, 1389,  190,  265, 1084,
       2026,   11,    8, 1163, 3644, 5607, 1979, 1911, 4221,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Split Files

In [22]:
# Build the embedding matrix for the note vocabulary from pre-trained GloVe vectors.
EMBEDDING_LOC = '../data/glove.6B.100d.txt'  # GloVe file on disk
EMBEDDING_DIM = 100  # has to match the dimensionality of the vectors in EMBEDDING_LOC
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC, dictionary, EMBEDDING_DIM, verbose=True)


('Vocabulary in notes:', 66721)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 25421)


In [23]:
# Split notes and labels into train / validation / test sets with a fixed seed.
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    note_matrix, labels, val_size=0.2, test_size=0.1, random_state=101)

# Report the shape of every split.
for split_name, features, targets in (("Train: ", X_train, y_train),
                                      ("Validation: ", X_val, y_val),
                                      ("Test: ", X_test, y_test)):
    print(split_name, features.shape, targets.shape)

('Train: ', (3688, 300, 250), (3688, 19))
('Validation: ', (1054, 300, 250), (1054, 19))
('Test: ', (528, 300, 250), (528, 19))


## Hierarchical Attention NN
Based on the paper "Hierarchical Attention Networks for Document Classification".

In [24]:
import hatt_model

In [32]:
# Re-import hatt_model so that edits made to the module since it was first
# imported are picked up, then build the hierarchical attention model.
reload(hatt_model)
h_att_model = hatt_model.build_gru_att_model(
    MAX_SENTS,
    MAX_SENT_LENGTH,
    max_vocab=MAX_NB_WORDS,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=EMBEDDING_MATRIX,
    num_classes=N_TOP)

  output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')


(?, 250, 200)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 250)           0                                            
____________________________________________________________________________________________________
embedding_8 (Embedding)          (None, 250, 100)      6672200     input_8[0][0]                    
____________________________________________________________________________________________________
bidirectional_8 (Bidirectional)  (None, 250, 200)      120600      embedding_8[0][0]                
____________________________________________________________________________________________________
time_distributed_8 (TimeDistribu (None, 250, 200)      40200       bidirectional_8[0][0]            
_____________________________________________________________________________

In [None]:
# Fit the model on the training split and evaluate on the validation split
# after every epoch; verbose=2 logs one line per epoch.
h_att_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=50,
    epochs=5,
    verbose=2)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
