In [2]:
## just a workbook, we will add more comments in the notebook for submission

In [3]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model

Using TensorFlow backend.


In [4]:
# Load the discharge notes together with their ICD-9 codes. The five column
# names are supplied explicitly through `names`.
# NOTE(review): when `names` is given and `header` is not, pandas treats the
# first line of the file as data -- confirm the CSV has no header row.
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [5]:
# Work on a 10% sample of the notes to keep model development fast.
# random_state pins the sample so that Restart & Run All draws the same rows
# (and therefore the same vocabulary, splits and metrics) on every run; the
# value matches the seed used for the train/validation/test split.
df = full_df.sample(frac=0.1, random_state=101).reset_index(drop=True)
# Single-argument print(...) shows the same text on Python 2 and Python 3.
print(df.shape)
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,135478,3328,2187-12-25 00:00:00,25083 9974 99686 5601 29281 25043 4019 2449,Admission Date: [**2187-12-12**] Discha...
1,115742,25540,2178-01-13 00:00:00,9351 5304 51881 4280 42731 9982 4820 48241 349...,Admission Date: [**2177-12-20**] ...
2,135467,83124,2104-10-05 00:00:00,2762 42823 41401 2411 2767 25052 36201 25042 5...,Admission Date: [**2104-10-3**] ...
3,148209,10280,2181-03-07 00:00:00,4822 41071 99662 03811 5121 99591 53240 5180 5...,Admission Date: [**2181-2-6**] D...
4,193597,15450,2147-07-04 00:00:00,4019 28521 4401 412 4275 5845 4280 2762 5859 4...,Admission Date: [**2147-7-3**] D...
5,131754,79655,2125-04-14 00:00:00,56081 2762 5715 41401 4142 412 25000 40390 585...,Admission Date: [**2125-4-10**] ...
6,174772,53787,2160-10-11 00:00:00,49322 5849 27651 4659 40390 5859 25002 2724 56...,Admission Date: [**2160-10-7**] ...
7,136013,60106,2145-02-12 00:00:00,51884 0389 78552 51909 5070 99592 49390 5715 4...,"Name: [**Known lastname 5685**],[**Known firs..."
8,117578,5503,2168-12-20 00:00:00,41401 4139 9973 486 2720 4019 412,Admission Date: [**2168-12-14**] Discha...
9,167023,84615,2109-09-22 00:00:00,5552 56722 0389 99591 5772 56981 2841 78959 51...,Admission Date: [**2109-9-14**] ...


## Pre processing ICD 9 codes

In [6]:
# First-level ICD-9 code ranges used as the prediction targets.
# NOTE(review): the ranges '280-289' and '740-759' do not appear in this list --
# confirm that leaving them out is intentional.
ICD9_FIRST_LEVEL = [
    '001-139', '140-239', '240-279', '290-319', '320-389',
    '390-459', '460-519', '520-579', '580-629', '630-679',
    '680-709', '710-739', '760-779', '780-789', '790-796',
    '797', '798', '799', '800-999',
]
N_TOP = len(ICD9_FIRST_LEVEL)

# Replace the leaf ICD-9 codes of every note with their first-level
# (grandparent) ranges. This overwrites the ICD9 column in place, so the cell
# should be run only once per freshly sampled df.
df['ICD9'] = df['ICD9'].apply(
    lambda codes: helpers.replace_with_grandparent_codes(codes, ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,135478,3328,2187-12-25 00:00:00,240-279 390-459 290-319 520-579 800-999,Admission Date: [**2187-12-12**] Discha...
1,115742,25540,2178-01-13 00:00:00,240-279 390-459 290-319 460-519 520-579 800-99...,Admission Date: [**2177-12-20**] ...
2,135467,83124,2104-10-05 00:00:00,240-279 390-459 520-579 320-389 580-629 710-739,Admission Date: [**2104-10-3**] ...
3,148209,10280,2181-03-07 00:00:00,001-139 390-459 290-319 460-519 520-579 800-99...,Admission Date: [**2181-2-6**] D...
4,193597,15450,2147-07-04 00:00:00,580-629 240-279 390-459 290-319,Admission Date: [**2147-7-3**] D...
5,131754,79655,2125-04-14 00:00:00,240-279 580-629 390-459 290-319 520-579,Admission Date: [**2125-4-10**] ...
6,174772,53787,2160-10-11 00:00:00,240-279 390-459 290-319 460-519 520-579 580-629,Admission Date: [**2160-10-7**] ...
7,136013,60106,2145-02-12 00:00:00,240-279 001-139 390-459 290-319 460-519 520-57...,"Name: [**Known lastname 5685**],[**Known firs..."
8,117578,5503,2168-12-20 00:00:00,240-279 390-459 460-519 800-999,Admission Date: [**2168-12-14**] Discha...
9,167023,84615,2109-09-22 00:00:00,240-279 001-139 290-319 460-519 520-579 800-99...,Admission Date: [**2109-9-14**] ...


In [7]:
# Build the label matrix from the ICD9 column, using the first-level ranges as
# the label vocabulary.
# NOTE(review): presumably a multi-hot encoding with one column per entry of
# top_codes -- confirm in vectorization.vectorize_icd_column.
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessing here is a little different, since we want to keep periods and other punctuation so that the notes can be split into sentences.

In [8]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [9]:
def clean_str(string):
    """
    Normalise one raw note before it is split into sentences.

    Every backslash, single quote and double quote is deleted, leading and
    trailing whitespace is trimmed, and the remaining text is lower-cased.
    All other punctuation (periods in particular) is left untouched.
    """
    # A single character class covers the three characters that are dropped.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

In [10]:
# Clean every note and split it into sentences.
# Each note is first reduced to ASCII (characters that cannot be encoded are
# dropped by the 'ignore' handler) and then passed through clean_str.
# Rows are visited in positional order, which equals labels 0..n-1 because df
# was re-indexed with reset_index(drop=True) after sampling.
# notes[i] is the cleaned text of row i; note_sentences[i] is its sentence list.
notes = [clean_str(raw_note.encode('ascii', 'ignore')) for raw_note in df["TEXT"]]
note_sentences = [tokenize.sent_tokenize(note) for note in notes]



In [11]:
note_sentences[0][0:5]

['admission date:  [**2187-12-12**]       discharge date:  [**2187-12-25**] date of birth:   [**2141-2-6**]       sex:  m service:  1 chief complaint: 1.  insulin dependent diabetes mellitus.',
 '2.  pancreas after a kidney transplant.',
 'history of present illness:   the patient is a 46 year old male status post renal transplant for insulin dependent diabetes mellitus now here for a pancreas after kidney transplant.',
 'the patient denies nausea or vomiting.',
 'the patient denies recent history of the flu or lymphadenopathy.']

### what will be the MAX_SENTS and MAX_SENT_LENGTH ?

In [12]:
# Number of sentences in each note; the mean and max below help choose MAX_SENTS.
note_sentences_length =[len(x) for x in note_sentences]
print "Average number of sentences in a note: ", np.mean(note_sentences_length)  
print "Max number of sentences in a note: ", max(note_sentences_length)

Average number of sentences in a note:  104.415749526
Max number of sentences in a note:  531


In [13]:
sum(np.array(note_sentences_length) > 300)

42

In [14]:
len(note_sentences[0][0].split ())

21

In [15]:
# number of words in a sentence
print len(note_sentences[0][0].split())
print len(note_sentences[0][10].split())
print len(note_sentences[1][5].split())
print len(note_sentences[1][10].split())

21
8
10
12


In [16]:
# Flatten the per-note sentence lists into one list of sentences, then count
# the words of each sentence with the same Keras word splitter that is used
# later to fill the note matrix. The mean and max help choose MAX_SENT_LENGTH.
sentence_flat_list = [sentence for note_sentence in note_sentences for sentence in note_sentence]
sentence_words_length =[len(text_to_word_sequence(sentence)) for sentence in sentence_flat_list]
print "Average number of words in a sentence: ", np.mean(sentence_words_length)  
print "Max number of words in a sentence: ", max(sentence_words_length)

Average number of words in a sentence:  16.9173534495
Max number of words in a sentence:  2624


In [17]:
sum(np.array(sentence_words_length) > 250)

1595

In [18]:
# Size limits for the hierarchical input: at most MAX_SENTS sentences per note
# and MAX_SENT_LENGTH words per sentence. The values were chosen from the
# sentence-count and words-per-sentence statistics printed in the cells above.
MAX_NB_WORDS = None  # no vocabulary cap while the tokenizer is fitted
MAX_SENTS = 300
MAX_SENT_LENGTH = 250

MAX_VOCAB = None # to limit original number of words (None if no limit)

# `num_words` is the Keras 2 name of this argument; the former `nb_words`
# spelling is only accepted through a deprecated compatibility path.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# Learn the word -> integer id mapping from the cleaned notes.
tokenizer.fit_on_texts(notes)




In [19]:
dictionary = tokenizer.word_index

In [20]:
# Encode every note as a (MAX_SENTS, MAX_SENT_LENGTH) grid of word ids.
# Sentences beyond MAX_SENTS and words beyond MAX_SENT_LENGTH are dropped;
# unused cells keep the value 0 they were initialised with.
word_to_id = tokenizer.word_index
MAX_NB_WORDS = len(word_to_id)  # vocabulary length
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for note_idx, sentences_in_note in enumerate(note_sentences):
    for sent_idx, sentence in enumerate(sentences_in_note[:MAX_SENTS]):
        slot = 0  # next free column in this sentence's row
        for word in text_to_word_sequence(sentence):
            if slot >= MAX_SENT_LENGTH:
                # Row is full: nothing further in this sentence can be stored.
                break
            word_id = word_to_id[word]
            # NOTE(review): the strict `<` skips a word whose id equals
            # len(word_index); if ids start at 1 this drops the last vocabulary
            # entry -- confirm this matches the embedding matrix size.
            if word_id < MAX_NB_WORDS:
                note_matrix[note_idx, sent_idx, slot] = word_id
                slot += 1

In [21]:
note_matrix[0][0]

array([  54,   56, 1258,   41,   41,   32,   56, 1258,   41,   84,   56,
          4,  253, 1230,    9,   22,  279,  165,  121,    6,  352,  342,
          6,  317, 1602,  465, 1063,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## Split Files

In [22]:
# Build the embedding matrix for the notes' vocabulary (`dictionary` is the
# tokenizer's word -> id mapping) from the pre-trained GloVe file below.
# NOTE(review): how words missing from GloVe are initialised is decided inside
# vectorization.embedding_matrix and is not visible here.
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 66494)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 25436)


In [23]:
# Split the encoded notes and their labels into train / validation / test sets
# with a fixed random_state so the partition is repeatable.
# NOTE(review): val_size and test_size are presumably fractions of the whole
# data set (the shapes printed below are consistent with 70/20/10) -- confirm
# in helpers.train_val_test_split.
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    note_matrix, labels, val_size=0.2, test_size=0.1, random_state=101)
# Under Python 2 these print(...) calls display a tuple, as seen in the output.
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 300, 250), (3688, 19))
('Validation: ', (1054, 300, 250), (1054, 19))
('Test: ', (528, 300, 250), (528, 19))


## Hierarchical Attention NN
Based on the paper "Hierarchical Attention Networks for Document Classification" (Yang et al., 2016).

In [25]:
import hatt_model

In [26]:
# Re-import hatt_model so edits made to the module since the last import are
# picked up (`reload` is a builtin on Python 2 only), then build the
# hierarchical attention model for notes of MAX_SENTS sentences with
# MAX_SENT_LENGTH words each and one output per first-level ICD-9 range.
reload(hatt_model)
h_att_model = hatt_model.build_gru_att_model(MAX_SENTS, MAX_SENT_LENGTH, 
                         max_vocab=MAX_NB_WORDS, embedding_dim=EMBEDDING_DIM , embedding_matrix=EMBEDDING_MATRIX,
                         num_classes=N_TOP)

  output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
  name=name)


model fitting - Hierachical Attention GRU
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 300, 250)      0                                            
____________________________________________________________________________________________________
time_distributed_2 (TimeDistribu (None, 300, 50000)    6873050     input_2[0][0]                    
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 300, 200)      30060600    time_distributed_2[0][0]         
____________________________________________________________________________________________________
time_distributed_3 (TimeDistribu (None, 300, 200)      40200       bidirectional_2[0][0]            
_________________________________________________

In [None]:
# Train the model for a single epoch in mini-batches of 50 notes, evaluating on
# the validation split at the end of the epoch.
h_att_model.fit(X_train, y_train, batch_size=50, epochs=1, validation_data=(X_val, y_val), verbose=2)

Train on 3688 samples, validate on 1054 samples
Epoch 1/1
