In [1]:
## just a workbook, we will add more comments in the notebook for submission

In [20]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding, LSTM, Bidirectional
from keras.layers.merge import Concatenate
from keras.preprocessing import sequence


# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#reading file
full_df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [3]:
# taking just a subset of the records for developing models
df = full_df.sample(frac=0.1).reset_index(drop=True)
print df.shape
df.head(10)

(5270, 5)


Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,142565,56446,2198-03-12 00:00:00,0389 486 5849 78552 2761 5119 5109 2910 51881 ...,Admission Date: [**2198-2-28**] ...
1,158269,62415,2122-08-13 00:00:00,2113 56729 9986 42832 4280 99749 99859 42731 4...,Admission Date: [**2122-8-1**] D...
2,160831,66362,2106-08-02 00:00:00,5761 5856 40391 57450 25000 317,Admission Date: [**2106-7-28**] ...
3,142640,10254,2192-01-08 00:00:00,5781 42731 5070 70703 4242 42830 5845 5990 276...,Admission Date: [**2191-12-12**] ...
4,132744,9061,2198-10-17 00:00:00,99662 0388 486 5070 1179 56981 56081 5990 9985...,Admission Date: [**2198-8-21**] ...
5,107362,22087,2140-12-28 00:00:00,43311 34290 5990 5997 4414 2765 2449 4019 2720,Admission Date: [**2140-12-19**] Discha...
6,110703,14124,2120-10-19 00:00:00,9654 5070 9651 9690 9694 96561 9630 9693,Admission Date: [**2120-10-15**] Discha...
7,172000,51027,2165-01-26 00:00:00,5849 4275 34830 2762 40311 42822 25080 4280 58...,Admission Date: [**2165-1-10**] ...
8,125657,81038,2144-03-20 00:00:00,56081 5849 5990 04112 78830 28850 42789,Admission Date: [**2144-3-10**] ...
9,198664,18288,2113-01-17 00:00:00,1570 1962 49320 5772 57410 4019 2749,Admission Date: [**2113-1-10**] ...


## Pre processing ICD 9 codes

In [4]:
ICD9_FIRST_LEVEL = [
    '001-139','140-239','240-279','290-319', '320-389', '390-459','460-519', '520-579', '580-629', 
    '630-679', '680-709','710-739', '760-779', '780-789', '790-796', '797', '798', '799', '800-999' ]
N_TOP = len(ICD9_FIRST_LEVEL)
# replacing leave ICD9 codes with the grandparents
df['ICD9'] = df['ICD9'].apply(lambda x: helpers.replace_with_grandparent_codes(x,ICD9_FIRST_LEVEL))
df.head(10)

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,142565,56446,2198-03-12 00:00:00,240-279 001-139 390-459 290-319 460-519 800-99...,Admission Date: [**2198-2-28**] ...
1,158269,62415,2122-08-13 00:00:00,240-279 390-459 290-319 460-519 520-579 800-99...,Admission Date: [**2122-8-1**] D...
2,160831,66362,2106-08-02 00:00:00,580-629 240-279 390-459 290-319 520-579,Admission Date: [**2106-7-28**] ...
3,142640,10254,2192-01-08 00:00:00,240-279 001-139 290-319 390-459 680-709 460-51...,Admission Date: [**2191-12-12**] ...
4,132744,9061,2198-10-17 00:00:00,240-279 760-779 001-139 390-459 460-519 520-57...,Admission Date: [**2198-8-21**] ...
5,107362,22087,2140-12-28 00:00:00,580-629 240-279 390-459 320-389,Admission Date: [**2140-12-19**] Discha...
6,110703,14124,2120-10-19 00:00:00,460-519 800-999,Admission Date: [**2120-10-15**] Discha...
7,172000,51027,2165-01-26 00:00:00,240-279 390-459 290-319 520-579 780-789 320-38...,Admission Date: [**2165-1-10**] ...
8,125657,81038,2144-03-20 00:00:00,001-139 390-459 290-319 520-579 780-789 580-629,Admission Date: [**2144-3-10**] ...
9,198664,18288,2113-01-17 00:00:00,240-279 390-459 140-239 460-519 520-579,Admission Date: [**2113-1-10**] ...


In [5]:
#preprocess icd9 codes
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

## Preprocess Notes

The notes preprocessin here is a little different sice we want to keep dots and other characters to be able to split the notes into sentences

In [6]:
from nltk import tokenize
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [7]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [8]:
note_sentences = []
notes = []

for idx in range(df.shape[0]):
    # for every note
    text = clean_str(df["TEXT"][idx].encode('ascii','ignore'))
    notes.append(text)
    sentences = tokenize.sent_tokenize(text)
    note_sentences.append(sentences)   



In [21]:
note_sentences[0][0:5]

['admission date: [**2152-6-7**]        discharge date: [**2152-6-12**] date of birth:  [**2101-7-10**]        sex:  f service:  vsu chief complaint:  calf claudication bilaterally, left greater than right.',
 'left heel rest pain.',
 'history of present illness:  this 50 year-old female with known peripheral vascular disease who underwent a left femoral artery angioplasty and stenting 4 years ago at [**hospital3 9717**], presents with progressive claudication of the calves bilaterally, left greater than right.',
 'left heel rest pain over the last 4 years.',
 'the patients  rest pain is relieved with dangling her foot.']

### what will be the MAX_SENTS and MAX_SENT_LENGTH ?

In [9]:
note_sentences_length =[len(x) for x in note_sentences]
print "Average number of sentences in a note: ", np.mean(note_sentences_length)  
print "Max number of sentences in a note: ", max(note_sentences_length)

Average number of sentences in a note:  103.609108159
Max number of sentences in a note:  547


In [10]:
sum(np.array(note_sentences_length) > 300)

46

In [11]:
sentence_flat_list = [sentence for note_sentence in note_sentences for sentence in note_sentence]
sentence_words_length =[len(text_to_word_sequence(sentence)) for sentence in sentence_flat_list]
print "Average number of words in a sentence: ", np.mean(sentence_words_length)  
print "Max number of words in a sentence: ", max(sentence_words_length)

Average number of words in a sentence:  16.9046097213
Max number of words in a sentence:  3600


In [12]:
sum(np.array(sentence_words_length) > 250)

1576

In [13]:
MAX_NB_WORDS = None
MAX_SENTS = 300
MAX_SENT_LENGTH  = 250

MAX_VOCAB = None # to limit original number of words (None if no limit)
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(notes)




In [14]:
dictionary = tokenizer.word_index

In [15]:
MAX_NB_WORDS = len(tokenizer.word_index)  #vocabulary length
note_matrix = np.zeros((len(notes), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, one_note_sentences in enumerate(note_sentences):
    for j, sentence in enumerate(one_note_sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sentence)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    note_matrix[i,j,k] = tokenizer.word_index[word]
                    k+=1

In [16]:
note_matrix[0][0]

array([   55,    56,  1053,     9,   167,    30,    56,  1053,    13,
          41,    56,     3,   246,  1832,    13,   167,   274,   155,
         116,   416,   174,   538,  1112,   328,    65,   295,   250,
          69,   350,   331,   778,  1347,   570,   345,   195,    31,
         339,   198,   877,  1547, 12837,    49,   849,  1876,   784,
         520,   423,    49,    79,   217,   423,    44,     3,   150,
         259,   401,   104,   226, 26647,    18,     8,  1380,   901,
         155,     7,  1640,   476,   941,  2165,     7,   778,     2,
         265,    58,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Split Files

In [21]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    note_matrix, labels, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (3688, 300, 250), (3688, 19))
('Validation: ', (1054, 300, 250), (1054, 19))
('Test: ', (528, 300, 250), (528, 19))


In [31]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

In [57]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
X_val  = sequence.pad_sequences(X_val, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)
print('X_val shape:', X_val.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

Pad sequences (samples x time)
('x_train shape:', (3688, 100, 250))
('x_test shape:', (528, 100, 250))
('X_val shape:', (1054, 100, 250))


In [51]:
#creating embeddings
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 66150)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 25236)


In [60]:
word_index = tokenizer.word_index
MAX_SEQUENCE_LENGTH = 1000
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[EMBEDDING_MATRIX],
                            #input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(2, activation='softmax')(l_lstm)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Bidirectional LSTM")
print("X_train.shape ", X_train.shape)
print("y_train.shape ", y_train.shape)

print("X_test.shape ", X_test.shape)
print("y_test.shape ", y_test.shape)

print("X_val.shape ", X_val.shape)
print("y_val.shape ", y_val.shape)

print(X_train.t)

model.summary()
model.fit(X_train, y_train)


model fitting - Bidirectional LSTM
('X_train.shape ', (3688, 100, 250))
('y_train.shape ', (3688, 19))
('X_test.shape ', (528, 100, 250))
('y_test.shape ', (528, 19))
('X_val.shape ', (1054, 100, 250))
('y_val.shape ', (1054, 19))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 1000, 100)         6615100   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 200)               160800    
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 402       
Total params: 6,776,302
Trainable params: 6,776,302
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking input: expected input_14 to have 2 dimensions, but got array with shape (3688, 100, 250)