In [1]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers

#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate


Using TensorFlow backend.


In [2]:
df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [3]:
N_TOP = 20 
full_df, top_codes = database_selection.filter_top_codes(df, 'ICD9', N_TOP, filter_empty = True)
df = full_df.head(1000)

In [4]:
#preprocess icd9 codes
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)


In [5]:
#preprocess notes
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 5000 # to limit length of word sequence (None if no limit)
df.TEXT = vectorization.clean_notes(df, 'TEXT')
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)
print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Vocabulary size: 22476
Average note length: 1748.878
Max note length: 5641
Final Vocabulary: 22476
Final Max Sequence Length: 5000


In [6]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (699, 5000), (699, 20))
('Validation: ', (200, 5000), (200, 20))
('Test: ', (101, 5000), (101, 20))


In [7]:
# Delete temporary variables to free some memory
del df, data, labels

In [8]:
#creating embeddings
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
embedding_matrix, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 22476)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 14345)


## CNN for text classification

Based on the following papers and links:
* "Convolutional Neural Networks for Sentence Classification"   
* "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification"
* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras/blob/master/sentiment_cnn.py

In [13]:
#### set parameters:
num_filters = 100
filter_sizes = [2,3,4,5]
training_dropout_keep_prob = 0.9
num_classes=N_TOP
batch_size = 50
epochs = 5
external_embeddings = True
EMBEDDING_TRAINABLE = True

In [14]:
#Embedding
model_input = Input(shape=(MAX_SEQ_LENGTH, ))
if external_embeddings:
    # use embedding_matrix plus local training
    z = Embedding(MAX_VOCAB + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=EMBEDDING_TRAINABLE)(model_input)
else:
    # train embeddings 
    z =  Embedding(MAX_VOCAB + 1, 
                   EMBEDDING_DIM, 
                   input_length=MAX_SEQ_LENGTH, 
                   name="embedding")(model_input)

# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    window_pool_size =  MAX_SEQ_LENGTH  - sz + 1 
    conv = MaxPooling1D(pool_size=window_pool_size)(conv)  
    conv = Flatten()(conv)
    conv_blocks.append(conv)

#concatenate
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(training_dropout_keep_prob)(z)

#score prediction
#z = Dense(num_classes, activation="relu")(z)  I don't think this is necessary
model_output = Dense(num_classes, activation="softmax")(z)

#creating model
model = Model(model_input, model_output)
# what to use for tf.nn.softmax_cross_entropy_with_logits?
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
validation_data=(X_val, y_val), verbose=2)

Train on 699 samples, validate on 200 samples
Epoch 1/5
25s - loss: 1.9658 - acc: 0.8042 - val_loss: 0.6713 - val_acc: 0.8238
Epoch 2/5
26s - loss: 1.7716 - acc: 0.8065 - val_loss: 0.7155 - val_acc: 0.8212
Epoch 3/5
25s - loss: 1.5525 - acc: 0.8109 - val_loss: 0.6016 - val_acc: 0.8240
Epoch 4/5
25s - loss: 1.3251 - acc: 0.8132 - val_loss: 0.5844 - val_acc: 0.8238
Epoch 5/5
25s - loss: 1.1638 - acc: 0.8148 - val_loss: 0.5617 - val_acc: 0.8238


<keras.callbacks.History at 0x7fdb90775f10>

In [15]:
pred_train = model.predict(X_train, batch_size=50)
pred_dev = model.predict(X_val, batch_size=50)

In [16]:
def get_f1_score(y_true,y_hat,threshold, average):
    hot_y = np.where(np.array(y_hat) > threshold, 1, 0)
    return f1_score(np.array(y_true), hot_y, average=average)

print 'F1 scores'
print 'threshold | training | dev  '
f1_score_average = 'micro'
for threshold in [ 0.02, 0.03,0.04,0.05,0.055,0.058,0.06, 0.08, 0.1,0.2,0.3, 0.5]:
    train_f1 = get_f1_score(y_train, pred_train,threshold,f1_score_average)
    dev_f1 = get_f1_score(y_val, pred_dev,threshold,f1_score_average)
    print '%1.3f:      %1.3f      %1.3f' % (threshold,train_f1, dev_f1)

F1 scores
threshold | training | dev  
0.020:      0.334      0.333
0.030:      0.362      0.360
0.040:      0.366      0.374
0.050:      0.373      0.380
0.055:      0.374      0.382
0.058:      0.376      0.376
0.060:      0.376      0.378
0.080:      0.387      0.371
0.100:      0.366      0.350
0.200:      0.179      0.171
0.300:      0.020      0.020
0.500:      0.000      0.000


### Results with external embeddings = True , no additional training,  top 20
```
F1 scores
threshold | training | dev  
0.020:      0.337      0.329
0.030:      0.360      0.353
0.040:      0.365      0.374
0.050:      0.372      0.375
0.055:      0.370      0.377
0.058:      0.369      0.375
0.060:      0.368      0.375
0.080:      0.348      0.361
0.100:      0.309      0.319
0.200:      0.198      0.208
0.300:      0.157      0.138
0.500:      0.000      0.000
```

### Results with external embeddings = False, top 20
```
F1 scores
threshold | training | dev  
0.030:      0.462      0.345
0.040:      0.557      0.367
0.050:      0.604      0.386
0.055:      0.599      0.386
0.058:      0.592      0.391
0.060:      0.591      0.392
0.080:      0.574      0.378
0.100:      0.543      0.343
0.500:      0.003      0.000
```

### Results with external embedding and training them , top 20
```
F1 scores
threshold | training | dev  
0.020:      0.334      0.333
0.030:      0.362      0.360
0.040:      0.366      0.374
0.050:      0.373      0.380
0.055:      0.374      0.382
0.058:      0.376      0.376
0.060:      0.376      0.378
0.080:      0.387      0.371
0.100:      0.366      0.350
0.200:      0.179      0.171
0.300:      0.020      0.020
0.500:      0.000      0.000

```

### Results with external Embeddings = False, top 10, 
We can compare this setup with the LSTM published in the paper "Applying Deep Learning to ICD-9 Multi-label Classification from Medical Records", they got a F1-score of about 0.4168, we are getting 0.447

``` 
F1 scores
threshold | training | dev  
0.020:      0.444      0.409
0.030:      0.491      0.413
0.040:      0.554      0.426
0.050:      0.611      0.433
0.055:      0.642      0.437
0.058:      0.658      0.438
0.060:      0.669      0.434
0.080:      0.783      0.441
0.100:      0.867      0.447
0.300:      0.400      0.116
0.500:      0.095      0.004
```

