In [35]:
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers

#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [53]:
N_TOP = 20
full_df, top_codes = database_selection.filter_top_codes(df, 'ICD9', N_TOP, filter_empty = True)
df = full_df.head(1000)

In [54]:
#preprocess icd9 codes
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)


In [55]:
#preprocess notes
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 5000 # to limit length of word sequence (None if no limit)
df.TEXT = vectorization.clean_notes(df, 'TEXT')
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)
print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 22476
Average note length: 1748.878
Max note length: 5641
Final Vocabulary: 22476
Final Max Sequence Length: 5000


In [56]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

('Train: ', (699, 5000), (699, 20))
('Validation: ', (200, 5000), (200, 20))
('Test: ', (101, 5000), (101, 20))


In [None]:
# Delete temporary variables to free some memory
del df, data, labels

In [60]:
#creating embeddings
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose
embedding_matrix, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True)


('Vocabulary in notes:', 22476)
('Vocabulary in original embedding:', 400000)
('Vocabulary intersection:', 14345)


In [73]:
embedding_matrix[0]

TypeError: 'tuple' object is not callable

## CNN for text classification

Based on the following papers and links:
* "Convolutional Neural Networks for Sentence Classification"   
* "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification"
* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras/blob/master/sentiment_cnn.py

In [78]:
#### set parameters:
num_filters = 100
filter_sizes = [2,3,4,5]
training_dropout_keep_prob = 0.9
num_classes=20
batch_size = 50
epochs = 5
external_embeddings = True

In [80]:
#Embedding
if external_embeddings:
    # use embedding_matrix
    model_input = Input(shape= (MAX_SEQ_LENGTH, EMBEDDING_DIM))
    X_train_input = np.stack([np.stack([embedding_matrix[word_id] for word_id in x]) for x in X_train])
    X_val_input = np.stack([np.stack([embedding_matrix[word_id] for word_id in x]) for x in X_val])
    z = model_input
else:
    # train embeddings
    model_input = Input(shape=(MAX_SEQ_LENGTH, ))
    X_train_input = X_train
    X_val_input = X_val 
    z =  Embedding(MAX_VOCAB + 1, 
                   EMBEDDING_DIM, 
                   input_length=MAX_SEQ_LENGTH, 
                   name="embedding")(model_input)

# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    window_pool_size =  MAX_SEQ_LENGTH  - sz + 1 
    conv = MaxPooling1D(pool_size=2)(conv)  #pool_size?
    conv = Flatten()(conv)
    conv_blocks.append(conv)

#concatenate
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(training_dropout_keep_prob)(z)

#score prediction
#z = Dense(num_classes, activation="relu")(z)  I don't think this is necessary
model_output = Dense(num_classes, activation="softmax")(z)

#creating model
model = Model(model_input, model_output)
# what to use for tf.nn.softmax_cross_entropy_with_logits?
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X_train_input, y_train, batch_size=batch_size, epochs=epochs,
validation_data=(X_val_input, y_val), verbose=2)

Train on 699 samples, validate on 200 samples
Epoch 1/5
29s - loss: 2.7167 - acc: 0.8148 - val_loss: 2.9682 - val_acc: 0.8123
Epoch 2/5
29s - loss: 2.8413 - acc: 0.8168 - val_loss: 2.9271 - val_acc: 0.8078
Epoch 3/5
29s - loss: 2.7164 - acc: 0.8168 - val_loss: 2.7181 - val_acc: 0.8078
Epoch 4/5
29s - loss: 2.4198 - acc: 0.8166 - val_loss: 2.4163 - val_acc: 0.8063
Epoch 5/5
29s - loss: 2.0446 - acc: 0.8278 - val_loss: 1.7713 - val_acc: 0.8210


<keras.callbacks.History at 0x7f71d4df6d90>

### Results with external embeddings = True
```
Train on 699 samples, validate on 200 samples
Epoch 1/5
29s - loss: 2.7167 - acc: 0.8148 - val_loss: 2.9682 - val_acc: 0.8123
Epoch 2/5
29s - loss: 2.8413 - acc: 0.8168 - val_loss: 2.9271 - val_acc: 0.8078
Epoch 3/5
29s - loss: 2.7164 - acc: 0.8168 - val_loss: 2.7181 - val_acc: 0.8078
Epoch 4/5
29s - loss: 2.4198 - acc: 0.8166 - val_loss: 2.4163 - val_acc: 0.8063
Epoch 5/5
29s - loss: 2.0446 - acc: 0.8278 - val_loss: 1.7713 - val_acc: 0.8210
```

### Results with external embeddings = False
```
Train on 699 samples, validate on 200 samples
Epoch 1/5
37s - loss: 0.9028 - acc: 0.8270 - val_loss: 0.5943 - val_acc: 0.8238
Epoch 2/5
36s - loss: 0.5272 - acc: 0.8320 - val_loss: 0.5536 - val_acc: 0.8238
Epoch 3/5
36s - loss: 0.5040 - acc: 0.8320 - val_loss: 0.5544 - val_acc: 0.8238
Epoch 4/5
35s - loss: 0.4820 - acc: 0.8320 - val_loss: 0.5517 - val_acc: 0.8238
Epoch 5/5
35s - loss: 0.4518 - acc: 0.8323 - val_loss: 0.5523 - val_acc: 0.8238
```

In [81]:
pred_train = model.predict(X_train_input, batch_size=50)
pred_dev = model.predict(X_val_input, batch_size=50)

In [83]:
def get_f1_score(y_true,y_hat,threshold, average):
    hot_y = np.where(np.array(y_hat) > threshold, 1, 0)
    return f1_score(np.array(y_true), hot_y, average=average)

print 'F1 scores'
print 'threshold | training | dev  '
f1_score_average = 'micro'
for threshold in [ 0.02, 0.03,0.04,0.05,0.055,0.058,0.06, 0.08, 0.1, 0.5]:
    train_f1 = get_f1_score(y_train, pred_train,threshold,f1_score_average)
    dev_f1 = get_f1_score(y_val, pred_dev,threshold,f1_score_average)
    print '%1.3f:      %1.3f      %1.3f' % (threshold,train_f1, dev_f1)

F1 scores
threshold | training | dev  
0.020:      0.326      0.314
0.030:      0.329      0.303
0.040:      0.332      0.298
0.050:      0.331      0.292
0.055:      0.330      0.288
0.058:      0.327      0.289
0.060:      0.326      0.289
0.080:      0.320      0.277
0.100:      0.316      0.265
0.500:      0.211      0.197


### Results with external embeddings = True 
```
F1 scores
threshold | training | dev  
0.020:      0.326      0.314
0.030:      0.329      0.303
0.040:      0.332      0.298
0.050:      0.331      0.292
0.055:      0.330      0.288
0.058:      0.327      0.289
0.060:      0.326      0.289
0.080:      0.320      0.277
0.100:      0.316      0.265
0.500:      0.211      0.197
```

### Results with external embeddings = False
```
F1 scores
threshold | training | dev  
0.030:      0.462      0.345
0.040:      0.557      0.367
0.050:      0.604      0.386
0.055:      0.599      0.386
0.058:      0.592      0.391
0.060:      0.591      0.392
0.080:      0.574      0.378
0.100:      0.543      0.343
0.500:      0.003      0.000
```
