# Clean(er) Pipeline

This is an attempt to merge the pipelines from Zenobia and Guillaume

## Importing Modules

In [153]:
# General imports
import numpy as np
import pandas as pd
import re, nltk, string, os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import datetime, time
import matplotlib
from collections import Counter
from matplotlib import pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

In [154]:
# NN imports
# Upgrade the package called dask
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Embedding
from keras.models import Model

In [155]:
# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Select data corresponding to the top ICD codes

Here, we filter for only the top `n_top` ICD codes   

Note: We offer the option to exclude notes that do not contain any of the top codes. However, it may actually be more rigorous to keep them, no?

In [156]:
# Inputs
N_TOP = 20
df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])
df = df.head(10000) # Speeding up

In [157]:
df, top_codes = database_selection.filter_top_codes(df, 'ICD9', N_TOP, filter_empty = True)

In [158]:
df.shape

(8350, 5)

In [159]:
top_codes[0:5]

['4019', '4280', '42731', '41401', '5849']

In [160]:
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,5849,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,4019 2851,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,51881 486,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,4019 486,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,4019 41401 25000 2720 2859,Admission Date: [**2162-5-16**] ...


## Vectorize ICD9 codes

Here we vectorize and move it to an `np.array` because that is what TensorFlow prefers

In [161]:
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

In [162]:
labels[0:5]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

## Clean, Vectorize and Pad Notes

Here, we vectorize the text and pad with 0s so that notes appear of the same length

In [163]:
# Inputs for tokenization
MAX_VOCAB = None # to limit original vocabulary to most common words (None if no limit)
MAX_SEQ_LENGTH = 5000 # to limit length of word sequence (None if no limit)

In [164]:
# Clean
df.TEXT = vectorization.clean_notes(df, 'TEXT')

In [165]:
# Vectorize
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)

Vocabulary size: 57167
Average note length: 1716.31568862
Max note length: 8725


In [166]:
# Pad and turn into a matrix
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)

In [167]:
print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Final Vocabulary: 57167
Final Max Sequence Length: 5000


In [168]:
data[0:5] 

array([[   0,    0,    0, ..., 2909,   24,   83],
       [   0,    0,    0, ...,    1,  374,   38],
       [   0,    0,    0, ...,    1,    1,  772],
       [   0,    0,    0, ...,   32,  374,   38],
       [   0,    0,    0, ...,   69,  374,   38]], dtype=int32)

## Split into Sets

Here we split into sets and free up some memory

In [169]:
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data, labels, val_size=0.2, test_size=0.1, random_state=101)

In [170]:
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

Train:  (5844, 5000) (5844, 20)
Validation:  (1670, 5000) (1670, 20)
Test:  (836, 5000) (836, 20)


In [171]:
# Delete temporary variables to free some memory
del df, data, labels

## Creating Embedding Matrix

Creates an embedding matrix based on a pretrained vector

List of pretrained vectors http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/ for embedding Google cannot be downloaded, so I used Glove:    
- Go to https://nlp.stanford.edu/projects/glove/
- Download a pretrained model, e.g. `glove.6B.zip`, and put the unzipped files in `/data`

In [172]:
EMBEDDING_LOC = '../data/glove.6B.100d.txt' # location of embedding
EMBEDDING_DIM = 100 # given the glove that we chose

In [187]:
embedding_matrix, embedding_dict = vectorization.embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM,
                                                                  verbose = True, sigma = None)

Vocabulary in notes: 57167
Vocabulary in original embedding: 400000
Vocabulary intersection: 24757


In [188]:
embedding_matrix.shape

(57168, 100)

## Simple Neural Network

Simple Neural to show that it works
- softmax with categorical cross entropy and adam gave f1 = 0.1696042216358839

In [189]:
EMBEDDING_TRAINABLE = True

In [190]:
# We build the embedding layer separately because it's a little more complex than the others
embedding_layer = Embedding(len(dictionary) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=EMBEDDING_TRAINABLE)

In [202]:
sequence_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
embedded_sequences = Flatten()(embedded_sequences)
preds = Dense(len(top_codes), activation='sigmoid')(embedded_sequences)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [203]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=2, batch_size=128)

Train on 5844 samples, validate on 1670 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11f979ba8>

In [204]:
pred_val = model.predict(X_val, batch_size=128)

In [205]:
np.max(pred_val)

1.0

In [206]:
f1_score(y_val, np.where(pred_val>0.5, 1, 0), average = 'micro')

0.27048167970358172

Previous one

In [207]:
sequence_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
embedded_sequences = Flatten()(embedded_sequences)
preds = Dense(len(top_codes), activation='softmax')(embedded_sequences)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [208]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=2, batch_size=128)

Train on 5844 samples, validate on 1670 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11da7a710>

In [209]:
pred_val = model.predict(X_val, batch_size=128)

In [210]:
np.max(pred_val)

1.0

In [212]:
f1_score(y_val, np.where(pred_val>0.05, 1, 0), average = 'micro')

0.18699088145896656