# Clean(er) Pipeline

This notebook is an attempt to merge the pipelines from Zenobia and Guillaume.

## Importing Modules

In [215]:
# General imports
import numpy as np
import pandas as pd
import re, nltk, string, os
from sklearn.model_selection import train_test_split
import ggplot, datetime, time
import matplotlib
from collections import Counter
from matplotlib import pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

In [216]:
# NN imports
# Upgrade the package called dask
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Embedding
from keras.models import Model

In [217]:
# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Select data corresponding to the top ICD codes

Here, we filter for only the top `N_TOP` ICD codes.

Note: The `filter_empty` option excludes notes that do not contain any of the top codes. However, it may actually be more rigorous to keep them (open question).

In [218]:
# Configuration: how many of the top ICD9 codes to keep
N_TOP = 20

# Load the discharge notes; the column names are supplied explicitly via `names`
df = pd.read_csv(
    '../data/disch_notes_all_icd9.csv',
    names=['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT'],
)

In [219]:
# Restrict the frame to the N_TOP top ICD9 codes and get back the list of those codes.
# filter_empty=True requests that notes containing none of the top codes be excluded.
df, top_codes = database_selection.filter_top_codes(df, 'ICD9', N_TOP, filter_empty = True)

In [220]:
# Size of the frame (rows, columns) after filtering
df.shape

(43992, 5)

In [221]:
# Peek at the first five retained codes
top_codes[:5]

['4019', '4280', '42731', '41401', '5849']

In [222]:
# Preview the first rows; ICD9 holds the space-separated codes kept for each note
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,5849,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,4019 2851,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,51881 486,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,4019 486,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,4019 41401 25000 2720 2859,Admission Date: [**2162-5-16**] ...


## Vectorize ICD9 codes

Here we vectorize the ICD9 codes into a multi-hot `np.array` (one column per top code), because that is the format TensorFlow prefers.

In [223]:
# Encode each note's ICD9 string as a 0/1 vector with one position per entry of top_codes
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)

In [224]:
# First five label vectors (a row may contain several 1s)
labels[:5]

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

## Clean, Vectorize and Pad Notes

Here, we clean and vectorize the text, then pad with 0s so that all notes have the same length.

In [225]:
# Inputs for tokenization. Both start as None (no limit) and are later overwritten
# with the values actually used by the vectorize / pad steps.
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = None # to limit length of word sequence (None if no limit)

In [226]:
# Replace the raw note text with its cleaned version
df['TEXT'] = vectorization.clean_notes(df, 'TEXT')

In [227]:
# Turn every note into a sequence of word indices; also returns the word
# dictionary and the vocabulary size that was actually used
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(
    df['TEXT'],
    MAX_VOCAB,
    verbose=True,
)

Vocabulary size: 130488
Average note length: 1728.09244863
Max note length: 10924


In [228]:
# Pad the vectorized notes to a common length and turn them into a matrix.
# The second return value replaces MAX_SEQ_LENGTH with the length actually used.
# NOTE(review): `data` is overwritten with its own padded version, so re-running this
# cell on its own feeds already-padded input back in -- re-run from the vectorize cell.
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)

In [229]:
# Report the vocabulary size and sequence length that the pipeline settled on
print("Final Vocabulary: %s" % (MAX_VOCAB,))
print("Final Max Sequence Length: %s" % (MAX_SEQ_LENGTH,))

Final Vocabulary: 130488
Final Max Sequence Length: 10924


In [230]:
# First five padded sequences (zeros are the padding value)
data[:5]

array([[   0,    0,    0, ..., 2764,   25,   82],
       [   0,    0,    0, ...,    1,  379,   38],
       [   0,    0,    0, ...,    1,    1,  803],
       [   0,    0,    0, ...,   32,  379,   38],
       [   0,    0,    0, ...,   68,  379,   38]], dtype=int32)

## Split into Sets

Here we split into sets and free up some memory

In [231]:
# Hold out 20% for validation and 10% for testing; the seed is fixed for reproducibility
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data,
    labels,
    val_size=0.2,
    test_size=0.1,
    random_state=101,
)

In [232]:
# Show the feature / label shapes of every split
for split_name, features, targets in (
    ("Train: ", X_train, y_train),
    ("Validation: ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(split_name, features.shape, targets.shape)

Train:  (30794, 10924) (30794, 20)
Validation:  (8798, 10924) (8798, 20)
Test:  (4400, 10924) (4400, 20)


In [233]:
# Drop the pre-split objects to release memory; only the split arrays are used from here on.
# (Running this cell a second time raises NameError because the names are already gone.)
del df
del data
del labels

## Creating Embedding Matrix

Creates an embedding matrix based on pretrained word vectors.

List of pretrained vectors: http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/. The Google embedding could not be downloaded, so I used GloVe:    
- Go to https://nlp.stanford.edu/projects/glove/
- Download a pretrained model, e.g. `glove.6B.zip`, and put the unzipped files in `../data` (relative to this notebook)

In [234]:
# GloVe settings: the vector width has to match the file that is loaded
EMBEDDING_DIM = 100  # dimensionality of the chosen GloVe vectors
EMBEDDING_LOC = '../data/glove.6B.100d.txt'  # path to the pretrained embedding file

In [235]:
# Build the embedding matrix for the notes' vocabulary from the pretrained file
embedding_matrix, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC,
    dictionary,
    EMBEDDING_DIM,
    verbose=True,
)

Vocabulary in notes: 130488
Vocabulary in original embedding: 400000
Vocabulary intersection: 36214


In [236]:
# Expect (vocabulary size + 1, EMBEDDING_DIM); the extra row is presumably
# reserved for the padding index 0 -- confirm in vectorization.embedding_matrix
embedding_matrix.shape

(130489, 100)

## Simple Neural Network

A simple neural network to show that the pipeline works.

In [238]:
# The embedding layer is configured on its own since it needs more setup than the other layers
embedding_layer = Embedding(
    input_dim=len(dictionary) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQ_LENGTH,
    trainable=False,  # frozen for now; making it trainable is a likely next step
)

In [240]:
# Multi-label classifier: a note can carry several of the top ICD9 codes at once
# (the label rows are multi-hot), so each code gets its own independent sigmoid
# output rather than competing with the others inside a softmax.
sequence_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
embedded_sequences = Flatten()(embedded_sequences)
preds = Dense(len(top_codes), activation='sigmoid')(embedded_sequences)

model = Model(sequence_input, preds)
# binary_crossentropy scores every code as a separate yes/no decision, which is
# consistent with thresholding each output at 0.5 when the predictions are evaluated.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# Train for a single epoch, tracking performance on the validation split
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=1,
    batch_size=128,
)

In [None]:
# Predicted scores for every validation note, one column per top code
pred_val = model.predict(X_val, batch_size=128)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Micro-averaged F1 on the validation split: scores above 0.5 count as a predicted code
f1_score(y_val, (pred_val > 0.5).astype(int), average='micro')