# Embedding

In [1]:
import numpy as np
import pandas as pd
import re, nltk, string
from sklearn.model_selection import train_test_split
import ggplot, datetime, time
import matplotlib
from collections import Counter
from matplotlib import pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

  from pandas.lib import Timestamp
  from pandas.core import datetools


## Select data for top ICD code

In [2]:
n_top = 20

After some tests, we find that it is preferable to look for the top values before converting the top column to a list. In the spirit of not modifying the coding too early, we keep the ICD9 column as a string too.

In [3]:
df = pd.read_csv('../data/disch_notes_all_icd9.csv',
                 names = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9','TEXT'])

In [4]:
df.shape

(52696, 5)

In [5]:
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,25013 3371 5849 5780 25063 5363 4580 25043 403...,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,53100 2851 07054 5715 45621 53789 4019 53550 7823,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,49320 51881 486 20300 2761 7850 3090,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,56081 5570 9973 486 4019,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,41401 99604 4142 25000 27800 4148 4111 2859 40...,Admission Date: [**2162-5-16**] ...


In [6]:
def find_top_codes(df, col_name, n):
    """ Find the top codes from a columns of strings
    Returns a list of strings to make sure codes are treated as classes down the line """
    string_total = df[col_name].str.cat(sep=' ')
    counter_total = Counter(string_total.split(' '))
    return [word for word, word_count in counter_total.most_common(n)]

In [7]:
def select_codes_in_string(string, top_codes):
    """ Creates a sring of the codes which are both in the original string
    and in the top codes list """
    r = ''
    for code in top_codes:
        if code in string:
            r += ' ' + code
    return r.strip()

In [8]:
def filter_top_codes(df, col_name, n, filter_empty = True):
    """ Creates a dataframe with the codes column containing only the top codes
    and filters out the lines without any of the top codes if True
    
    Note: we may actually want to keep even the empty lines """
    r = df.copy()
    top_codes = find_top_codes(r, col_name, n)
    r[col_name] = r[col_name].apply(lambda x: select_codes_in_string(x, top_codes))
    if filter_empty:
        r = r.loc[r[col_name] != '']
    return r, top_codes

In [9]:
df, top_codes = filter_top_codes(df, 'ICD9', n_top, filter_empty = True)

In [10]:
top_codes

['4019',
 '4280',
 '42731',
 '41401',
 '5849',
 '25000',
 '2724',
 '51881',
 '5990',
 '53081',
 '2720',
 '2859',
 '2449',
 '486',
 '2851',
 '2762',
 '496',
 '99592',
 '5070',
 '0389']

In [11]:
df.shape

(43992, 5)

In [12]:
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,5849,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,4019 2851,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,51881 486,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,4019 486,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,4019 41401 25000 2720 2859,Admission Date: [**2162-5-16**] ...


## Resources for vectorizing the text

Pretrained:   
- http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/   
- https://code.google.com/archive/p/word2vec/downloads # Cannot be downloaded   
- https://github.com/3Top/word2vec-api   
- https://nlp.stanford.edu/projects/glove/ # using glove       

Methods   
- https://keras.io/layers/embeddings/   
- https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

## Implementation of the Keras Method with Glove (based on Keras blog)

In [13]:
def vectorize_code(x, code_list):
    r = []
    for code in code_list:
        if code in x: r.append(1)
        else: r.append(0)
    return np.asarray(r)

In [14]:
df.ICD9_vector = df.ICD9.apply(lambda x: vectorize_code(x, top_codes))

In [15]:
df.ICD9_vector[0:5]

0    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
2    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...
3    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
4    [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...
Name: ICD9, dtype: object

In [16]:
# Install Keras and Tensorflow
# Upgrade the package called dask
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.TEXT)
sequences = tokenizer.texts_to_sequences(df.TEXT)

In [18]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 218822 unique tokens.


In [19]:
data = pad_sequences(sequences)

In [20]:
#labels = keras.utils.to_categorical(np.asarray(df.ICD9_vector))
labels = np.transpose(np.column_stack(df.ICD9_vector))

In [21]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (43992, 12292)
Shape of label tensor: (43992, 20)


In [22]:
VALIDATION_SPLIT = 0.2
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
# That's the technique used in the blog post, but we should use the sklearn function here

In [23]:
embeddings_index = {}
f = open('../data/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [24]:
EMBEDDING_DIM = 100 # given the glove that we chose
MAX_SEQUENCE_LENGTH = data.shape[1] # Max of 13000 words per note should probably be reduced

In [25]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

## Simple Neural Network

In [37]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False) # Here put as not trainable, but probably should after

In [38]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Model

In [39]:
x_val.shape 

(8798, 12292)

In [40]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
embedded_sequences = Flatten()(embedded_sequences)
preds = Dense(len(top_codes), activation='softmax')(embedded_sequences)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])


model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)

Train on 35194 samples, validate on 8798 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11f77ac50>

In [41]:
pred_val = model.predict(x_val, batch_size=128)

In [47]:
from sklearn.metrics import f1_score

In [55]:
f1_score(y_val, np.where(pred_val>0.5,1,0), average = 'micro')

0.15180615117472179