In [112]:
import csv
import random
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import *
import re
from tensorflow.contrib import learn
import sys, os
import tensorflow as tf
import cnn_model

from sklearn.metrics import label_ranking_loss
from sklearn.metrics import f1_score
import shutil

General Sources:
http://ruder.io/deep-learning-nlp-best-practices/index.html#classification

### Reading File

In [4]:
#with open('../../../psql_files/disch_notes_all_icd9.csv', 'rb') as csvfile:
csv.field_size_limit(sys.maxsize)
with open('../baseline/psql_files/dis_notes_icd9.csv', 'rb') as csvfile:
    discharge_notes_reader = csv.reader(csvfile)
    discharge_notes_list = list(discharge_notes_reader)    
random.shuffle(discharge_notes_list)

print "Number of records in the dataset: ", len (discharge_notes_list)

Number of records in the dataset:  45837


We will take only 10,000 records to compare with the NN baseline (during development the next cell uses just 1,000).

In [34]:
# Development-sized subset: only the first 1,000 records of the shuffled list
# are used while the notebook is being built.
number_records = 1000

In [35]:
discharge_notes_icd9 = np.asarray(discharge_notes_list[0:number_records])
print 'Number of discharge clinical notes: ', len(discharge_notes_icd9)
discharge_notes= discharge_notes_icd9[:,3]
discharge_labels = discharge_notes_icd9[:,4]

Number of discharge clinical notes:  1000


## Pre Processing

## Stats about Notes  (TODO:)
* vocabulary of size
* find notes that are too long — outliers to remove (otherwise padding to the longest note adds a lot of zeros to every other note vector)

## Converting icd9 labels to vectors

In [36]:
#counts by icd9_codes
icd9_codes = Counter()
for label in discharge_labels:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
print icd9_codes

Counter({'4019': 444, '42731': 305, '41401': 301, '4280': 285, '5849': 205, '2724': 195, '25000': 190, '51881': 171, '5990': 169, '53081': 142, '2720': 135, '2859': 116, '486': 116, '2449': 109, '496': 94, '2851': 92, '2762': 92, '5070': 84, '99592': 80, '0389': 68})


In [37]:
# list of unique icd9_codes and lookups for its index in the vector
unique_icd9_codes = list (icd9_codes)
index_to_icd9 = dict(enumerate(unique_icd9_codes))
icd9_to_id = {v:k for k,v in index_to_icd9.iteritems()}
print 'List of unique icd9 codes from all labels: ', unique_icd9_codes

List of unique icd9 codes from all labels:  ['2859', '4019', '2724', '25000', '99592', '2851', '2762', '2449', '4280', '0389', '41401', '53081', '51881', '5990', '2720', '42731', '486', '5070', '496', '5849']


In [38]:
# Encode one label string as a multi-hot vector
def get_icd9_array(icd9_codes):
    """Turn a whitespace-separated string of ICD-9 codes into a 0/1 list.

    The result has one slot per entry of the notebook-level
    ``unique_icd9_codes``; a slot is 1 when the corresponding code appears in
    ``icd9_codes``. Positions come from the notebook-level ``icd9_to_id``
    lookup, so a code missing from that lookup raises ``KeyError``.
    """
    present = set(icd9_to_id[code] for code in icd9_codes.split())
    return [1 if slot in present else 0 for slot in range(len(unique_icd9_codes))]

In [39]:
# One multi-hot vector per discharge note, in the same order as the notes.
labels_vector = [get_icd9_array(label) for label in discharge_labels]

## Pre-processing notes

https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py


(1) Clean the text data using the same code as the original paper.
https://github.com/yoonkim/CNN_sentence

(2) Pad each note to the maximum note length (6,956 words for the 1,000-note sample used below). We append special <PAD> tokens to all shorter notes so that every note has that many words. Padding notes to the same length is useful because it allows us to batch our data efficiently, since each example in a batch must be of the same length.
(3) Build a vocabulary index and map each word to an integer id below the vocabulary size (28,272 for the same sample). Each note becomes a vector of integers.

In [40]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps: replace every character outside letters, digits and ( ) , ! ? ' `
    with a space; split common English clitics ('s, 've, n't, 're, 'd, 'll)
    off their host word; surround , ! ( ) ? with spaces; collapse whitespace
    runs; strip and lower-case.

    Fix relative to the upstream snippet: the replacement text for
    parentheses and question marks was written with a regex-style escape,
    which emitted a literal backslash in front of each of those characters in
    the cleaned text. Replacement strings are plain text, so no escape is
    used here.

    Returns the cleaned, lower-cased string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach clitics so that e.g. "don't" becomes "do n't".
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Pad punctuation with spaces so each mark becomes its own token.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [41]:
def note_preprocessing(data_notes):
    """Clean every note and report word-count statistics.

    Args:
        data_notes: iterable of raw note strings.

    Returns:
        A tuple ``(max_document_length, average_length, notes_clean)`` where
        the first two are the largest and the mean number of words per cleaned
        note, and ``notes_clean`` is the list of notes after ``strip()`` and
        ``clean_str``.

    For an empty ``data_notes`` the result is ``(0, 0.0, [])`` instead of the
    ``ValueError`` that ``max()`` raises on an empty sequence.
    """
    notes_clean = [clean_str(note.strip()) for note in data_notes]
    if not notes_clean:
        return 0, 0.0, notes_clean

    # split() without an argument counts an empty cleaned note as 0 words
    # (split(" ") reported 1); for non-empty notes the two agree because
    # clean_str leaves only single spaces between tokens.
    note_words_length = [len(note.split()) for note in notes_clean]
    max_document_length = max(note_words_length)
    average_length = np.mean(note_words_length)
    return max_document_length, average_length, notes_clean

In [42]:
#preprocess documents
max_document_length, average_document_length, notes_processed = note_preprocessing(discharge_notes)


print ' max document length: ', max_document_length
print 'average document length: ', average_document_length

#create vocabulary processor
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    
# convert words to ids, and each document is padded
notes_ids = np.array(list(vocab_processor.fit_transform(notes_processed)))

# vocabulary size
vocabulary_size = len(vocab_processor.vocabulary_)
print 'Vocabulary_size: ', vocabulary_size

 max document length:  6956
average document length:  1882.131
Vocabulary_size:  28272


### question?
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV   
What do we do if the test data contains a document longer than the maximum length used for padding?

### transforming to embeddings using word2vec

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"We pre-train our embeddings with word2vec on all discharge notes available in the MIMIC-III database.   
The word embeddings of all words in the text to classify are concatenated and used as input to the
convolutional layer. Convolutions detect a signal from a combination of adjacent inputs. We
combine multiple convolutions of different lengths to evaluate phrases that are anywhere from
two to five words long,"   

(tf-idf removes negations; embeddings take care of misspellings; we may need further training/fine-tuning because of medical terms)

https://code.google.com/archive/p/word2vec/
    
Pre-trained word and phrase vectors

"We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in [2]. The archive is available here: GoogleNews-vectors-negative300.bin.gz."   

### For now we will train our own embeddings, but pre-trained word2vec embeddings should work better

## Split Files

In [43]:
def split_file(data, train_frac = 0.7, dev_frac = 0.15):
    """Cut ``data`` into consecutive train / dev / test slices.

    The first ``train_frac`` of the items go to train, the next ``dev_frac``
    to dev, and whatever remains to test. Boundaries are truncated to whole
    indices with ``int()``. No shuffling happens here.

    Returns:
        ``(train_data, dev_data, test_data)`` as slices of ``data``.
    """
    total = len(data)
    train_end = int(train_frac * total)
    dev_end = int((train_frac + dev_frac) * total)
    return data[:train_end], data[train_end:dev_end], data[dev_end:]


train_notes, dev_notes, test_notes = split_file (notes_ids)
train_labels, dev_labels, test_labels = split_file (labels_vector)
print 'Training set samples:', len (train_notes)
print 'Dev set samples:', len (dev_notes)
print 'Test set samples:', len (test_notes)

Training set samples: 700
Dev set samples: 150
Test set samples: 150


## CNN Training

Here is an example of a CNN for text classification. Our model will use different values for the hyperparameters (embedding size d, region sizes, etc.).
<img src="CNN_for_text2.png"/>

This is the CNN used with the MIMIC discharge summaries
<img src="mimic_CNN_text_classification.png"/>

### sources:
http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/  
http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/   
https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py   
https://www.tensorflow.org/get_started/mnist/pros   
https://www.tensorflow.org/api_docs/python/tf/nn/conv2d   
 

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"For the CNN model, we used 100 filters for each of the widths 2, 3, 4, and 5.   
To prevent overfitting, we set the dropout probability to 0.5 and used L2-normalization to normalize word
embeddings to have a max norm of 3.64   
The model was trained using adadelta with an initial learning rate of 1 for 20 epochs"

In [147]:
def run_epoch(lm, session, X, y, batch_size):
    for batch in xrange(0, X.shape[0], batch_size):
        # x SHAPE:   [batch_size, sequence_length, embedding_size]
        print 'running batch: ', batch 
        X_batch = X[batch : batch + batch_size]
        y_batch = y[batch : batch + batch_size]
        feed_dict = {lm.input_x:X_batch,lm.input_y:y_batch,lm.dropout_keep_prob:0.5}
        #loss, train_op_value =  session.run( [lm.loss,lm.train],feed_dict=feed_dict ) 
        loss, _, step = session.run([lm.loss, lm.train_op, lm.global_step], feed_dict)
        print 'loss: ', loss

In [148]:
def predict_icd9_codes(lm, session, x_data, y_data, batch_size=500):
    """Run the model in inference mode over ``x_data`` and collect ``lm.y_hat``.

    Args:
        lm: model object exposing ``input_x``, ``input_y``,
            ``dropout_keep_prob`` and ``y_hat``.
        session: TensorFlow session bound to the model's graph.
        x_data: array of inputs; batches are taken along axis 0.
        y_data: labels aligned with ``x_data``; fed to ``lm.input_y`` as before.
        batch_size: rows per inference batch. This used to be read from a
            notebook-level global that is only defined in a later cell; it is
            now an explicit argument whose default matches the value that
            global had (500).

    Returns:
        List with one ``y_hat`` row per input row, in input order.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    total_y_hat = []
    for start in range(0, x_data.shape[0], batch_size):
        stop = start + batch_size
        feed_dict = {
            lm.input_x: x_data[start:stop],
            lm.input_y: y_data[start:stop],
            # keep probability 1.0: dropout is disabled while predicting
            lm.dropout_keep_prob: 1.0,
        }
        total_y_hat.extend(session.run(lm.y_hat, feed_dict=feed_dict))
    return total_y_hat

In [167]:
#build tensorflow graphs
# Re-import cnn_model so that edits to the module are picked up without
# restarting the kernel.
reload(cnn_model)

# Model parameters
model_params = dict(
    vocab_size=vocabulary_size,
    sequence_length=max_document_length,
    learning_rate=1.0,
    embedding_size=128,
    # One output per ICD-9 code found in the labels. Derived from the data
    # instead of the previous literal 20, so it cannot drift from the width
    # of the label vectors built by get_icd9_array.
    num_classes=len(unique_icd9_codes),
    filter_sizes=[2, 3, 4, 5],
    num_filters=100
)

# Build the core and training graphs (training itself runs in a later cell)
cnn = cnn_model.NNLM(**model_params)
cnn.BuildCoreGraph()
cnn.BuildTrainGraph()

In [168]:
# Directory for saved model files, and the path prefix under which the
# trained CNN is written by saver.save() in the training cell.
TF_SAVEDIR = "tf_saved"
trained_filename = os.path.join(TF_SAVEDIR, "cnn_trained")

In [187]:
# Rows per mini-batch and number of passes over the training set
# (20 epochs matches the setup quoted from the paper above).
batch_size = 500
num_epochs = 20


# Create the variable initializer and the Saver inside the model's own graph.
with cnn.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Start from an empty save directory: remove any previous one, then recreate it
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=cnn.graph) as session:
    session.run(initializer)
    #training
    for epoch_num in xrange(num_epochs):
        print 'epoch_num:' , epoch_num
        run_epoch(cnn, session, train_notes, train_labels, batch_size)
    # Persist the trained weights before running any prediction.
    saver.save(session, trained_filename)
    # Predictions are made inside the same session, i.e. with the weights just trained.
    print 'predicting training now '
    train_y_hat = predict_icd9_codes(cnn, session, train_notes, train_labels)   
    print 'predicting dev set now'
    dev_y_hat = predict_icd9_codes(cnn, session, dev_notes, dev_labels)
    print 'done!'



epoch_num: 0
running batch:  0
loss:  44.6726
running batch:  500
loss:  37.2105
epoch_num: 1
running batch:  0
loss:  40.1158
running batch:  500
loss:  34.6308
epoch_num: 2
running batch:  0
loss:  39.0987
running batch:  500
loss:  34.7382
epoch_num: 3
running batch:  0
loss:  37.4993
running batch:  500
loss:  33.7612
epoch_num: 4
running batch:  0
loss:  36.073
running batch:  500
loss:  31.124
epoch_num: 5
running batch:  0
loss:  35.0206
running batch:  500
loss:  30.9945
epoch_num: 6
running batch:  0
loss:  35.4492
running batch:  500
loss:  30.9623
epoch_num: 7
running batch:  0
loss:  35.1409
running batch:  500
loss:  30.8021
epoch_num: 8
running batch:  0
loss:  34.6098
running batch:  500
loss:  30.3799
epoch_num: 9
running batch:  0
loss:  35.2391
running batch:  500
loss:  31.1178
epoch_num: 10
running batch:  0
loss:  34.4539
running batch:  500
loss:  30.7931
epoch_num: 11
running batch:  0
loss:  34.8087
running batch:  500
loss:  32.2258
epoch_num: 12
running batch:

In [188]:
# Spot-check: predicted scores for the third training note (its gold labels
# are shown in the next cell).
# NOTE(review): the recorded output is 1.0 for every code, so the scores carry
# no ranking information for this note -- check the output layer / loss in
# cnn_model.
train_y_hat[2]

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.], dtype=float32)

In [189]:
# Gold multi-hot label vector for the same (third) training note.
train_labels[2]

[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## Performance Evaluation


In [190]:
# Ranking loss (sklearn label_ranking_loss) of the raw predicted scores against
# the gold multi-hot labels, for the training and dev splits. Lower is better.
# NOTE(review): the recorded values are ~1.0, which is consistent with the
# all-equal scores seen in the spot-check above -- confirm once the model
# output is fixed.
training_ranking_loss = label_ranking_loss(train_labels, train_y_hat)
print "Training ranking loss: ", training_ranking_loss
dev_ranking_loss = label_ranking_loss(dev_labels, dev_y_hat)
print "Development ranking loss: ", dev_ranking_loss

Training ranking loss:  0.999915966387
Development ranking loss:  1.0


In [191]:
# Binarise predicted scores with a fixed cut-off
def get_hot_vector (probs_list, threshold):
    """Convert rows of scores into rows of 0/1 flags.

    A score becomes 1 when it is strictly greater than ``threshold`` and 0
    otherwise. Returns a list with one list of ints per input row.
    """
    return [
        [1 if score > threshold else 0 for score in row]
        for row in probs_list
    ]

In [192]:
# Scores strictly above this cut-off count as predicted codes; the same
# cut-off is applied to the training and dev predictions.
threshold= 0.5
hot_y_hat = get_hot_vector(train_y_hat, threshold)
hot_dev_y_hat = get_hot_vector(dev_y_hat, threshold)

In [193]:
# Micro-averaged F1 on the training split: gold multi-hot labels vs. the
# thresholded predictions.
train_f1 = f1_score(np.array(train_labels), np.array(hot_y_hat), average='micro')
print "training f1 score: ", train_f1

training f1 score:  0.289658542545


In [194]:
# Micro-averaged F1 on the dev split: gold multi-hot labels vs. the
# thresholded predictions.
dev_f1 = f1_score(np.array(dev_labels), np.array(hot_dev_y_hat), average='micro')
print "dev f1 score: ", dev_f1

dev f1 score:  0.296906045984
