In [6]:
import csv
import random
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import *
import re
from tensorflow.contrib import learn
import sys, os
import tensorflow as tf
import cnn_model
import utils

from sklearn.metrics import label_ranking_loss
from sklearn.metrics import f1_score
import shutil

General Sources:
http://ruder.io/deep-learning-nlp-best-practices/index.html#classification

### Reading File

In [7]:
#with open('../../../psql_files/disch_notes_all_icd9.csv', 'rb') as csvfile:
csv.field_size_limit(sys.maxsize)
with open('../baseline/psql_files/dis_notes_icd9.csv', 'rb') as csvfile:
    discharge_notes_reader = csv.reader(csvfile)
    discharge_notes_list = list(discharge_notes_reader)    
random.shuffle(discharge_notes_list)

print "Number of records in the dataset: ", len (discharge_notes_list)

Number of records in the dataset:  45837


We will take only 10,000 records to compare with the NN baseline (the next cell temporarily uses 1,000 while the code is being developed).

In [8]:
# Number of shuffled records to keep for the experiment.
# Starting with 1,000 just for programming; the note above mentions 10,000
# for the comparison with the NN baseline.
number_records = 1000

In [9]:
discharge_notes_icd9 = np.asarray(discharge_notes_list[0:number_records])
print 'Number of discharge clinical notes: ', len(discharge_notes_icd9)
discharge_notes= discharge_notes_icd9[:,3]
discharge_labels = discharge_notes_icd9[:,4]

Number of discharge clinical notes:  1000


## Pre Processing

## Stats about Notes  (TODO:)
* vocabulary of size
* find notes that are too long (outliers) and take them out; otherwise padding to the longest note adds a lot of zeroes to all the other note vectors

## Converting icd9 labels to vectors

In [10]:
def get_icd9_array(icd9_codes):
    """Encode a whitespace-separated string of ICD-9 codes as a multi-hot list.

    Relies on two notebook-level names that must exist when this is called:
    ``unique_icd9_codes`` (its length is the vector length) and
    ``icd9_to_id`` (maps a code to its position in the vector).

    Args:
        icd9_codes: string of ICD-9 codes separated by whitespace.

    Returns:
        A list with one int per entry of ``unique_icd9_codes``: 1 at the
        position of every code found in ``icd9_codes``, 0 elsewhere.

    Raises:
        KeyError: if a code is not present in ``icd9_to_id``.
    """
    multi_hot = [0] * len(unique_icd9_codes)
    for position in (icd9_to_id[code] for code in icd9_codes.split()):
        multi_hot[position] = 1
    return multi_hot

In [11]:
#counts by icd9_codes
icd9_codes = Counter()
for label in discharge_labels:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
print icd9_codes

# list of unique icd9_codes and lookups for its index in the vector
unique_icd9_codes = list (icd9_codes)
index_to_icd9 = dict(enumerate(unique_icd9_codes))
icd9_to_id = {v:k for k,v in index_to_icd9.iteritems()}
print '  '
print 'List of unique icd9 codes from all labels: ', unique_icd9_codes

#convert icd9 codes into ids
labels_vector= list(map(get_icd9_array,discharge_labels))

Counter({'4019': 468, '4280': 294, '41401': 281, '42731': 253, '5849': 215, '2724': 202, '25000': 202, '51881': 185, '5990': 156, '2720': 145, '53081': 143, '2859': 113, '2851': 113, '486': 107, '2449': 106, '5070': 91, '0389': 90, '496': 89, '99592': 84, '2762': 77})
  
List of unique icd9 codes from all labels:  ['2859', '99592', '4019', '2724', '25000', '2720', '2851', '2762', '2449', '4280', '0389', '41401', '42731', '51881', '53081', '486', '496', '5070', '5849', '5990']


## Pre-processing notes

https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py


(1) Clean the text data using the same code as the original paper.
https://github.com/yoonkim/CNN_sentence

(2) Pad each note to the maximum note length (7,929 words in the run recorded below). We append special <PAD> tokens to all shorter notes so that every note has that many words. Padding notes to the same length is useful because it allows us to efficiently batch our data, since each example in a batch must be of the same length.
(3) Build a vocabulary index and map each word to an integer between 0 and the vocabulary size (23,128 in the run recorded below). Each note becomes a vector of integers.

In [45]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside letters, digits and ( ) , ! ? ' ` becomes a space;
      2. the contraction suffixes 's 've n't 're 'd 'll are split off the
         preceding word with a space;
      3. the punctuation marks , ! ( ) ? are surrounded by spaces;
      4. runs of whitespace collapse to one space, and the result is stripped
         and lower-cased.

    Difference from the original: the replacement text for "(", ")" and "?"
    no longer starts with a backslash. ``re.sub`` leaves such an unknown escape
    in a replacement untouched, so the original emitted a backslash in front of
    each of those marks instead of the bare punctuation token.

    Args:
        string: the raw text to clean.

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    # Keep only the characters the tokenizer knows about.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Literal replacements: no regex escaping involved, so no stray backslashes.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def note_preprocessing(data_notes):
    """Clean and canonicalize every note and report length statistics.

    Each note is stripped, passed through ``clean_str``, split on single
    spaces, handed to ``utils.canonicalize_words`` and re-joined with spaces.
    Lengths are counted as the number of space-separated pieces of the result.

    Args:
        data_notes: iterable of raw note strings.

    Returns:
        A tuple ``(max_document_length, average_length, notes_canonicalized)``:
        the longest note length, the mean note length (``np.mean``) and the
        list of processed note strings, in input order.
    """
    notes_canonicalized = []
    note_words_length = []
    for raw_note in data_notes:
        cleaned_note = clean_str(raw_note.strip())
        canonical_note = " ".join(utils.canonicalize_words(cleaned_note.split(" ")))
        notes_canonicalized.append(canonical_note)
        note_words_length.append(len(canonical_note.split(" ")))

    return max(note_words_length), np.mean(note_words_length), notes_canonicalized

In [46]:
#preprocess documents
max_document_length, average_document_length, notes_processed = note_preprocessing(discharge_notes)


print ' max document length: ', max_document_length
print 'average document length: ', average_document_length

#create vocabulary processor
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    
# convert words to ids, and each document is padded
notes_ids = np.array(list(vocab_processor.fit_transform(notes_processed)))

# vocabulary size
vocabulary_size = len(vocab_processor.vocabulary_)
print 'Vocabulary_size: ', vocabulary_size

 max document length:  7929
average document length:  1961.516
Vocabulary_size:  23128


In [47]:
# Sanity check: display the first preprocessed note.
notes_processed[0]

"admission date DGDGDGDG DG DG discharge date DGDGDGDG DG DG date of birth DGDGDGDG DG DGDG sex f service medicine history of present illness the patient is a DGDG year old woman with a history of coronary artery disease status post myocardial infarction in DGDGDGDG , status post recent left anterior descending stent with an ejection fraction of DGDG , hypertension , known carotid stenosis , who was admitted to trauma surgical intensive care unit on DG DG , after falling after a blackout and hitting her head the patient had trauma to the head and face the patient had one to two minutes of loss of consciousness the patient denies preceding chest pain , shortness of breath , lightheadedness , dizziness , diaphoresis , visual loss and vertigo she has no history of syncope or loss of consciousness although she had an episode of transient visual loss in the setting of taking sublingual nitroglycerin on last admission the patient had no post ictal confusion in the emergency department , a he

### question?
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV   
what do we do if the test data has a document with a bigger length than the max for the padding? 

### transforming to embeddings using word2vec

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"We pre-train our embeddings with word2vec on all discharge notes available in the MIMIC-III database.   
The word embeddings of all words in the text to classify are concatenated and used as input to the
convolutional layer. Convolutions detect a signal from a combination of adjacent inputs. We
combine multiple convolutions of different lengths to evaluate phrases that are anywhere from
two to five words long,"   

(tf-idf is removing negations; embeddings take care of misspellings. We may need further training/tuning because of medical terms.)

https://code.google.com/archive/p/word2vec/
    
Pre-trained word and phrase vectors

"We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in [2]. The archive is available here: GoogleNews-vectors-negative300.bin.gz."   

### For now we will train our own embeddings, but word2vec will be better

## Split Files

In [48]:
def split_file(data, train_frac = 0.7, dev_frac = 0.15):
    """Split a sequence into consecutive train / dev / test slices.

    No shuffling is done here: the first ``train_frac`` of ``data`` becomes the
    training slice, the next ``dev_frac`` the dev slice, and whatever remains
    the test slice. Cut points are truncated to integers.

    Args:
        data: any sliceable sequence with a length (list, numpy array, ...).
        train_frac: fraction of the items assigned to the training slice.
        dev_frac: fraction of the items assigned to the dev slice.

    Returns:
        A tuple ``(train_data, dev_data, test_data)`` of slices of ``data``.
    """
    first_cut = int(train_frac * len(data))
    second_cut = int ((train_frac + dev_frac)* len(data))
    return data[:first_cut], data[first_cut:second_cut], data[second_cut:]


train_notes, dev_notes, test_notes = split_file (notes_ids)
train_labels, dev_labels, test_labels = split_file (labels_vector)
print 'Training set samples:', len (train_notes)
print 'Dev set samples:', len (dev_notes)
print 'Test set samples:', len (test_notes)

Training set samples: 700
Dev set samples: 150
Test set samples: 150


## CNN Training

here is an example of a CNN to classify text.. our model will have different values for d (embedding-size, region sizes, etc)
<img src="CNN_for_text2.png"/>

This is the CNN used with the MIMIC discharge summaries
<img src="mimic_CNN_text_classification.png"/>


"For the CNN model, we used 100 filters for each of the widths 2, 3, 4, and 5.    
To prevent overfitting, we set the dropout probability to 0.5 and used L2-normalization to normalize word
embeddings to have a max norm of 3 [64].     
The model was trained using adadelta with an initial learning rate of 1 for 20 epochs.   
The CNN model was implemented using Lua and the Torch7 framework [66].    
All baseline models were implemented using Python with the scikit-learn library."

### sources:
http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/  
http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/   
https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py   
https://www.tensorflow.org/get_started/mnist/pros   
https://www.tensorflow.org/api_docs/python/tf/nn/conv2d   
 
 multi-label
 https://github.com/may-/cnn-re-tf/blob/master/cnn.py

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"For the CNN model, we used 100 filters for each of the widths 2, 3, 4, and 5.   
To prevent overfitting, we set the dropout probability to 0.5 and used L2-normalization to normalize word
embeddings to have a max norm of 3 [64].   
The model was trained using adadelta with an initial learning rate of 1 for 20 epochs"

In [49]:
def run_epoch(lm, session, X, y, batch_size, dropout_keep_prob=0.5):
    """Run one training pass over ``X``/``y`` in consecutive mini-batches.

    For every batch the model's ``loss`` and ``train_op`` are run and the batch
    offset and loss are printed. The data is not shuffled here; the last batch
    may be smaller than ``batch_size``.

    Args:
        lm: model object exposing ``input_x``, ``input_y``,
            ``dropout_keep_prob``, ``loss`` and ``train_op``.
        session: object with a ``run(fetches, feed_dict)`` method
            (a ``tf.Session`` in this notebook).
        X: array of inputs, sliced along its first axis (assumed to be
            (n_examples, sequence_length) word ids — TODO confirm against
            cnn_model).
        y: labels, sliceable in step with ``X``.
        batch_size: number of examples per batch.
        dropout_keep_prob: value fed to ``lm.dropout_keep_prob`` while
            training; defaults to 0.5, the value that used to be hard-coded.

    Returns:
        None.
    """
    # range works on both Python 2 and 3; the number of batches is small.
    for batch in range(0, X.shape[0], batch_size):
        X_batch = X[batch : batch + batch_size]
        y_batch = y[batch : batch + batch_size]
        feed_dict = {
            lm.input_x: X_batch,
            lm.input_y: y_batch,
            lm.dropout_keep_prob: dropout_keep_prob,
        }
        # The global step was fetched before but never used, so it is no
        # longer requested.
        loss, _ = session.run([lm.loss, lm.train_op], feed_dict)
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('batch: %d, loss: %5.5f' % (batch, loss))

In [50]:
def predict_icd9_codes(lm, session, x_data, y_data, batch_size=50):
    """Run the model's ``y_hat`` over ``x_data`` in batches and collect the rows.

    Dropout is disabled for prediction (keep probability 1.0 is fed).

    Args:
        lm: model object exposing ``input_x``, ``input_y``,
            ``dropout_keep_prob`` and ``y_hat``.
        session: object with a ``run(fetches, feed_dict=...)`` method
            (a ``tf.Session`` in this notebook).
        x_data: array of inputs, sliced along its first axis.
        y_data: labels, sliceable in step with ``x_data``; they are fed to
            ``lm.input_y`` exactly as before.
        batch_size: number of examples per batch. This used to be read from a
            notebook-level ``batch_size`` defined in a later cell; it is now an
            explicit argument whose default matches that cell's value (50).
            It only controls how much data is fed at once.

    Returns:
        A list with one ``y_hat`` row per example, in input order.
    """
    total_y_hat = []
    # range works on both Python 2 and 3; the number of batches is small.
    for batch in range(0, x_data.shape[0], batch_size):
        X_batch = x_data[batch : batch + batch_size]
        Y_batch = y_data[batch : batch + batch_size]
        feed_dict = {
            lm.input_x: X_batch,
            lm.input_y: Y_batch,
            lm.dropout_keep_prob: 1.0,
        }
        y_hat_out = session.run(lm.y_hat, feed_dict=feed_dict)
        total_y_hat.extend(y_hat_out)
    return total_y_hat

In [51]:
#build tensorflow graphs
# Re-import cnn_model so edits to the module are picked up without restarting
# the kernel.
reload(cnn_model)

# Model parameters
# Filter widths, filter count and learning rate follow the paper quoted above.
model_params = dict(
    vocab_size=vocabulary_size,
    sequence_length=max_document_length,
    learning_rate=1.0,
    embedding_size=128,
    # One output per distinct ICD-9 code, so the model always matches the
    # length of the label vectors (20 for the run recorded in this notebook)
    # instead of relying on a hard-coded count.
    num_classes=len(unique_icd9_codes),
    filter_sizes=[2, 3, 4, 5],
    num_filters=100,
)

# Build and Train Model
cnn = cnn_model.NNLM(**model_params)
cnn.BuildCoreGraph()
cnn.BuildTrainGraph()

In [52]:
# Directory (relative to the notebook) that is wiped and recreated before
# training, and the checkpoint path prefix passed to the saver after training.
TF_SAVEDIR = "tf_saved"
trained_filename = os.path.join(TF_SAVEDIR, "cnn_trained")

In [53]:
batch_size = 50
num_epochs = 5


with cnn.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=cnn.graph) as session:
    session.run(initializer)
    #training
    for epoch_num in xrange(num_epochs):
        print 'epoch_num:' , epoch_num
        run_epoch(cnn, session, train_notes, train_labels, batch_size)
    saver.save(session, trained_filename)
    print 'predicting training now '
    train_y_hat = predict_icd9_codes(cnn, session, train_notes, train_labels)   
    print 'predicting dev set now'
    dev_y_hat = predict_icd9_codes(cnn, session, dev_notes, dev_labels)
    print 'done!'



epoch_num: 0
batch: 0, loss: 41.75618
batch: 50, loss: 43.49678
batch: 100, loss: 44.85231
batch: 150, loss: 47.05762
batch: 200, loss: 41.92564
batch: 250, loss: 38.48314
batch: 300, loss: 33.19728
batch: 350, loss: 37.97263
batch: 400, loss: 38.17122
batch: 450, loss: 37.48780
batch: 500, loss: 44.62095
batch: 550, loss: 43.39772
batch: 600, loss: 36.83818
batch: 650, loss: 35.93096
epoch_num: 1
batch: 0, loss: 32.72146
batch: 50, loss: 35.67733
batch: 100, loss: 39.73853
batch: 150, loss: 43.30456
batch: 200, loss: 34.22692
batch: 250, loss: 36.67789
batch: 300, loss: 30.86436
batch: 350, loss: 32.74348
batch: 400, loss: 35.99274
batch: 450, loss: 39.22559
batch: 500, loss: 41.25626
batch: 550, loss: 46.58323
batch: 600, loss: 33.44085
batch: 650, loss: 33.82124
epoch_num: 2
batch: 0, loss: 34.25573
batch: 50, loss: 37.50615
batch: 100, loss: 42.85917
batch: 150, loss: 47.14633
batch: 200, loss: 36.98771
batch: 250, loss: 39.02951
batch: 300, loss: 35.77827
batch: 350, loss: 40.5374

In [59]:
# Compare the first training example's true multi-hot labels with the scores
# the model produced for it, then print the sum of those scores.
print train_labels[0]
print train_y_hat[0]
print sum (train_y_hat[0])

[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[  1.12788871e-06   1.11937751e-11   2.94459522e-01   1.68707527e-06
   4.85506680e-05   2.66462870e-08   2.13917624e-03   2.69456701e-09
   3.77299926e-07   6.53665960e-01   5.19855867e-14   1.84163284e-02
   2.39666998e-02   2.96574039e-03   1.37168263e-05   6.22122070e-06
   2.89892884e-11   8.31788967e-08   4.31381073e-03   9.51213337e-07]
0.999999981916


## Performance Evaluation


In [81]:
# ranking loss
training_ranking_loss = label_ranking_loss(train_labels, train_y_hat)
print "Training ranking loss: ", training_ranking_loss
dev_ranking_loss = label_ranking_loss(dev_labels, dev_y_hat)
print "Development ranking loss: ", dev_ranking_loss

Training ranking loss:  0.315731861733
Development ranking loss:  0.330642802286


## TODO  create a model for thresholding

Large-scale Multi-label Text Classification—Revisiting Neural Networks


"3.3 Thresholding
Once training of the neural network is finished, its output may be interpreted as a probability
distribution $p(o \mid x)$ over the labels for a given document $x$. The probability distribution
can be used to rank labels, but additional measures are needed in order to split
the ranking into relevant and irrelevant labels. For transforming the ranked list of labels
into a set of binary predictions, we train a multi-label threshold predictor from training
data. This sort of thresholding methods are also used in [6, 31]
For each document $x_m$, labels are sorted by the probabilities in decreasing order.
Ideally, if NNs successfully learn a mapping function $f$, all correct (positive) labels
will be placed on top of the sorted list and there should be large margin between the set
of positive labels and the set of negative labels. Using F1 score as a reference measure,
we calculate classification performances at every pair of successive positive labels and
choose a threshold value $t_m$ that produces the best performance"

In [54]:
def get_f1_score(y_true,y_hat,threshold, average):
    """F1 score of thresholded scores against the true multi-hot labels.

    Args:
        y_true: true labels, one multi-hot row per example.
        y_hat: predicted scores with the same layout as ``y_true``.
        threshold: a score strictly greater than this counts as a predicted
            label (1); anything else counts as 0.
        average: passed through to sklearn's ``f1_score`` as ``average``.

    Returns:
        Whatever ``f1_score`` returns for the binarized predictions.
    """
    scores = np.array(y_hat)
    binarized = np.where(scores > threshold, 1, 0)
    truth = np.array(y_true)
    return f1_score(truth, binarized, average=average)

In [60]:
print 'F1 scores'
print 'threshold | training | dev  '
f1_score_average = 'micro'
for threshold in [ 0.005, 0.01,0.02,0.03,0.04,0.05,0.06, 0.1, 0.5]:
    train_f1 = get_f1_score(train_labels, train_y_hat,threshold,f1_score_average)
    dev_f1 = get_f1_score(dev_labels, dev_y_hat,threshold,f1_score_average)
    print '%1.3f:      %1.3f      %1.3f' % (threshold,train_f1, dev_f1)

F1 scores
threshold | training | dev  
0.005:      0.350      0.340
0.010:      0.339      0.330
0.020:      0.321      0.299
0.030:      0.306      0.291
0.040:      0.301      0.285
0.050:      0.292      0.285
0.060:      0.287      0.280
0.100:      0.276      0.264
0.500:      0.200      0.192


## Thoughts so far

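The CNN loss is stuck; the model is not learning much. The best micro-averaged F1 score is about 35%, which is close both to the baseline model that always predicts the top 4 most common ICD-9 codes and to the NN baseline. Note that the predicted scores for a note sum to 1 (see the example output above), so the output layer appears to behave like a softmax; in a multi-label setting this makes the labels compete with each other, which may explain why very low thresholds give the best F1 (TODO: check the output layer and loss in cnn_model).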

There is a LSTM model by this paper: "Applying Deep Learning to ICD-9 Multi-label Classification from Medical Records" which did achieve a 42% F1-score. (https://cs224d.stanford.edu/reports/priyanka.pdf)

(note:  I think the CNN should be getting about 40% also)

The "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping" study did get a 70% F1-score, but it does not use the ICD-9 labels; it uses phenotype labels the authors annotated themselves (via a group of medical professionals). (https://arxiv.org/abs/1703.08705). There were ONLY 10 phenotypes.

The discharge summaries are labeled with ICD-9 codes that are leaves in the ICD-9 hierarchy (which has hundreds of ICD-9 codes), so these leaf nodes may be too specific and difficult to predict. One experiment would be to replace all the ICD-9 codes with their parent at the second or third level of the hierarchy and see if predictions work better that way.