In [21]:
import csv
import random
import numpy as np
from sklearn.feature_extraction.text import *
import pickle
import tensorflow as tf
import nn_model
from sklearn.metrics import label_ranking_loss
from collections import Counter, defaultdict

## File built from MIMIC III database

The dis_notes_icd9.csv file was created from the MIMIC III database by filtering and joining a couple of tables. For details on the pre-processing phase: https://github.com/letslego/W266-ProjectMIMIC/tree/master/baseline

In [2]:
# Read every row of the pre-joined MIMIC-III extract into memory.
# 'rb' is the mode the Python 2 csv module expects for its input file.
with open('psql_files/dis_notes_icd9.csv', 'rb') as csvfile:
    discharge_notes_reader = csv.reader(csvfile)
    discharge_notes_list = list(discharge_notes_reader)
# Shuffle with a dedicated, fixed-seed generator so that the train/dev/test
# split (and every metric reported below) is the same after Restart & Run All,
# without disturbing the state of the module-level `random` generator.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(discharge_notes_list)

## Sample of a discharge summary and its ICD-9 codes

In [3]:
print 'Sample of a discharge note:'
print "-" *100
admission_id, subject_id, discharge_date, note_text, icd9_codes = discharge_notes_list[1]
print "Admission id: ", admission_id
print "Subject id:", subject_id
print "Discharge date:", discharge_date
print "ICD9 codes assigned to this discharge summary: ", icd9_codes
print "-" *100
print "Discharge Summary Clinical Note: "
print "-" *100
#print note_text 
print "MIMIC data is visible to only authorized users"


Sample of a discharge note:
----------------------------------------------------------------------------------------------------
Admission id:  180733
Subject id: 32517
Discharge date: 2196-11-02 00:00:00
ICD9 codes assigned to this discharge summary:  4019 25000 496 4280
----------------------------------------------------------------------------------------------------
Discharge Summary Clinical Note: 
----------------------------------------------------------------------------------------------------
MIMIC data is visible to only authorized users


# Simple Baseline

This is a multilabel classification problem, since each discharge summary has several ICD-9 codes assigned. The simplest baseline is to predict the same top-N most frequent ICD-9 codes for every discharge summary.

### Splitting the data into training, dev and test sets

(Note: converting `discharge_notes_list` into a NumPy array needs about 8 GB of RAM, even though the file is only about 450 MB, so here we work with plain Python lists.)

In [16]:
def split_file(data, train_frac = 0.7, dev_frac = 0.15):
    """Cut `data` into consecutive train / dev / test slices.

    Args:
        data: any sliceable sequence supporting len().
        train_frac: fraction of the items placed in the training slice.
        dev_frac: fraction of the items placed in the dev slice; whatever
            remains after train and dev becomes the test slice.

    Returns:
        A (train, dev, test) tuple of slices of `data`, in the original order.
    """
    total = len(data)
    # Boundaries are truncated toward zero, so rounding leftovers end up in
    # the later slices.
    train_end = int(train_frac * total)
    dev_end = int((train_frac + dev_frac) * total)
    return data[:train_end], data[train_end:dev_end], data[dev_end:]

In [46]:
notes= [row[3] for row in discharge_notes_list]
labels = [row[4] for row in discharge_notes_list]
train_data_notes, dev_data_notes, test_data_notes = split_file (notes)
train_data_labels, dev_data_labels, test_data_labels = split_file (labels)
print 'Training set samples:', len (train_data_notes)
print 'Dev set samples:', len (dev_data_notes)
print 'Test set samples:', len (test_data_notes)

Training set samples: 32085
Dev set samples: 6876
Test set samples: 6876


### Finding out list of unique icd9 codes and the top 4 

In [50]:
# finding out the top icd9 codes
icd9_codes = Counter()
for label in labels:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
top_4_icd9 = icd9_codes.most_common(4)
print "most common 4 icd9_codes: ", top_4_icd9

top_4_icd9_label = ' '.join(code for code,count in top_4_icd9 )
print 'label for the top 4 icd9 codes: ', top_4_icd9_label

most common 4 icd9_codes:  [('4019', 20721), ('4280', 13507), ('42731', 13150), ('41401', 12672)]
label for the top 4 icd9 codes:  4019 4280 42731 41401


In [56]:
# list of unique icd9_codes and lookups for its index in the vector
unique_icd9_codes = list (icd9_codes)
index_to_icd9 = dict(enumerate(unique_icd9_codes))
icd9_to_id = {v:k for k,v in index_to_icd9.iteritems()}
print 'List of unique icd9 codes from all labels: ', unique_icd9_codes

List of unique icd9 codes from all labels:  ['2859', '4019', '2724', '25000', '99592', '2851', '2762', '2449', '4280', '0389', '41401', '53081', '5849', '2720', '42731', '486', '496', '5070', '51881', '5990']


### Converting icd9 labels to vectors

Each discharge note label is a list of icd9 codes, for example: 
```
4019 25000 496 4280   
```
we will represent it as a vector for the multilabel classification process, the vector would look like:
```
[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
```

In [51]:
#transforming list of icd_codes into a vector
def get_icd9_array(icd9_codes):
    """Encode a space-separated string of ICD-9 codes as a 0/1 indicator list.

    The result has one entry per code in the notebook-level
    `unique_icd9_codes`; an entry is 1 when the code at that position (per
    `icd9_to_id`) appears in `icd9_codes`, otherwise 0.

    Raises:
        KeyError: if a code is not present in `icd9_to_id`.
    """
    active_positions = set(icd9_to_id[code] for code in icd9_codes.split())
    return [1 if position in active_positions else 0
            for position in range(len(unique_icd9_codes))]

In [53]:
# Encode the four most frequent ICD-9 codes as an indicator vector; this single
# vector is what the simple baseline predicts for every discharge summary.
icd9_prediction_vector = get_icd9_array(top_4_icd9_label)
print 'icd9 prediction vector: ', icd9_prediction_vector

icd9 prediction vector:  [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]


In [57]:
# true icd9 codes to vector
train_data_labels_vector= list(map(get_icd9_array, train_data_labels))
dev_data_labels_vector = list(map(get_icd9_array, dev_data_labels))
print 'example of a training label vector: ', train_data_labels_vector[0]

example of a training label vector:  [0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [54]:
# Baseline predictions: the same top-4 indicator vector for every discharge
# summary, one entry per gold label vector.
# List repetition stores references to the one `icd9_prediction_vector` object,
# which is fine here because the predictions are only read, never modified.
train_y_hat_baseline = [icd9_prediction_vector]* len (train_data_labels_vector)
dev_y_hat_baseline = [icd9_prediction_vector]* len (dev_data_labels_vector)

### Performance evaluation

In [55]:
# Score the constant baseline with scikit-learn's label ranking loss
# (first argument: true indicator vectors, second: predicted scores).
# Lower is better; 0 means every true label is ranked above every false one.
training_ranking_loss = label_ranking_loss(train_data_labels_vector, train_y_hat_baseline)
print "Training ranking loss: ", training_ranking_loss
dev_ranking_loss = label_ranking_loss(dev_data_labels_vector, dev_y_hat_baseline)
print "Development ranking loss: ", dev_ranking_loss

Training ranking loss:  0.651733663207
Development ranking loss:  0.65130016997


#  NN Baseline

### Note: 
The input file is about 450 MB and contains 45,837 discharge summaries. The Google Cloud instance needed 3 CPUs (~8 GB of memory) to hold it in a NumPy array, and TensorFlow needs NumPy arrays as input. For the NN baseline we therefore work with only the first 5,000 records.

In [13]:
# the full file needs about 8GB of memory, let's work with the first 5000
#discharge_notes= np.asarray(discharge_notes_list)
discharge_notes= np.asarray(discharge_notes_list[0:5000])
print 'Number of discharge clinical notes: ', len(discharge_notes)

Number of discharge clinical notes:  5000


In [17]:
# Pull the label column (4) and the note-text column (3) out of the sampled
# row array.
# The labels must be extracted BEFORE `discharge_notes` is rebound: once that
# name refers to the list of note strings, row[4] would pick the fifth
# character of each note instead of its ICD-9 label.
# Because `discharge_notes` is rebound here, re-run the cell that builds the
# row array before re-running this one.
discharge_labels = [row[4] for row in discharge_notes]
discharge_notes = [row[3] for row in discharge_notes]

In [18]:
# Split the sampled notes and their labels with split_file's default
# fractions; both lists are cut at the same indices, so note i still lines up
# with label i inside each split.
train_notes, dev_notes, test_notes = split_file (discharge_notes)
train_labels, dev_labels, test_labels = split_file (discharge_labels)
print 'Training set samples:', len (train_notes)
print 'Dev set samples:', len (dev_notes)
print 'Test set samples:', len (test_notes)

Training set samples: 3500
Dev set samples: 750
Test set samples: 750


###  TF-IDF Representation of discharge clinical notes

Previous research represents these documents/notes as bag-of-words vectors [1].    
In particular, it keeps the 10,000 tokens with the largest tf-idf scores in the training set.
 
[1] Diagnosis code assignment: models and evaluation metrics. Journal of the American Medical Informatics Association.

In [9]:
# Vocabulary size: passed to the TF-IDF vectorizer as max_features and to the
# network as its input dimension (vocabulary_size).
max_number_features = 10000

In [10]:
# TfidfVectorizer settings used here:
# - lowercasing and tokenization are left at their defaults
# - max_features: keep only the top max_features terms, ordered by term
#   frequency across the corpus
# - stop_words='english': drop the built-in English stop-word list
# - max_df=0.9: ignore terms that appear in more than 90% of the documents
vectorizer = TfidfVectorizer(max_features=max_number_features,stop_words='english',max_df=0.9 )
# Fit the vocabulary and IDF weights on the training notes only, then reuse
# them unchanged for the dev notes.
train_notes_vector = vectorizer.fit_transform(train_notes)
dev_notes_vector = vectorizer.transform(dev_notes)

### Transforming list of ICD codes to vectors

In [12]:
# Encode the ICD-9 label string of every note in each split as an indicator
# vector (one list per note).
train_labels_vector = list(get_icd9_array(codes) for codes in train_labels)
dev_labels_vector = list(get_icd9_array(codes) for codes in dev_labels)
test_labels_vector = list(get_icd9_array(codes) for codes in test_labels)

### Neural Network for Multilabel classification

In [13]:
def run_epoch(lm, session, X, y, batch_size):
    """Run one training pass over (X, y) in consecutive mini-batches.

    Args:
        lm: model object exposing the graph nodes `x`, `target_y`, `loss`
            and `train`.
        session: object whose `run(fetches, feed_dict=...)` evaluates those
            nodes (a tf.Session in this notebook).
        X: feature rows; must expose `.shape[0]` and support row slicing.
        y: targets aligned row-for-row with X; must support slicing.
        batch_size: maximum number of rows fed per step; the last batch may
            be smaller.

    Returns:
        The unweighted mean of the per-batch loss values, so callers can
        monitor convergence, or None when X has no rows. Callers that ignore
        the return value behave exactly as before.
    """
    batch_losses = []
    # range (rather than xrange) keeps the loop valid on Python 2 and 3.
    for start in range(0, X.shape[0], batch_size):
        stop = start + batch_size
        feed_dict = {lm.x: X[start:stop], lm.target_y: y[start:stop]}
        # lm.train is fetched for its effect (presumably the optimizer step
        # defined in nn_model); only the loss value is kept.
        batch_loss, _ = session.run([lm.loss, lm.train], feed_dict=feed_dict)
        batch_losses.append(batch_loss)
    if not batch_losses:
        return None
    return sum(batch_losses) / float(len(batch_losses))
    

In [14]:
def predict_icd9_codes(lm, session, x_data, y_dim, batch_size=50):
    """Evaluate the model's `y_hat` node over `x_data` in mini-batches.

    Args:
        lm: model object exposing the graph nodes `x` and `y_hat`.
        session: object whose `run(fetches, feed_dict=...)` evaluates those
            nodes (a tf.Session in this notebook).
        x_data: feature rows; must expose `.shape[0]` and support row slicing.
        y_dim: unused; kept so existing call sites keep working.
        batch_size: maximum number of rows fed per step. It used to be read
            from a notebook-level global; it is now an explicit argument
            whose default matches the value the notebook uses for training.

    Returns:
        A list with one `y_hat` output per row of `x_data`, in input order.
    """
    total_y_hat = []
    # range (rather than xrange) keeps the loop valid on Python 2 and 3.
    for start in range(0, x_data.shape[0], batch_size):
        X_batch = x_data[start:start + batch_size]
        y_hat_out = session.run(lm.y_hat, feed_dict={lm.x: X_batch})
        total_y_hat.extend(y_hat_out)
    return total_y_hat
    

In [25]:
# Build the TensorFlow graphs.
# Reload first so that edits made to nn_model.py are picked up without
# restarting the kernel.
reload(nn_model)

# Model parameters
Hidden_dims = [100]
learning_rate = 0.01
y_dim = len(unique_icd9_codes)  # one output per distinct ICD-9 code
model_params = {
    'Hidden_dims': Hidden_dims,
    'learning_rate': learning_rate,
    'vocabulary_size': max_number_features,  # input width = TF-IDF features
    'y_dim': y_dim,
}

lm = nn_model.NNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

    

In [26]:
# Densify the sparse TF-IDF training matrix once; the same dense matrix is
# reused for training and for the training-set predictions below, instead of
# converting (and holding in memory) a second dense copy.
X = train_notes_vector.todense()
y = train_labels_vector
batch_size = 50
num_epochs = 50

# The variable initializer has to be created inside the model's own graph.
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()

with tf.Session(graph=lm.graph) as session:
    session.run(initializer)
    # Training: num_epochs full passes over the training data.
    for epoch_num in range(num_epochs):
        run_epoch(lm, session, X, y, batch_size)
    # Predictions must be made while the session (and its trained weights)
    # is still open.
    train_y_hat = predict_icd9_codes(lm, session, X, y_dim)
    dev_y_hat = predict_icd9_codes(lm, session, dev_notes_vector.todense(), y_dim)


### Performance Evaluation

In [27]:
# Score the network's outputs with the same label ranking loss used for the
# simple baseline (lower is better). Note that these assignments overwrite
# the baseline's training_ranking_loss / dev_ranking_loss values.
training_ranking_loss = label_ranking_loss(train_labels_vector, train_y_hat)
print "Training ranking loss: ", training_ranking_loss
dev_ranking_loss = label_ranking_loss(dev_labels_vector, dev_y_hat)
print "Development ranking loss: ", dev_ranking_loss

Training ranking loss:  0.328117881062
Development ranking loss:  0.324885189983


The following paper "ICD-9 Coding of Discharge Summaries"  worked with the MIMIC II database to classify ICD9-codes, 
the ranking loss metrics reported are:
<img src="paper_ranking_loss_scores.png">
