In [1]:
import csv
import random
import numpy as np
from sklearn.feature_extraction.text import *
import pickle
import tensorflow as tf
import nn_model

## Note: 
The input file is about 500 MB.
The Google instance needed 4 CPUs (~15 GB of memory); holding the full file takes about 8 GB of that.

## File built from MIMIC III database

The dis_notes_icd9.csv file was created from the MIMIC III database by filtering and joining a couple of tables. For details on the pre-processing phase: https://github.com/letslego/W266-ProjectMIMIC/tree/master/baseline

In [2]:
# Read every row of the discharge-notes CSV into memory as a list of rows.
# 'rb' is the file mode the Python 2 csv module expects for its input.
with open('psql_files/dis_notes_icd9.csv', 'rb') as csvfile:
    discharge_notes_reader = csv.reader(csvfile)
    # Materialise the reader while the file is still open.
    discharge_notes_list = list(discharge_notes_reader)
    


In [3]:
# the full file needs about 8GB of memory
#discharge_notes= np.asarray(discharge_notes_list)
# Only the first 5000 rows are converted here; the commented line above
# converts every row instead.
discharge_notes= np.asarray(discharge_notes_list[0:5000])

In [4]:
# Row count of the discharge_notes array currently in memory.
print 'Number of discharge clinical notes: ', len(discharge_notes)

Number of discharge clinical notes:  5000


Number of discharge clinical notes:  45837


In [53]:
# Show one row (index 1) of the dataset, field by field.
print 'Sample of a discharge note:'
print "-" *100
# Each row unpacks into exactly five columns, in this order.
admission_id, subject_id, discharge_date, note_text, icd9_codes = discharge_notes[1]
print "Admission id: ", admission_id
print "Subject id:", subject_id
print "Discharge date:", discharge_date
print "ICD9 codes assigned to this discharge summary: ", icd9_codes
print "-" *100
print "Discharge Summary Note: "
print "-" *100
print note_text


Sample of a discharge note:
----------------------------------------------------------------------------------------------------
Admission id:  100003
Subject id: 54610
Discharge date: 2150-04-21 00:00:00
ICD9 codes assigned to this discharge summary:  4019 2851
----------------------------------------------------------------------------------------------------
Discharge Summary Note: 
----------------------------------------------------------------------------------------------------
Admission Date:  [**2150-4-17**]              Discharge Date:   [**2150-4-21**] Date of Birth:  [**2090-5-19**]             Sex:   M Service: MEDICINE Allergies: Patient recorded as having No Known Allergies to Drugs Attending:[**First Name3 (LF) 12174**] Chief Complaint: coffee ground emesis Major Surgical or Invasive Procedure: EGD Right IJ CVL History of Present Illness: Mr. [**Known lastname 52368**] is a 59M w HepC cirrhosis c/b grade I/II esophageal varices and portal gastropathy (last EGD [**3-/215

## Creating training, dev and test dataset

In [5]:
# Splitting 70% for training, 15% for development and 15% for testing.
#
# np.random.shuffle permutes the rows of a 2-D array in place. The builtin
# random.shuffle must NOT be used on an ndarray with more than one dimension:
# its `x[i], x[j] = x[j], x[i]` swap operates on row *views*, so row i is
# overwritten with row j and the original row i is lost, leaving duplicated
# and missing notes in the dataset.
np.random.seed(0)  # fixed seed so the split is reproducible across runs
np.random.shuffle(discharge_notes)
train_frac = 0.7
dev_frac = 0.15
num_notes = len(discharge_notes)
train_split_idx = int(train_frac * num_notes)
dev_split_idx = int((train_frac + dev_frac) * num_notes)
train_discharge_notes = discharge_notes[:train_split_idx]
dev_discharge_notes = discharge_notes[train_split_idx:dev_split_idx]
test_discharge_notes = discharge_notes[dev_split_idx:]

In [56]:
# Report how many rows ended up in each of the three splits.
print 'Training set samples:', len (train_discharge_notes)
print 'Dev set samples:', len (dev_discharge_notes)
print 'Test set samples:', len (test_discharge_notes)

Training set samples: 3500
Dev set samples: 750
Test set samples: 750


In [6]:
# split file into data and its labels
def separate_labels(data):
    """Split a 2-D dataset array into its note and label columns.

    Args:
        data: 2-D array; column 3 is returned as the notes and column 4 as
            the labels.

    Returns:
        tuple: (notes, labels), each a 1-D slice of `data`.
    """
    note_column = data[:, 3]
    label_column = data[:, 4]
    return note_column, label_column
# Pull the note-text and label columns out of each of the three splits.
train_notes, train_labels = separate_labels(train_discharge_notes)
dev_notes, dev_labels = separate_labels(dev_discharge_notes)
test_notes, test_labels = separate_labels(test_discharge_notes)

## TF-IDF Representation of discharge clinical notes

Previous research represents these documents/notes as bag-of-words vectors [1].    
In particular, it keeps the 10,000 tokens with the largest tf-idf scores in the training set.
 
[1] Diagnosis code assignment: models and evaluation metrics. Journal of the American Medical Informatics Association.

In [8]:
# Feature cap: passed to TfidfVectorizer as max_features and to the model
# parameters as vocabulary_size.
max_number_features = 10000

In [9]:
# TfidfVectorizer
# Convert all characters to lowercase before tokenizing (by default)
# tokenization (by default)
# max_features: consider the top max_features ordered by term frequency across the corpus
# stop_words='english': drop scikit-learn's built-in English stop-word list
# max_df=0.9: ignore terms that appear in more than 90% of the documents
vectorizer = TfidfVectorizer(max_features=max_number_features,stop_words='english',max_df=0.9 )  
# The vocabulary and IDF weights are fitted on the training notes only ...
train_notes_vector = vectorizer.fit_transform(train_notes)
# ... and the dev notes are projected onto that fitted vocabulary.
dev_notes_vector = vectorizer.transform(dev_notes)

## transforming list of ICD codes to vector

In [21]:
#transforming list of icd_codes into a vector
        
def get_unique_icd9_codes(train_labels, dev_labels, test_labels):
    """Collect the distinct ICD-9 codes present in the three label arrays.

    Every label entry is a whitespace-separated string of codes. The three
    arrays are joined along axis 0 and each entry is split into its codes.

    Returns:
        set: every distinct code found across train, dev and test labels.
    """
    all_label_rows = np.concatenate((train_labels, dev_labels, test_labels), axis=0)
    return {code for label_row in all_label_rows for code in label_row.split()}


def get_icd9_array(icd9_codes):
    """Encode a whitespace-separated string of ICD-9 codes as a 0/1 list.

    Reads the module-level `unique_icd9_codes` (for the vector length) and
    `icd9_to_id` (code -> position); both must exist before this is called.

    Returns:
        list of int: one slot per known code, 1 where the code is present.
    """
    num_codes = len(unique_icd9_codes)
    multi_hot = [0] * num_codes
    for code in icd9_codes.split():
        position = icd9_to_id[code]
        multi_hot[position] = 1
    return multi_hot


unique_icd9_codes = get_unique_icd9_codes(train_labels, dev_labels, test_labels)
# Enumerate the codes in sorted order rather than in set-iteration order, so
# that a given code always gets the same label index regardless of interpreter
# or hash settings. Stored predictions/models then stay decodable across runs.
sorted_icd9_codes = sorted(unique_icd9_codes)
index_to_icd9 = dict(enumerate(sorted_icd9_codes))
# Inverse lookup: ICD-9 code -> label index.
icd9_to_id = {icd9_code: index for index, icd9_code in enumerate(sorted_icd9_codes)}
    

In [57]:
# Turn every label string into a 0/1 list via get_icd9_array; each result is a
# plain Python list of lists, one inner list per note.
train_labels_vector= list(map(get_icd9_array, train_labels))
dev_labels_vector = list(map(get_icd9_array, dev_labels))
test_labels_vector = list(map(get_icd9_array, test_labels))

## Neural Network for Multilabel classification

In [39]:
def run_epoch(lm, session, X, y, batch_size):
    """Run one pass of mini-batch training over (X, y).

    Args:
        lm: model object exposing `x`, `target_y`, `loss` and `train`; the
            first two are used as feed keys, the last two as fetches.
        session: object whose `run(fetches, feed_dict=...)` executes one step.
        X: features, sliceable by row and exposing a `.shape[0]` row count.
        y: targets, sliceable in step with X.
        batch_size: rows fed per step; the final batch may be smaller.

    Returns:
        list: the value fetched for `lm.loss` at each step, in batch order
        (empty when X has no rows). Earlier versions fetched the loss and
        threw it away, so training could not be monitored; callers that
        ignore the return value are unaffected.
    """
    batch_losses = []
    # `range` (not `xrange`) so the function runs on both Python 2 and 3.
    for start in range(0, X.shape[0], batch_size):
        stop = start + batch_size
        feed_dict = {lm.x: X[start:stop], lm.target_y: y[start:stop]}
        # lm.train is fetched only for its effect on the session
        # (presumably the optimizer step - confirm in nn_model).
        loss, _ = session.run([lm.loss, lm.train], feed_dict=feed_dict)
        batch_losses.append(loss)
    return batch_losses
    

In [42]:
def predict_icd9_codes(lm, session, x_data, y_dim, batch_size=50):
    """Run the model over `x_data` in mini-batches and collect its outputs.

    Args:
        lm: model object exposing `x` (feed key) and `y_hat` (fetch).
        session: object whose `run(fetch, feed_dict=...)` evaluates a batch.
        x_data: features, sliceable by row and exposing `.shape[0]`.
        y_dim: unused; kept so existing call sites keep working.
        batch_size: rows evaluated per step. This used to be read from a
            module-level global defined in a later cell, which made the
            function fail on a fresh kernel; it is now an explicit parameter
            whose default matches the value that global held.

    Returns:
        list: one entry per row of `x_data`, in input order, as produced by
        evaluating `lm.y_hat`.
    """
    total_y_hat = []
    # `range` (not `xrange`) so the function runs on both Python 2 and 3.
    for start in range(0, x_data.shape[0], batch_size):
        X_batch = x_data[start : start + batch_size]
        y_hat_out = session.run(lm.y_hat, feed_dict={lm.x: X_batch})
        total_y_hat.extend(y_hat_out)
    return total_y_hat
    

In [43]:
#build tensorflow graphs
# Re-import nn_model so edits to the module take effect without a kernel restart.
reload(nn_model)

# Model parameters
# NOTE(review): Hidden_dims presumably lists the hidden-layer sizes - confirm in nn_model.
Hidden_dims = [10, 10]
learning_rate = 0.001
# y_dim is the number of distinct ICD-9 codes found in the dataset.
y_dim = len(unique_icd9_codes)
# vocabulary_size is set to the TF-IDF feature cap (max_number_features).
model_params = dict(Hidden_dims=Hidden_dims, 
                    learning_rate = learning_rate, vocabulary_size =max_number_features , y_dim=y_dim)

# NNLM and its Build* methods are defined in nn_model, which is not shown in
# this notebook; later cells rely on lm.graph, lm.x, lm.target_y, lm.loss,
# lm.train and lm.y_hat being available after these calls.
lm = nn_model.NNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

    

In [58]:
# Dense copy of the sparse training TF-IDF matrix. It is built once and reused
# for both training and the training-set predictions below, instead of being
# densified a second time.
X = train_notes_vector.todense()
y = train_labels_vector
batch_size = 50
num_epochs = 100

# The variable initializer has to be created inside the model's own graph.
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()

with tf.Session(graph=lm.graph) as session:
    session.run(initializer)
    #training
    for epoch_num in xrange(num_epochs):
        run_epoch(lm, session, X, y, batch_size)
    #prediction using training and dev data 
    # Predictions are made inside the session block, while it is still open.
    train_y_hat = predict_icd9_codes(lm, session, X, y_dim)
    dev_y_hat = predict_icd9_codes(lm, session, dev_notes_vector.todense(), y_dim)


## Performance Evaluation

In [None]:
# TODO

In [75]:
def apply_threshold(y_hat, threshold=0.49):
    """Binarise one vector of predicted scores.

    Args:
        y_hat: iterable of numeric scores for a single sample.
        threshold: cut-off; a score strictly greater than it maps to 1.
            Defaults to 0.49, the value that was previously hard-coded, so
            existing one-argument calls behave exactly as before.

    Returns:
        list of int: 1 where the score exceeds `threshold`, otherwise 0.
    """
    return [1 if score > threshold else 0 for score in y_hat]

In [77]:
# Thresholded prediction for the first training note.
apply_threshold(train_y_hat[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [61]:
# True label vector of the first training note, for comparison with its prediction.
train_labels_vector[0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]

### dev

In [78]:
# Thresholded prediction for the first dev note.
apply_threshold(dev_y_hat[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [63]:
# True label vector of the first dev note, for comparison with its prediction.
dev_labels_vector[0]

[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]