In [1]:
import csv
import random
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import *
import re
from tensorflow.contrib import learn

ImportError: No module named 'tensorflow'

General Sources:
http://ruder.io/deep-learning-nlp-best-practices/index.html#classification

### Reading File

In [None]:
with open('../baseline/psql_files/dis_notes_icd9.csv', 'rb') as csvfile:
    discharge_notes_reader = csv.reader(csvfile)
    discharge_notes_list = list(discharge_notes_reader)    
random.shuffle(discharge_notes_list)

print "Number of records in the dataset: ", len (discharge_notes_list)

We will take only 10,000 records so that the results can be compared with the NN baseline.

### Splitting Files

In [None]:
def split_file(data, train_frac = 0.7, dev_frac = 0.15):
    """Cut `data` into consecutive train / dev / test slices.

    The first `train_frac` of the items go to train, the next `dev_frac`
    to dev, and whatever remains to test. Boundaries are truncated to
    whole indices with int(). No shuffling happens here; the order of
    `data` is kept.

    Returns a (train, dev, test) tuple of slices of `data`.
    """
    total = len(data)
    train_end = int(train_frac * total)
    dev_end = int((train_frac + dev_frac) * total)
    return data[:train_end], data[train_end:dev_end], data[dev_end:]

In [None]:
# Development setting: work with only 1,000 records while the code is being
# written. The cell below keeps the first `number_records` rows of the
# shuffled dataset.
number_records = 1000

In [6]:

discharge_notes_nparray= np.asarray(discharge_notes_list[0:number_records])
print 'Number of discharge clinical notes: ', len(discharge_notes_nparray)

Number of discharge clinical notes:  1000


In [7]:
discharge_notes= discharge_notes_nparray[:,3]
discharge_labels = discharge_notes_nparray[:,4]
train_notes, dev_notes, test_notes = split_file (discharge_notes)
train_labels, dev_labels, test_labels = split_file (discharge_labels)
print 'Training set samples:', len (train_notes)
print 'Dev set samples:', len (dev_notes)
print 'Test set samples:', len (test_notes)

Training set samples: 700
Dev set samples: 150
Test set samples: 150


## Stats about Notes  (TODO:)
* vocabulary size
* find notes that are too long (outliers) and take them out; otherwise all the other note vectors get padded with a lot of zeroes to match them

In [8]:
number_words = []
for note in discharge_notes:
    number_words.append(len(note.split()))
print np.mean(number_words)

1640.448


## Converting icd9 labels to vectors

In [9]:
#counts by icd9_codes
icd9_codes = Counter()
for label in discharge_labels:
    for icd9_code in label.split():
        icd9_codes[icd9_code] += 1
print icd9_codes

Counter({'4019': 472, '42731': 307, '4280': 305, '41401': 266, '5849': 227, '2724': 194, '25000': 189, '51881': 160, '5990': 154, '53081': 147, '2720': 131, '2851': 125, '486': 113, '2449': 110, '2859': 106, '5070': 89, '2762': 88, '496': 84, '99592': 81, '0389': 77})


In [10]:
# list of unique icd9_codes and lookups for its index in the vector
unique_icd9_codes = list (icd9_codes)
index_to_icd9 = dict(enumerate(unique_icd9_codes))
icd9_to_id = {v:k for k,v in index_to_icd9.iteritems()}
print 'List of unique icd9 codes from all labels: ', unique_icd9_codes

List of unique icd9 codes from all labels:  ['2859', '99592', '4280', '2724', '25000', '2720', '2851', '2762', '2449', '4019', '0389', '41401', '42731', '5990', '53081', '486', '496', '5070', '51881', '5849']


In [11]:
# Encode one label string as a 0/1 vector over the known ICD-9 codes.
def get_icd9_array(icd9_codes):
    """Return a multi-hot list for a whitespace-separated string of codes.

    The result has one slot per entry of the module-level
    `unique_icd9_codes`; a slot is 1 when the matching code appears in
    `icd9_codes` and 0 otherwise. Positions come from the module-level
    `icd9_to_id` lookup, so a code missing from it raises KeyError.
    """
    multi_hot = [0] * len(unique_icd9_codes)
    for position in (icd9_to_id[code] for code in icd9_codes.split()):
        multi_hot[position] = 1
    return multi_hot

In [12]:
# Encode the label strings of each split as lists of multi-hot vectors.
train_labels_vector, dev_labels_vector, test_labels_vector = (
    list(map(get_icd9_array, split_labels))
    for split_labels in (train_labels, dev_labels, test_labels)
)

## Pre-processing notes

https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py


(1) Clean the text data using the same code as the original paper.
https://github.com/yoonkim/CNN_sentence

(2) Pad each note to the maximum note length, which turns out to be 8,280 tokens in the run recorded below. We append special `<PAD>` tokens to all other notes to bring them up to that length. Padding notes to the same length is useful because it allows us to batch our data efficiently, since each example in a batch must be of the same length.
(3) Build a vocabulary index and map each word to an integer between 0 and the vocabulary size (24,254 in the run recorded below). Each note becomes a vector of integers.

In [48]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' `
         becomes a space;
      2. the contraction suffixes 's 've n't 're 'd 'll are split off with
         a leading space;
      3. , ! ( ) ? are surrounded by spaces so they become separate tokens;
      4. runs of whitespace collapse to one space, and the result is
         stripped and lower-cased.

    Unlike the upstream version, parentheses and question marks are emitted
    bare. Upstream used replacement strings such as " \\( ", which re.sub
    leaves unchanged, so a literal backslash ended up in the cleaned text
    right after step 1 had removed all backslashes.

    Returns the cleaned, lower-cased string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Same suffixes and same order as the upstream substitutions.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    # One pass is enough: the padding only adds spaces, which never match.
    string = re.sub(r"([,!()?])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [66]:
def note_preprocessing(data_notes):
    notes_stripped = [s.strip() for s in data_notes]
    notes_clean = [clean_str(note) for note in notes_stripped ]
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in notes_clean])
    print ' max document length: ', max_document_length
    return max_document_length, notes_clean

In [67]:
# Clean the training notes and measure the longest one.
max_document_length, train_notes_processed = note_preprocessing(train_notes)

# The vocabulary processor is sized to that longest training note.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

# Fit the vocabulary on the training notes and turn each note into a
# sequence of word ids, padded to max_document_length.
train_notes_x = np.array(
    list(vocab_processor.fit_transform(train_notes_processed))
)

 max document length:  8280


In [75]:
# Number of entries in the vocabulary that was fitted on the training notes;
# passed to the CNN further down as its vocab_size.
vocabulary_size = len(vocab_processor.vocabulary_)
print 'Vocabulary_size: ', vocabulary_size

Vocabulary_size:  24254


### transforming to embeddings using word2vec

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"We pre-train our embeddings with word2vec on all discharge notes available in the MIMIC-III database.   
The word embeddings of all words in the text to classify are concatenated and used as input to the
convolutional layer. Convolutions detect a signal from a combination of adjacent inputs. We
combine multiple convolutions of different lengths to evaluate phrases that are anywhere from
two to five words long,"   

(tf-idf is removing negations; embeddings take care of misspellings; we may need further training/tuning because of the medical terms)

https://code.google.com/archive/p/word2vec/
    
Pre-trained word and phrase vectors

"We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in [2]. The archive is available here: GoogleNews-vectors-negative300.bin.gz."   

### For now we will train our own embeddings, but word2vec will be better

## CNN Training

Here is an example of a CNN used to classify text. Our model will use different hyperparameter values (embedding size d, region sizes, etc.).
<img src="CNN_for_text2.png"/>

This is the CNN used with the MIMIC discharge summaries
<img src="mimic_CNN_text_classification.png"/>

### sources:
http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/  
https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py   
https://www.tensorflow.org/get_started/mnist/pros   
https://www.tensorflow.org/api_docs/python/tf/nn/conv2d   
 

In [42]:
import tensorflow as tf
import cnn_model

From: "A Comparison of Rule-Based and Deep Learning Models for Patient Phenotyping"

"For the CNN model, we used 100 filters for each of the widths 2, 3, 4, and 5.   
To prevent overfitting, we set the dropout probability to 0.5 and used L2-normalization to normalize word
embeddings to have a max norm of 3.64   
The model was trained using adadelta with an initial learning rate of 1 for 20 epochs"

In [76]:
#build tensorflow graphs
# Re-import cnn_model so edits to the module are picked up without
# restarting the kernel.
reload(cnn_model)

# Model parameters
model_params = dict(
    vocab_size=vocabulary_size,
    sequence_length=max_document_length,
    embedding_size=128,
    # One output per ICD-9 code. Derived from the code list rather than
    # hard-coded (it was 20) so it always matches the width of the label
    # vectors produced by get_icd9_array.
    num_classes=len(unique_icd9_codes),
    filter_sizes=[3,4,5],
    num_filters=100)

# Instantiate the model and build its core and training graphs.
# Training itself runs in the cells below.
cnn = cnn_model.NNLM(**model_params)
cnn.BuildCoreGraph()
cnn.BuildTrainGraph()

In [81]:
def run_epoch(lm, session, X, y, batch_size):
    """Run one training pass over (X, y) in consecutive mini-batches.

    Args:
      lm: model object. Its input_x, input_y and dropout_keep_prob
        attributes are used as feed keys, and train_op, global_step and
        loss as fetches.
      session: object whose run(fetches, feed_dict) executes the fetches
        (a tf.Session in this notebook).
      X: array of examples, sliced along its first axis. In this notebook
        it holds padded word-id sequences, one row per note.
      y: labels aligned with X, sliced with the same bounds.
      batch_size: number of examples per batch; the last batch may be
        smaller.

    Returns:
      A list with the loss value fetched for each batch, in order. It is
      empty when X has no rows. Earlier versions returned None and threw
      the loss away, which left no way to see whether training progressed.
    """
    batch_losses = []
    for batch in range(0, X.shape[0], batch_size):
        # Single-argument print is valid on Python 2 and 3 and writes the
        # same text as the former print statement.
        print('running batc:  %d' % batch)
        X_batch = X[batch : batch + batch_size]
        y_batch = y[batch : batch + batch_size]
        # Dropout keep probability is fixed at 0.5 for training.
        feed_dict = {lm.input_x:X_batch,lm.input_y:y_batch,lm.dropout_keep_prob:0.5}
        _, step, loss = session.run([lm.train_op, lm.global_step, lm.loss], feed_dict)
        batch_losses.append(loss)
    return batch_losses

In [82]:
# Training inputs: padded word-id matrix and the matching multi-hot labels.
X = train_notes_x
y = train_labels_vector
# Mini-batch size and number of full passes over the training set.
batch_size = 50
num_epochs = 2

# The initializer op is created with the model's graph as the default graph.
with cnn.graph.as_default():
    initializer = tf.global_variables_initializer()

# Open a session on the model's graph; it is closed when the block exits.
# NOTE(review): nothing is saved or evaluated before the session closes --
# confirm whether a checkpoint or dev-set evaluation is still to be added.
with tf.Session(graph=cnn.graph) as session:
    session.run(initializer)
    # Training: one run_epoch call per epoch over the whole training set.
    for epoch_num in xrange(num_epochs):
        print 'epoch_num:' , epoch_num
        run_epoch(cnn, session, X, y, batch_size)


epoch_num: 0
running batc:  0
running batc:  50
running batc:  100
running batc:  150
running batc:  200
running batc:  250
running batc:  300
running batc:  350
running batc:  400
running batc:  450
running batc:  500
running batc:  550
running batc:  600
running batc:  650
epoch_num: 1
running batc:  0
running batc:  50
running batc:  100
running batc:  150
running batc:  200
running batc:  250
running batc:  300
running batc:  350
running batc:  400
running batc:  450
running batc:  500
running batc:  550
running batc:  600
running batc:  650
