In [1]:
import csv
import re
import string

In [2]:
from timeit import default_timer as timer

In [None]:
def preprocess_text(text):
    """
    Normalise one clinical note and split it into tokens.

    Steps, in order: trim and lowercase; replace numbers with ``[NUM]``;
    replace de-identification tags of the form ``[**...**]`` with ``[PHI]``;
    set runs of three or more dashes/underscores apart (the later steps
    then remove them); collapse repeated sentence punctuation; turn ``-``
    and ``/`` into ``_``; delete the remaining punctuation (square brackets
    and underscores are kept so the placeholders survive); collapse all
    whitespace.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    list of str
        Tokens of the cleaned note, never containing empty strings.
        Empty or whitespace-only input yields an empty list.
    """
    text = text.strip().lower()
    # The decimal alternative must come first: alternation is tried left to
    # right, so with `\d+` first "98.6" was turned into two placeholders.
    text = re.sub(r'\d+\.\d+|\d+', '[NUM]', text)

    # PHI tags are in format [**INFORMATION**]
    text = re.sub(r'(\[\*\*.*?\*\*\])', '[PHI]', text)

    text = re.sub(r'---+', '\n\n-----\n\n', text)
    text = re.sub(r'___+', '\n\n_____\n\n', text)
    # Keep only the last character of a run of sentence punctuation.
    text = re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', text)
    # NOTE(review): this pattern matches a literal '+', not backslashes;
    # kept as in the original — confirm which one was intended.
    text = re.sub(r'\+', ' ', text)
    text = re.sub(r'-|/', '_', text)
    text = re.sub(r'_+', '_', text)
    text = re.sub(r'\*|\(|\)', ' ', text)
    # ',-.' inside the class is a range covering exactly ',', '-' and '.'.
    text = re.sub('[!"#$%&\'()*+,-./:;<=>?@\\^`{|}~]', '', text)
    # A lone underscore between whitespace carries no information.
    text = re.sub(r'\s_\s', ' ', text)
    text = re.sub('\r+|\n+|\u0085+|\u2028+|\u2029+', ' ', text) # Replace newlines
    text = re.sub(r'\s+|\u00A0+', ' ', text) # Replace multiple spaces w/ single

    # split() without an argument drops the empty strings that split(' ')
    # produced whenever the cleaned text started or ended with a space.
    return text.split()

In [3]:
# Path of the notes CSV that the loading cells open with csv.reader.
NOTEEVENTS = "../data/NOTEEVENTS.csv"

In [None]:
# Tokenise the last column (the note text) of every row in NOTEEVENTS.
notes = []
# newline="" is what the csv module asks for, so that line breaks inside
# quoted note text reach the parser unchanged.
with open(NOTEEVENTS, "r", newline="") as file:
    reader = csv.reader(file)
    # The file starts with a header row (pandas reads column names from it
    # further down); skip it so the column name is not tokenised as a note.
    next(reader, None)
    for row in reader:
        notes.append(preprocess_text(row[-1]))

In [4]:
import gensim
from gensim.models.fasttext import FastText
from timeit import default_timer as timer

In [None]:
# Untrained FastText model: 300-dimensional vectors, context window of 10,
# words occurring fewer than 5 times are ignored.
model = FastText(size=300, window=10, min_count=5)

In [None]:
# Build the vocabulary from the tokenised notes. The name `lines` is not
# defined anywhere in this notebook; `notes` is the corpus produced by the
# loading cell.
model.build_vocab(notes)

In [None]:
# Show the model's summary line.
print(model)

In [None]:
start = timer()

# Train on the tokenised notes for 5 epochs. The name `lines` is not defined
# anywhere in this notebook; `notes` is the corpus produced by the loading cell.
model.train(notes, total_examples=len(notes), epochs=5)

end = timer()
print(end - start)  # elapsed wall-clock time in seconds

In [None]:
# Show the model's summary line again.
print(model)

In [None]:
# Persist the model to disk under the name 'saved_model_gensim2'.
model.save('saved_model_gensim2')

In [None]:
# Sanity check of the embeddings: ask which of the four terms is the odd one out.
model.wv.doesnt_match(['cipro', 'dexamethasone', 'radiology', 'sitagliptin'])

In [None]:
# Persist the tokenised notes: one note per CSV row, one token per field.
# newline="" lets the csv writer control line endings itself; without it an
# extra "\r" is written on platforms that translate "\n" on output.
with open("processed_text.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(notes)

In [7]:
def clean_string1(text):
    """
    Lowercase a token and normalise its separators.

    Surrounding whitespace is removed, the text is lowercased, every hyphen,
    period and space becomes an underscore, and trailing underscores are
    dropped.
    """
    lowered = text.strip().lower()
    underscored = lowered.translate(str.maketrans({'-': '_', '.': '_', ' ': '_'}))
    return underscored.rstrip('_')

def preprocess_text2(query):
    """
    Tokenise a piece of text into cleaned, lowercased words.

    Numbers are replaced with ``[NUM]`` and de-identification tags of the
    form ``[**...**]`` with ``[PHI]``; punctuation other than square
    brackets and underscores is deleted; the text is split on whitespace
    and every word is passed through ``clean_string1`` (which lowercases
    it, so the placeholders come out as ``[num]`` / ``[phi]``).

    Parameters
    ----------
    query : str
        Raw text.

    Returns
    -------
    list of str
        The cleaned words, in order.
    """
    # The decimal alternative must come first: alternation is tried left to
    # right, so with `\d+` first "98.6" was turned into two placeholders.
    query = re.sub(r'\d+\.\d+|\d+', '[NUM]', query)
    query = re.sub(r'(\[\*\*.*?\*\*\])', '[PHI]', query)
    # Every character the original stripped from the ends (quotes, '?',
    # parentheses, ':') is in this class and is deleted everywhere, so the
    # separate strip() chain was redundant and has been dropped.
    # ',-.' inside the class is a range covering exactly ',', '-' and '.'.
    query = re.sub('[!"#$%&\'()*+,-./:;<=>?@\\^`{|}~]', '', query)
    return [clean_string1(word) for word in query.split()]

In [None]:
# Tokenise the last column (the note text) of every row in NOTEEVENTS.
notes2 = []
# newline="" is what the csv module asks for, so that line breaks inside
# quoted note text reach the parser unchanged.
with open(NOTEEVENTS, "r", newline="") as file:
    reader = csv.reader(file)
    # The file starts with a header row (pandas reads column names from it
    # further down); skip it so the column name is not tokenised as a note.
    next(reader, None)
    for row in reader:
        notes2.append(preprocess_text2(row[-1]))

In [None]:
# Persist the tokenised notes: one note per CSV row, one token per field.
# newline="" lets the csv writer control line endings itself; without it an
# extra "\r" is written on platforms that translate "\n" on output.
with open("processed_text.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(notes2)

In [None]:
# Reload the tokenised notes from processed_text.csv: each CSV row becomes
# one list of tokens. newline="" is the mode the csv module expects for the
# file objects it reads from.
with open("processed_text.csv", "r", newline="") as f:
    notes2 = list(csv.reader(f))

In [None]:
# Display the entry at index 2 as a spot check.
notes2[2]

## FastText model

In [5]:
import gensim
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from timeit import default_timer as timer

In [None]:
# Untrained FastText model: 300-dimensional vectors, context window of 10,
# words occurring fewer than 5 times are ignored.
ft_model = FastText(size=300, window=10, min_count=5)

In [None]:
# Build the FastText vocabulary from the tokenised notes, then report how
# long that took (seconds) followed by the model summary.
start = timer()
ft_model.build_vocab(notes2)
end = timer()

elapsed = end - start
print(elapsed)
print(ft_model)

In [None]:
# Train the FastText model for 5 epochs over the tokenised notes and print
# the elapsed wall-clock time in seconds.
start = timer()
ft_model.train(notes2, total_examples=len(notes2), epochs=5)
end = timer()

elapsed = end - start
print(elapsed)

In [None]:
# Persist the FastText model to disk under the name 'fasttext_embeddings'.
ft_model.save('fasttext_embeddings')

## Word2Vec model

In [None]:
# Untrained Word2Vec model with the same size/window/min_count settings as
# the FastText model; sg=1 selects the skip-gram training algorithm.
w2v_model = Word2Vec(size=300, window=10, min_count=5, sg=1)

In [None]:
start = timer()

# Build the Word2Vec vocabulary from the tokenised notes.
w2v_model.build_vocab(notes2)

end = timer()
print(end-start)  # elapsed wall-clock time in seconds
# Summarise the model that was just built (the original printed ft_model
# here, a copy-paste slip from the FastText section).
print(w2v_model)

In [None]:
# Train the Word2Vec model for 5 epochs over the tokenised notes and print
# the elapsed wall-clock time in seconds.
start = timer()
w2v_model.train(notes2, total_examples=len(notes2), epochs=5)
end = timer()

elapsed = end - start
print(elapsed)

In [None]:
# Persist the Word2Vec model to disk under the name 'w2v_embeddings'.
w2v_model.save('w2v_embeddings')

## Preparing input data/fake labels

In [67]:
# Load the saved Word2Vec model from disk; the commented-out line loads the
# saved FastText model instead.
word_vectors = Word2Vec.load('w2v_embeddings')
# word_vectors = FastText.load('fasttext_embeddings')

  app.launch_new_instance()


In [71]:
# Embedding width, taken as the second dimension of the loaded vector matrix.
embedding_dim = word_vectors.wv.vectors.shape[1]

In [8]:
# Tokenise the text of the discharge summaries only.
input_notes = []
# newline="" is what the csv module asks for, so that line breaks inside
# quoted note text reach the parser unchanged.
with open(NOTEEVENTS, "r", newline="") as file:
    reader = csv.reader(file)
    for row in reader:
        # Field 6 is the note category; the note text is the last field.
        if row[6] == 'Discharge summary':
            input_notes.append(preprocess_text2(row[-1]))

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed so the fake labels and the train/test split are the same on
# every run of the notebook.
SEED = 42
np.random.seed(SEED)

# for local testing subset notes to 5000 (slicing already makes a copy)
subset_notes = input_notes[:5000]

N = len(subset_notes)
# Create fake labels: one row per note, one column per entry of p; column j
# is an independent 0/1 draw that is 1 with probability p[j].
p = [0.4, 0.28, 0.22, 0.19, 0.16]
y = np.random.binomial(1, p, size=(N, len(p)))
# Create 70/30 train/test split
x_train, x_test, y_train, y_test = train_test_split(
    subset_notes, y, test_size=0.3, random_state=SEED
)

In [38]:
# Token count of the longest note in the subset (0 when the subset is empty).
MAXLEN = max((len(note) for note in subset_notes), default=0)

In [78]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap passed to the tokenizer. The original first assigned
# len(w2v_model.wv.vocab) and overwrote it on the next line; that dead
# store only made the cell depend on w2v_model being in the kernel.
max_words = 15000
token = Tokenizer(max_words)
token.fit_on_texts(subset_notes)
# Number of rows of the embedding matrix built from this vocabulary.
vocab_size = max_words + 1

sequences = token.texts_to_sequences(x_train)
test_sequences = token.texts_to_sequences(x_test)
## Convert to sequences ##
# Every note is padded or truncated to seq_len token ids.
seq_len = 3000
X = pad_sequences(sequences, maxlen=seq_len)
X_test = pad_sequences(test_sequences, maxlen=seq_len)

In [79]:
# Map every tokenizer word that has a pretrained vector to that vector.
# The vectors are read from `word_vectors`, the model loaded from disk at the
# start of this section and the one `embedding_dim` is derived from, rather
# than from `w2v_model`, which exists only if the training cells were run
# in the same kernel session.
embeddings_index = {}
vocab = token.word_index.keys()
for word in vocab:
    if word in word_vectors.wv.vocab:
        coefs = np.asarray(word_vectors.wv[word], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 46242 word vectors.


In [80]:
# Row i of the matrix receives the pretrained vector of the word whose
# tokenizer index is i; every other row keeps its initial zeros.
word_index = token.word_index
embedding_matrix = np.zeros((vocab_size , embedding_dim))
for word, i in word_index.items():
    if i >= vocab_size:
        # index falls outside the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [81]:
# Number of label columns, read from the second dimension of y_test.
num_classes = y_test.shape[1]

In [None]:
from keras.optimizers import Adam
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout, Embedding, Input, Dropout, Bidirectional, GaussianNoise
from keras.layers.recurrent import LSTM, GRU

## Build the model ##
# Named `inputs` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
inputs = Input(shape=(seq_len,))
# Embedding layer initialised from the pretrained matrix and kept frozen.
x = Embedding(input_dim = vocab_size , output_dim = embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
x = GaussianNoise(0.75)(x)
# Two stacked bidirectional GRUs; the first returns the whole sequence so the
# second has a sequence to consume.
x = Bidirectional(GRU(units = 128, recurrent_dropout=0.2, dropout=0.2, activation = 'relu', return_sequences=True))(x)
x = Bidirectional(GRU(units = 128, recurrent_dropout=0.2, dropout=0.2, activation = 'relu'))(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# One sigmoid unit per label, paired with binary cross-entropy below.
x = Dense(num_classes, activation='sigmoid')(x)
model = Model(inputs,x)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X,y_train,epochs=25, batch_size = 128)

## Label creation

In [1]:
# Import all libraries needed for the tutorial

# General syntax to import specific functions in a library: 
##from (library) import (specific library function)
from pandas import DataFrame, read_csv

# General syntax to import a library but no functions: 
##import (library) as (give the library a nickname/alias)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import numpy as np


# Enable inline plotting
%matplotlib inline

In [3]:
#read in the two csv's

# ICD9_CODE is read as text: the codes are matched against string literals
# further down, and a numeric dtype would make those comparisons miss and
# would strip any leading zeros.
df1 = pd.read_csv("DIAGNOSES_ICD.csv", dtype={"ICD9_CODE": str})
df2 = pd.read_csv("NOTEEVENTS.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# record the top codes in a list
# (kept as strings: they are matched against the ICD9_CODE column and are
# also used to select the label columns later on)

codes = ["4019","4280","42731", "41401", "5849"]

In [5]:
# Keep only the diagnosis rows whose ICD-9 code is one of the top codes and
# one-hot encode that code (one indicator column per code).
df1_reduced = df1.loc[df1["ICD9_CODE"].isin(codes)]
dummy = pd.get_dummies(df1_reduced['ICD9_CODE'])

# Put the indicator columns next to the diagnosis rows they were derived from.
dummy_combined = pd.concat([df1_reduced, dummy], axis=1)

# Drop the columns that are not used afterwards and preview the result.
dummy_combined_reduced = dummy_combined.drop(
    columns=['ROW_ID', 'SUBJECT_ID', 'SEQ_NUM', 'ICD9_CODE']
)
dummy_combined_reduced.head(15)

Unnamed: 0,HADM_ID,4019,41401,42731,4280,5849
31,174105,0,1,0,0,0
36,178393,0,1,0,0,0
51,114585,0,0,0,1,0
53,114585,1,0,0,0,0
67,140784,0,0,0,1,0
70,140784,1,0,0,0,0
75,164853,0,0,0,0,1
89,164853,1,0,0,0,0
111,195632,0,1,0,0,0
112,195632,1,0,0,0,0


In [6]:
#now collapse to a single row per HADM_ID
# An admission can carry several of the top codes, each on its own row with
# a single indicator set. Taking the per-column maximum keeps every label of
# the admission; drop_duplicates(keep="first") kept only the first row and
# silently discarded the others.
dcr_final = dummy_combined_reduced.groupby("HADM_ID", as_index=False).max()

#now join the two tables together 
df_final=pd.merge(df2,dcr_final,left_on="HADM_ID", right_on='HADM_ID',how='left')

In [7]:
#make all NaN in dummy columns zero
df_final[codes] = df_final[codes].fillna(0)

#filter category to just be discharge summary
df_final = df_final[df_final.CATEGORY == 'Discharge summary']

# removed any hadmid that have more than one entry in database
df_final = df_final.drop_duplicates(subset = "HADM_ID", keep = False)

#random sample of the data
# Seeded so the same rows are drawn on every run; the size is capped at the
# number of available rows because sample() raises when asked for more.
sub_df_final = df_final.sample(n=min(1000, len(df_final)), random_state=42)


In [8]:
# Write the sampled rows to top_icd9.csv (the index is written as well,
# since index=False is not passed).
sub_df_final.to_csv("top_icd9.csv")

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,4019,41401,42731,4280,5849
11947,15610,3133,135744,2173-08-11,,,Discharge summary,Report,,,Admission Date: [**2173-7-5**] Discharge ...,0.0,0.0,0.0,0.0,0.0
29575,38527,87410,174275,2194-10-01,,,Discharge summary,Report,,,Admission Date: [**2194-9-28**] ...,1.0,0.0,0.0,0.0,0.0
40143,35583,46267,113271,2146-04-11,,,Discharge summary,Report,,,Admission Date: [**2146-4-9**] D...,0.0,0.0,0.0,0.0,0.0
11675,16434,52875,168988,2183-03-04,,,Discharge summary,Report,,,Admission Date: [**2183-2-26**] ...,0.0,0.0,0.0,0.0,1.0
1525,1490,29334,119868,2169-06-09,,,Discharge summary,Report,,,Admission Date: [**2169-6-2**] D...,0.0,0.0,1.0,0.0,0.0
5293,5065,23725,160422,2157-09-20,,,Discharge summary,Report,,,Admission Date: [**2157-9-12**] ...,0.0,0.0,0.0,0.0,0.0
16472,24391,8980,101170,2106-02-26,,,Discharge summary,Report,,,Admission Date: [**2106-2-5**] D...,1.0,0.0,0.0,0.0,0.0
13844,14938,1646,170199,2160-08-01,,,Discharge summary,Report,,,Admission Date: [**2160-7-28**] Discharge...,0.0,1.0,0.0,0.0,0.0
35929,30682,23168,123395,2130-04-11,,,Discharge summary,Report,,,Admission Date: [**2130-4-11**] Dischar...,0.0,0.0,0.0,0.0,0.0
7265,7184,47956,154518,2167-05-03,,,Discharge summary,Report,,,Admission Date: [**2167-4-28**] ...,0.0,0.0,0.0,0.0,0.0
