In [1]:
import csv
import re
import string

In [2]:
from timeit import default_timer as timer

In [None]:
def preprocess_text(text):
    """
    Lower-case a raw note, mask numbers and PHI tags, strip punctuation and
    return the note as a list of tokens.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    list of str
        Tokens in their original order. Numbers are replaced by ``[NUM]`` and
        ``[**...**]`` PHI tags by ``[PHI]``. Empty tokens are never returned,
        so an empty or punctuation-only note yields ``[]``.
    """
    text = text.strip().lower()

    # The decimal alternative must come first: with the integer alternative
    # first, "98.6" was matched as two numbers and ended up as "[NUM][NUM]".
    text = re.sub(r'\d+\.\d+|\d+', '[NUM]', text)

    # PHI tags are in format [**INFORMATION**]
    text = re.sub(r'\[\*\*.*?\*\*\]', '[PHI]', text)

    # Put rule-like runs of dashes/underscores on their own line.
    text = re.sub(r'---+', '\n\n-----\n\n', text)
    text = re.sub(r'___+', '\n\n_____\n\n', text)
    # Collapse a run of sentence punctuation down to its last character.
    text = re.sub(r'[?.!]+(?=[?.!])', '', text)
    text = re.sub(r'\+', ' ', text)
    # Hyphens and slashes become a single underscore joiner.
    text = re.sub(r'[-/]', '_', text)
    text = re.sub(r'_+', '_', text)
    text = re.sub(r'[*()]', ' ', text)
    # Drop the remaining punctuation. '[', ']' and '_' are not in this set, so
    # the [NUM]/[PHI] placeholders and underscore-joined words survive.
    text = re.sub('[' + re.escape('!"#$%&\'()*+,-./:;<=>?@^`{|}~') + ']', '', text)
    # A lone underscore surrounded by whitespace is dropped.
    text = re.sub(r'\s_\s', ' ', text)

    # split() with no argument splits on any run of whitespace (newlines and
    # Unicode separators included) and never yields empty strings, which
    # split(' ') did for leading/trailing spaces and for empty input.
    return text.split()

In [3]:
# Relative path of the NOTEEVENTS CSV file that the note-loading cells open.
NOTEEVENTS = "../data/NOTEEVENTS.csv"

In [None]:
# Tokenise the last field (the note text) of every data row in NOTEEVENTS.
# newline="" lets the csv module itself handle line breaks inside quoted fields.
with open(NOTEEVENTS, "r", newline="") as file:
    reader = csv.reader(file)
    # Skip the header row so the column name is not tokenised as if it were a note.
    next(reader, None)
    notes = [preprocess_text(row[-1]) for row in reader]

In [4]:
import gensim
from gensim.models.fasttext import FastText
from timeit import default_timer as timer

In [None]:
# Untrained FastText model; its vocabulary is built and it is trained in the following cells.
model = FastText(size=300, window=10, min_count=5)

In [None]:
# `lines` is never defined in this notebook; the tokenised corpus is `notes`.
model.build_vocab(notes)

In [None]:
# Print the model's string summary after the vocabulary step.
print(model)

In [None]:
# Time the training pass over the tokenised notes.
start = timer()

# `lines` is never defined in this notebook; train on the `notes` corpus.
model.train(notes, total_examples=len(notes), epochs=5)

end = timer()
print(end - start)

In [None]:
# Print the model's string summary again, this time after training.
print(model)

In [None]:
# Persist the trained model under the name 'saved_model_gensim2'.
model.save('saved_model_gensim2')

In [None]:
# Sanity check of the embeddings: ask which of the four terms the model considers the odd one out.
model.wv.doesnt_match(['cipro', 'dexamethasone', 'radiology', 'sitagliptin'])

In [None]:
# Write one CSV row per note, one token per field.
# newline="" is what the csv module requires for writer file objects; without
# it, platforms that translate "\n" emit an extra blank line after every row.
with open("processed_text.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(notes)

In [7]:
def clean_string1(text):
    """
    Normalise a single token.

    Trims surrounding whitespace, lower-cases the token, replaces '-', '.' and
    spaces with underscores and finally removes trailing underscores.
    """
    text = text.strip().lower()
    for separator in ('-', '.', ' '):
        text = text.replace(separator, '_')
    return text.rstrip('_')


def preprocess_text2(query):
    """
    Mask numbers and PHI tags in a note, strip punctuation and tokenise it.

    Parameters
    ----------
    query : str
        Raw note text.

    Returns
    -------
    list of str
        Whitespace-delimited tokens, each passed through ``clean_string1``
        (so the placeholders come out lower-cased as ``[num]`` / ``[phi]``).
        Tokens that are empty after cleaning are dropped.
    """
    # The decimal alternative must come first: with the integer alternative
    # first, "98.6" was matched as two numbers and ended up as "[num][num]".
    query = re.sub(r'\d+\.\d+|\d+', '[NUM]', query)
    # PHI tags are in format [**INFORMATION**]
    query = re.sub(r'\[\*\*.*?\*\*\]', '[PHI]', query)
    # Remove punctuation everywhere. This set already contains every character
    # the former chain of .strip() calls removed from the ends, so that chain
    # was redundant. '[', ']' and '_' are not in the set and are kept.
    query = re.sub('[' + re.escape('!"#$%&\'()*+,-./:;<=>?@^`{|}~') + ']', '', query)
    cleaned = (clean_string1(word) for word in query.split())
    # Underscore-only tokens (e.g. "___" rules) clean down to '' - skip them.
    return [word for word in cleaned if word]

In [None]:
# Tokenise the last field (the note text) of every data row with preprocess_text2.
# newline="" lets the csv module itself handle line breaks inside quoted fields.
with open(NOTEEVENTS, "r", newline="") as file:
    reader = csv.reader(file)
    # Skip the header row so the column name is not tokenised as if it were a note.
    next(reader, None)
    notes2 = [preprocess_text2(row[-1]) for row in reader]

In [None]:
# Write one CSV row per note, one token per field.
# newline="" is what the csv module requires for writer file objects; without
# it, platforms that translate "\n" emit an extra blank line after every row.
with open("processed_text.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(notes2)

In [None]:
# Reload the tokenised notes: every CSV row is one note, every field one token.
# newline="" is the mode the csv module expects for reader file objects.
with open("processed_text.csv", "r", newline="") as f:
    notes2 = list(csv.reader(f))

In [None]:
# Display the token list stored at index 2 as a spot check.
notes2[2]

## FastText model

In [5]:
import gensim
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from timeit import default_timer as timer

In [None]:
# Untrained FastText model; its vocabulary is built and it is trained in the following cells.
ft_model = FastText(size=300, window=10, min_count=5)

In [None]:
# Build the FastText vocabulary from the tokenised notes and report how long it took.
start = timer()

ft_model.build_vocab(notes2)

end = timer()
# Elapsed time as returned by timer(), followed by the model summary.
print(end-start)
print(ft_model)

In [None]:
# Train the FastText model for 5 epochs over notes2 and report how long it took.
start = timer()

ft_model.train(notes2, total_examples=len(notes2), epochs=5)

end = timer()
print(end - start)

In [None]:
# Persist the trained FastText model under the name 'fasttext_embeddings'.
ft_model.save('fasttext_embeddings')

## Word2Vec model

In [None]:
# Untrained Word2Vec model with the same size/window/min_count as ft_model, plus sg=1.
w2v_model = Word2Vec(size=300, window=10, min_count=5, sg=1)

In [None]:
# Build the Word2Vec vocabulary from the tokenised notes and report how long it took.
start = timer()

w2v_model.build_vocab(notes2)

end = timer()
print(end-start)
# Summarise the model that was just updated (this cell used to print ft_model).
print(w2v_model)

In [None]:
# Train the Word2Vec model for 5 epochs over notes2 and report how long it took.
start = timer()

w2v_model.train(notes2, total_examples=len(notes2), epochs=5)

end = timer()
print(end - start)

In [None]:
# Persist the trained Word2Vec model under the name 'w2v_embeddings'.
w2v_model.save('w2v_embeddings')

## Preparing input data/fake labels

In [67]:
# Load the saved Word2Vec model from disk.
word_vectors = Word2Vec.load('w2v_embeddings')
# Alternative: load the saved FastText model instead.
# word_vectors = FastText.load('fasttext_embeddings')

  app.launch_new_instance()


In [71]:
# Width of one word vector, read from the second axis of the loaded vectors array.
embedding_dim = word_vectors.wv.vectors.shape[1]

In [8]:
# Tokenise only the rows whose field 6 equals 'Discharge summary'.
# newline="" lets the csv module itself handle line breaks inside quoted fields.
with open(NOTEEVENTS, "r", newline="") as file:
    input_notes = [
        preprocess_text2(row[-1])
        for row in csv.reader(file)
        if row[6] == 'Discharge summary'
    ]

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split

# for local testing subset notes to 5000 (slicing already returns a new list)
subset_notes = input_notes[:5000]

N = len(subset_notes)
# Seed both the label draw and the split so re-running the notebook
# reproduces the same data; previously every run produced different labels.
SEED = 42
rng = np.random.RandomState(SEED)
# Create fake labels: one independent Bernoulli draw per (note, class),
# with p[j] the positive rate of class j. Result has shape (N, len(p)).
p = [0.4, 0.28, 0.22, 0.19, 0.16]
y = rng.binomial(1, p, size=(N, len(p)))
# Create 70/30 train/test split
x_train, x_test, y_train, y_test = train_test_split(
    subset_notes, y, test_size=0.3, random_state=SEED
)

In [38]:
# Token count of the longest note in the subset (0 when the subset is empty).
MAXLEN = max((len(note) for note in subset_notes), default=0)

In [78]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the Tokenizer as num_words. The former
# `max_words = len(w2v_model.wv.vocab)` line was overwritten immediately and
# only made this cell fail when w2v_model had not been trained in the session.
max_words = 15000
token = Tokenizer(num_words=max_words)
token.fit_on_texts(subset_notes)
# Number of rows allocated for the embedding matrix built further down.
vocab_size = max_words + 1

sequences = token.texts_to_sequences(x_train)
test_sequences = token.texts_to_sequences(x_test)
## Convert to sequences ##
# Every note is padded/truncated to seq_len token ids.
seq_len = 3000
X = pad_sequences(sequences, maxlen=seq_len)
X_test = pad_sequences(test_sequences, maxlen=seq_len)

In [79]:
# Map every tokenizer word that has a pretrained vector to that vector.
# Reads from `word_vectors` (the model loaded from disk, which also defines
# embedding_dim) rather than `w2v_model`, which only exists when the training
# cells were run in the current kernel session.
embeddings_index = {}
vocab = token.word_index.keys()
for word in vocab:
    if word in word_vectors.wv.vocab:
        embeddings_index[word] = np.asarray(word_vectors.wv[word], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 46242 word vectors.


In [80]:
word_index = token.word_index
# Row r holds the pretrained vector of the word with tokenizer index r.
# Rows for words without a pretrained vector stay all-zero, and words whose
# index is >= vocab_size are ignored.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, row in word_index.items():
    if row >= vocab_size:
        continue
    vector = embeddings_index.get(word)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [81]:
# Number of label columns, taken from the second axis of y_test.
num_classes = y_test.shape[1]

In [None]:
from keras.optimizers import Adam
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout, Embedding, Input, Dropout, Bidirectional, GaussianNoise
from keras.layers.recurrent import LSTM, GRU

## Build the model ##
# Named `note_input` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
note_input = Input(shape=(seq_len,))
# Embedding layer initialised from embedding_matrix and kept frozen.
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(note_input)
x = GaussianNoise(0.75)(x)
# Two stacked bidirectional GRUs; the first returns the whole sequence so the
# second one receives a sequence as input.
x = Bidirectional(GRU(units=128, recurrent_dropout=0.2, dropout=0.2, activation='relu', return_sequences=True))(x)
x = Bidirectional(GRU(units=128, recurrent_dropout=0.2, dropout=0.2, activation='relu'))(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# One sigmoid output per label column, trained with binary cross-entropy.
x = Dense(num_classes, activation='sigmoid')(x)
model = Model(note_input, x)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y_train, epochs=25, batch_size=128)

## Label creation

In [84]:
#Chris Hilger's code

# Import all libraries needed for the tutorial

# General syntax to import specific functions in a library: 
##from (library) import (specific library function)
from pandas import DataFrame, read_csv

# General syntax to import a library but no functions: 
##import (library) as (give the library a nickname/alias)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import numpy as np


# Enable inline plotting
%matplotlib inline

df1 = pd.read_csv("../data/DIAGNOSES_ICD.csv")
df2 = pd.read_csv("../data/NOTEEVENTS.csv")

# record the top codes in a list
codes = ["4019", "4280", "42731", "41401", "5849"]

# make reduced version of df1 to only contain rows that have top icd9 codes
df1_reduced = df1[df1["ICD9_CODE"].isin(codes)]

# one 0/1 indicator column per code present in the reduced frame
dummy = pd.get_dummies(df1_reduced['ICD9_CODE']).astype(int)

# combine the dummy df with the reduced df1 matrix
dummy_combined = pd.concat([df1_reduced, dummy], axis=1)

# now drop unused columns
dummy_combined_reduced = dummy_combined.drop(['ROW_ID', 'SUBJECT_ID', 'SEQ_NUM', 'ICD9_CODE'], axis=1)

# Collapse to a single row per HADM_ID. Taking the max of each indicator keeps
# every top code recorded for the admission; drop_duplicates(keep="first")
# kept only the first diagnosis row and silently lost the other labels.
dcr_final = dummy_combined_reduced.groupby("HADM_ID", as_index=False).max()

# now join the two tables together
df_final = pd.merge(df2, dcr_final, on="HADM_ID", how='left')

# remove notes whose admission has none of the top codes: after the left join
# their label columns are NaN. notna() works whatever dtype the labels have.
df_final = df_final[df_final['4019'].notna()]


#df_final.to_csv("top_icd9.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [126]:
# Build the indicator columns for the selected codes directly, instead of
# one-hot encoding every distinct ICD9_CODE and then keeping only these few.
dummy = pd.DataFrame(
    {code: df1['ICD9_CODE'].eq(code).astype(int) for code in codes},
    index=df1.index,
)
dummy_combined = pd.concat([df1[['HADM_ID']], dummy], axis=1)
# One row per admission. Only the label columns are aggregated, so the
# non-numeric ICD9_CODE column never enters the reduction, and max (rather
# than sum) keeps each label in {0, 1} even if a code is listed twice.
dummy_combined = dummy_combined.groupby('HADM_ID', as_index=False)[codes].max()

In [135]:
# Preview the first rows of the per-admission label table.
dummy_combined.head()

Unnamed: 0,HADM_ID,4019,4280,42731,41401,5849
0,100001,0,0,0,0,1
1,100003,1,0,0,0,0
2,100006,0,0,0,0,0
3,100007,1,0,0,0,0
4,100009,1,0,0,1,0


In [132]:
# Number of tokenised discharge-summary notes collected in input_notes.
len(input_notes)

59652

In [None]:
# Tokenise only the rows whose field 6 equals 'Discharge summary'.
# newline="" lets the csv module itself handle line breaks inside quoted fields.
with open(NOTEEVENTS, "r", newline="") as file:
    input_notes = [
        preprocess_text2(row[-1])
        for row in csv.reader(file)
        if row[6] == 'Discharge summary'
    ]

In [141]:
# Rows of df2 whose CATEGORY is 'Discharge summary'.
discharge_df = df2[df2['CATEGORY'] == 'Discharge summary']

In [155]:
# Number of discharge-summary rows left when only the last row per HADM_ID is kept.
len(discharge_df.drop_duplicates(subset = "HADM_ID", keep = "last"))

52726

In [156]:
# Number of rows in the per-admission label table, for comparison with the note count.
len(dummy_combined)

58976

In [160]:
# Discharge-summary rows whose HADM_ID already appeared in an earlier row.
discharge_df[discharge_df.duplicated('HADM_ID')]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
13,224,5350,169684.0,2143-04-30,,,Discharge summary,Report,,,Admission Date: [**2143-4-25**] Discharge...
21,232,9805,177212.0,2131-07-23,,,Discharge summary,Report,,,Admission Date: [**2131-7-5**] D...
35,246,710,114242.0,2182-02-28,,,Discharge summary,Report,,,Admission Date: [**2182-2-18**] Dischar...
55,202,4127,167565.0,2193-06-10,,,Discharge summary,Report,,,Admission Date: [**2193-6-9**] Discharg...
60,207,5239,129387.0,2189-02-26,,,Discharge summary,Report,,,Admission Date: [**2189-2-18**] Dischar...
62,209,5239,125055.0,2189-03-27,,,Discharge summary,Report,,,Admission Date: [**2189-3-17**] Dischar...
87,12,60614,116703.0,2175-10-04,,,Discharge summary,Report,,,Admission Date: [**2175-9-29**] ...
106,31,25995,152664.0,2128-05-07,,,Discharge summary,Report,,,Admission Date: [**2128-5-5**] Discharg...
141,66,3319,159001.0,2157-04-01,,,Discharge summary,Report,,,Admission Date: [**2157-3-20**] Dischar...
156,81,3506,140045.0,2195-05-28,,,Discharge summary,Report,,,Admission Date: [**2195-2-5**] Discharg...


In [162]:
# Show the TEXT of the discharge-summary row at position 441.
discharge_df.iloc[441]['TEXT']

"Admission Date:  [**2123-12-31**]       Discharge Date:  [**2124-1-10**]\n\nDate of Birth:   [**2085-3-7**]       Sex:  M\n\nService:  Medicine\n\nADDENDUM:  The patient is a 38 year old [**Country 4574**] male with\nAIDS, left upper lobe aspergilloma and lower extremity\nparaparesis, who was originally admitted on [**2123-10-26**] with fever and cough.  He was subsequently found to have\nan left upper lobe aspergilloma which was initially treated\nwith amphotericin, which led to the patient having seizures.\nHe was then placed in a phenobarbital coma, which slowly\nresolved and was started on itraconazole therapy.  Please\nrefer to the dictation summary dictated on [**2124-1-5**],\ndictated by Dr. [**First Name8 (NamePattern2) **] [**Last Name (NamePattern1) **].\n\n1.  Neurologic:  On [**2123-12-29**], the patient began\ncomplaining of dizziness.  He did not describe a room\nspinning sensation.  He felt like his head was falling to the\nside, but no particular side, consistently.  H

In [163]:
# Show the TEXT of the preceding row (position 440) for comparison.
discharge_df.iloc[440]['TEXT']

'Admission Date:  [**2123-10-27**]     Discharge Date:\n\nDate of Birth:   [**2085-3-7**]     Sex:  M\n\nService:\n\nHISTORY OF PRESENT ILLNESS:  This is a 38-year-old man with\nAIDS referred by Dr. [**First Name8 (NamePattern2) **] [**Last Name (NamePattern1) 4569**] who has fevers to greater\nthan 104, likely pneumonia or other pulmonary process\nincreasing for one month.  He reports increased cough,\nusually nonproductive, but occasional production of bloody\nsputum.  In addition, he has some dark stool which he states\nis maroon in color in the last few weeks as well as nausea\nand vomiting.  He states that sometimes he vomits blood.\nReports left upper quadrant pain times one month with eating.\nDenies dyspnea or chest pain.  He states some pain in his\nchest with cough only and that\'s resolved, mild headache like\na hot plate on his forehead, mild neck pain, positive urinary\nfrequency and dysuria times weeks.  Today, he has had\ndiarrhea, 30 minutes after meals.  He states he h