In [1]:
import numpy as np
import os
import pandas as pd
import re
import time
import sys
import torch
import torch.nn as nn

from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from torch.utils.data import Dataset

In [34]:
embedding_vector_size = 100
num_study_docs = 1341

In [3]:
t1 = time.time()
clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")
# CHARTDATE and CHARTTIME apparrently have mixed types, but that does not matter for us
t2 = time.time()
print(t2 - t1)

  clinical_notes_df = pd.read_csv("NOTEEVENTS.csv")


78.38705682754517


In [4]:
print(clinical_notes_df.shape)
print(clinical_notes_df.head(1))

(2083180, 11)
   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME   
0     174       22532  167853.0  2151-08-04       NaN       NaN  \

            CATEGORY DESCRIPTION  CGID  ISERROR   
0  Discharge summary      Report   NaN      NaN  \

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  


In [5]:
annotations_df = pd.read_csv("annotations.csv")

In [6]:
annotations_df.shape

(1610, 18)

In [7]:
annotations_df.head(1)

Unnamed: 0,Hospital.Admission.ID,subject.id,chart.time,cohort,Obesity,Non.Adherence,Developmental.Delay.Retardation,Advanced.Heart.Disease,Advanced.Lung.Disease,Schizophrenia.and.other.Psychiatric.Disorders,Alcohol.Abuse,Other.Substance.Abuse,Chronic.Pain.Fibromyalgia,Chronic.Neurological.Dystrophies,Advanced.Cancer,Depression,Dementia,Unsure
0,118003,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [8]:
# Get all documents wtih a combination of hospital admission ID and subject ID matching one
# used in the study
merged_df = pd.merge(annotations_df, clinical_notes_df, left_on = ["Hospital.Admission.ID", "subject.id"], right_on = ["HADM_ID", "SUBJECT_ID"])

In [9]:
merged_df.shape

(56839, 29)

In [10]:
merged_df["CATEGORY"].value_counts()

CATEGORY
Nursing/other        21066
Radiology            13135
Nursing               7520
Physician             5059
ECG                   5026
Discharge summary     1976
Respiratory           1155
Echo                  1021
Nutrition              325
General                290
Rehab Services         166
Social Work             64
Case Management         36
Name: count, dtype: int64

In [11]:
# Only the discharge summaries are relevant
merged_df = merged_df[merged_df["CATEGORY"] == "Discharge summary"]

In [12]:
merged_df.shape

(1976, 29)

In [13]:
# Number of unique combinations of hospital admission ID and subject ID
merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'}).shape

(1560, 3)

In [14]:
temp = merged_df.groupby(["HADM_ID", "SUBJECT_ID"]).size().reset_index().rename(columns = {0 : 'count'})

# Number of unique combinations of hospital admission ID and subject ID
# which have more than one discharge summary
print(temp[temp["count"] > 1].shape)

# Number of unique combinations of hospital admission ID and subject ID
# which have only one discharge summary
print(temp[temp["count"] == 1].shape)

(219, 3)
(1341, 3)


In [15]:
# A dataframe containing the hospital admission ID and subject ID for all
# patients used in the study and who have only one discharge summary in MIMIC-III
# We have no way of telling which set of labels corresponds to which discharge summary
# for each such combination of IDs, due to the labels being somewhat mislabeled (the chart.time
# field contains either a copy of the hospital admission ID or 999999 for each set of labels,
# instead of an actual time), so we can only use discharge summaries from these patients
ids_for_non_duplicate = temp[temp["count"] == 1][["HADM_ID", "SUBJECT_ID"]]

In [16]:
# This contains the hospital admission ID, subject ID, discharge summary text, and labels
# for all of the patients we will be using in our study
labelled_corpus_df = pd.merge(merged_df, ids_for_non_duplicate, left_on = ["HADM_ID", "SUBJECT_ID"], right_on = ["HADM_ID", "SUBJECT_ID"])
labelled_corpus_df = labelled_corpus_df[["HADM_ID",
                                         "SUBJECT_ID",
                                         "TEXT",
                                         "Advanced.Cancer",
                                         "Advanced.Heart.Disease",
                                         "Advanced.Lung.Disease",
                                         "Chronic.Neurological.Dystrophies",
                                         "Chronic.Pain.Fibromyalgia",
                                         "Alcohol.Abuse",
                                         "Other.Substance.Abuse",
                                         "Obesity",
                                         "Schizophrenia.and.other.Psychiatric.Disorders",
                                         "Depression"]]

In [17]:
labelled_corpus_df.head(1)

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression
0,118003.0,3644,Admission Date: [**2200-4-7**] Discharge ...,0,0,0,0,1,0,0,0,0,1


In [18]:
labelled_corpus_df.iloc[0]["TEXT"]

"Admission Date:  [**2200-4-7**]     Discharge Date:  [**2200-4-10**]\n\nDate of Birth:   [**2146-9-21**]     Sex:  F\n\nService:  CARDIAC INTENSIVE CARE MEDICINE\n\nCHIEF COMPLAINT:  The patient was admitted to the Cardiac\nIntensive Care Unit Medicine Service on [**2200-4-7**], with the\nchief complaint of acute myocardial infarction and fever.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 53 year old\nwhite female with a history of coronary artery disease,\nhypertension, hypercholesterolemia and two pack per day\ntobacco use with previous coronary artery bypass graft\nsurgery presenting to an outside hospital on [**2200-4-6**], with a\ntwo day history of fevers and confusion.  The patient had a\nCT scan of the chest at that time which revealed pneumonia by\nreport in the left lower lobe.\n\nWhile in the outside hospital Emergency Department, the\npatient complained of chest pain.  The patient states that\nshe has had this pain for approximately two weeks with no\nrelief.  She was

In [19]:
# From provided code
# Did not remove commas??
# Separates "patient's" into "patient 's"??? Purpose???? -> should be caught with n-grams
def clean_str(string):
    """
    Tokenization/string cleaning.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() # We include lower()??? # .lower() word2vec is case sensitive

clean_str(labelled_corpus_df.iloc[0]["TEXT"])

"Admission Date 2200 4 7 Discharge Date 2200 4 10 Date of Birth 2146 9 21 Sex F Service CARDIAC INTENSIVE CARE MEDICINE CHIEF COMPLAINT The patient was admitted to the Cardiac Intensive Care Unit Medicine Service on 2200 4 7 , with the chief complaint of acute myocardial infarction and fever HISTORY OF PRESENT ILLNESS The patient is a 53 year old white female with a history of coronary artery disease , hypertension , hypercholesterolemia and two pack per day tobacco use with previous coronary artery bypass graft surgery presenting to an outside hospital on 2200 4 6 , with a two day history of fevers and confusion The patient had a CT scan of the chest at that time which revealed pneumonia by report in the left lower lobe While in the outside hospital Emergency Department , the patient complained of chest pain The patient states that she has had this pain for approximately two weeks with no relief She was given Levofloxacin for apparent community acquired pneumonia and cardiac enzymes w

In [20]:
# Takes ~25 seconds once, then just 1.6344 seconds when run later?
t1 = time.time()
labelled_corpus_df["Cleaned Text"] = labelled_corpus_df.apply(lambda row : clean_str(row["TEXT"]), axis = 1)
labelled_corpus_df = labelled_corpus_df.drop(['TEXT'], axis=1)
t2 = time.time()
print(t2 - t1)

2.2439024448394775


In [21]:
labelled_corpus_df.iloc[100]["Cleaned Text"]

"Admission Date 2131 3 26 Discharge Date 2131 3 29 Date of Birth 2079 11 1 Sex M Service MEDICINE Allergies Bactrim Levaquin Location ( un ) Juice Attending First Name3 ( LF ) 348 Chief Complaint hypoglycemic episode , cough Major Surgical or Invasive Procedure History of Present Illness Mr Known lastname 7264 is a 51 year old male with past medical history significant for type I diabetes and mental retardation who was brought to ED from his group home after an episode of hypoglycemia with FSG of 37 and repeat FSG of 40 even after having his dinner EMS was called and he was given 12 16 amp of dextrose enroute to Hospital1 18 Per caregivers , patient 's mental status was at usual baseline In the ED , initial vs were T Age over 90 F , P 86 , BP 104 63 , RR 20 , O2 saturation rate is 97 room air Glucose trend in ED included 0030 fs Telephone Fax ( 1 ) 7265 fs Telephone Fax ( 1 ) 7266 fs 173 He also had a fever to 103F , noted cough on exam and tachypnea to mid 30s range No ABG was done in

In [77]:
# Save the dataframe containing the documents of interest and their labels
labelled_corpus_df.to_csv('labelled_corpus_df.csv')

In [22]:
# List of lists, where each internal list contains the words of a document
# Fairly fast
labelled_corpus_lol = [row.split() for row in labelled_corpus_df['Cleaned Text']]

In [23]:
len(labelled_corpus_lol) # 1341 documents

1341

In [24]:
labelled_corpus_lol[0]

['Admission',
 'Date',
 '2200',
 '4',
 '7',
 'Discharge',
 'Date',
 '2200',
 '4',
 '10',
 'Date',
 'of',
 'Birth',
 '2146',
 '9',
 '21',
 'Sex',
 'F',
 'Service',
 'CARDIAC',
 'INTENSIVE',
 'CARE',
 'MEDICINE',
 'CHIEF',
 'COMPLAINT',
 'The',
 'patient',
 'was',
 'admitted',
 'to',
 'the',
 'Cardiac',
 'Intensive',
 'Care',
 'Unit',
 'Medicine',
 'Service',
 'on',
 '2200',
 '4',
 '7',
 ',',
 'with',
 'the',
 'chief',
 'complaint',
 'of',
 'acute',
 'myocardial',
 'infarction',
 'and',
 'fever',
 'HISTORY',
 'OF',
 'PRESENT',
 'ILLNESS',
 'The',
 'patient',
 'is',
 'a',
 '53',
 'year',
 'old',
 'white',
 'female',
 'with',
 'a',
 'history',
 'of',
 'coronary',
 'artery',
 'disease',
 ',',
 'hypertension',
 ',',
 'hypercholesterolemia',
 'and',
 'two',
 'pack',
 'per',
 'day',
 'tobacco',
 'use',
 'with',
 'previous',
 'coronary',
 'artery',
 'bypass',
 'graft',
 'surgery',
 'presenting',
 'to',
 'an',
 'outside',
 'hospital',
 'on',
 '2200',
 '4',
 '6',
 ',',
 'with',
 'a',
 'two',
 'da

In [25]:
# Most hyperparameters specified in journal
# embedding_vector_size wasn't specified -> he said 300 in email
t_start = time.time()
w2v_model = Word2Vec(labelled_corpus_lol,
                     sg=0,
                     window=10,
                     negative=10,
                     min_count=5,
                     epochs=15,
                     vector_size=embedding_vector_size,
                     workers=3)
t_end = time.time()

print(t_end - t_start) # ~45 seconds for embedding_vector_size = 50. 66.4061 seconds for embedding_vector_size = 300. 50.2814359664917 for 100

50.2814359664917


In [26]:
# Just get the vectors
word_vectors = w2v_model.wv
del w2v_model

# These vectors were constructed using only the 1341 documents we are currently using, and not all discharge summaries
word_vectors.save("word_vectors_only_from_study_documents.wordvectors")

In [27]:
# Don't need to run word2vec again
# word_vectors = KeyedVectors.load("word_vectors_only_from_study_documents.wordvectors", mmap='r')

In [28]:
print(word_vectors['alcohol'])

[-3.1913362  -0.7937584  -3.4305966   1.3779067   1.0334337   3.8803656
  2.0882313   1.5921046  -0.9834959   0.05798558 -2.747063    0.26504967
 -0.32764825 -4.9664536   1.2170032  -3.2212317  -0.37966898  1.470269
  2.4975111  -0.3585035   1.5303808  -3.5003257  -1.8195837   6.3704357
  2.0175116   0.5442167   2.562562   -2.0317864  -0.08685263  1.1242979
 -3.2902176   1.9299203   0.84349376 -0.5805749   0.48449987 -0.49465993
 -1.541613    5.019491    1.8745767   1.1560003  -2.1119165  -4.795454
 -3.5808349   1.3001137   2.738111    2.1493132  -0.05681613 -0.592943
  5.17941     0.17885032  0.41830295 -2.6522846  -3.9515848   0.8653915
  1.4369856   0.5619563  -2.2285926  -4.5738487   0.3616729  -5.2329664
  4.8648844  -2.1206367   4.608313   -1.9105221  -5.087372   -2.356315
  0.6225381  -7.5958996   3.5571334   0.44847414 -1.4302235   0.93212014
 -0.50478613  1.5071125  -0.73268026  1.5939337  -2.4759367  -3.1776161
  0.05058745 -1.1722823   2.0113988  -1.5730745  -4.967529   -6.3

In [29]:
word_vectors.most_similar("alcohol", topn = 5)

[('ETOH', 0.7669126987457275),
 ('tobacco', 0.7483139038085938),
 ('illicit', 0.7401039600372314),
 ('substance', 0.7321199774742126),
 ('EtOH', 0.7297239303588867)]

In [30]:
print("Vocabulary Size")
print(len(word_vectors))

Vocabulary Size
15543


In [31]:
"alcohol" in word_vectors

True

In [32]:
"bee" in word_vectors

False

In [35]:
# Fairly fast
# Creates docs_as_tensors, which features each document as a stack of (horizontal) embeddings for each of its words
t1 = time.time()
docs_as_tensors = [None] * num_study_docs # Number of documents we use

i = 0
for doc in labelled_corpus_lol:
    docs_as_tensors[i] = torch.stack([torch.from_numpy(word_vectors[word].copy()) for word in doc if (word in word_vectors)])
    i += 1

    
t2 = time.time()

print(t2 - t1)

print(len(docs_as_tensors))
print(docs_as_tensors[len(docs_as_tensors) - 1].shape)

20.533469438552856
1341
torch.Size([2440, 100])


In [36]:
max_length = 0
for doc_tensor in docs_as_tensors:
    if doc_tensor.shape[0] > max_length:
        max_length = doc_tensor.shape[0]

print(max_length)

5384


In [37]:
# Pad all documents to the maximum length of any of them
# Takes a really long time?
# 466.3814811706543 seconds for vector size of 300
# 6.639481544494629 seconds for vector size of 100

t1 = time.time()

# study_corpus_tensor[i, :, :] will contain the representation of the i-th document
# padded to the length of the longest document
# study_corpus_tensor[i, j, :] will contain the embedding vector for the j-th word in the i-th document,
# or all 0s, if j is greater than the number of words in the i-th document
study_corpus_tensor = torch.zeros((len(docs_as_tensors), max_length, embedding_vector_size))

for i, doc_tensor in enumerate(docs_as_tensors):
    study_corpus_tensor[i, 0:doc_tensor.shape[0], :] = doc_tensor

t2 = time.time()
print(t2 - t1)

6.639481544494629


In [38]:
study_corpus_tensor.shape

torch.Size([1341, 5384, 100])

In [39]:
study_corpus_tensor.dtype # 4 bytes each

torch.float32

In [40]:
1341 * 5384 * 100 * 4

2887977600

In [63]:
docs_as_tensors[58].shape

torch.Size([5384, 100])

In [74]:
# Save the documents recorded as stacks of word embeddings
# They are incomplete, as we used the word2vec embeddings from the 1341 documents under investigation, and not all
# discharge summaries in MIMIC-III
torch.save(study_corpus_tensor, 'embedded_docs_incomplete.pt')

In [78]:
# Don't need to do stuff before this point, maybe
labelled_corpus_df = pd.read_csv("labelled_corpus_df.csv")

In [79]:
study_corpus_tensor = torch.load("embedded_docs_incomplete.pt")

In [82]:
labelled_corpus_df.head(1)

Unnamed: 0.1,Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,Advanced.Cancer,Advanced.Heart.Disease,Advanced.Lung.Disease,Chronic.Neurological.Dystrophies,Chronic.Pain.Fibromyalgia,Alcohol.Abuse,Other.Substance.Abuse,Obesity,Schizophrenia.and.other.Psychiatric.Disorders,Depression,Cleaned Text
0,0,118003.0,3644,Admission Date: [**2200-4-7**] Discharge ...,0,0,0,0,1,0,0,0,0,1,Admission Date 2200 4 7 Discharge Date 2200 4 ...


In [83]:
# Making sure we correctly padded with 0s
for i in range(num_study_docs):
    if torch.all(study_corpus_tensor[i, 0:docs_as_tensors[i].shape[0], :] != docs_as_tensors[i]):
        print("Copied wrong while padding document", i, "!")
    if docs_as_tensors[i].shape[0] < max_length and torch.all(study_corpus_tensor[i, docs_as_tensors[i].shape[0]:, :] != 0):
        print("Did not pad document", i, "with 0s!")

In [84]:
class CustomDatasetEmbedded(Dataset):
    def __init__(self, corpus_tensor, labels):
        """
        Store the corpus (of shape num_docs by max_num_words_per_doc by size_of_word embedding) 
        labels (for a single target variable)
        """

        self.x = corpus_tensor
        self.y = labels

    def __len__(self):

        """
        Return the number of documents
        """
        return len(self.y)

    def __getitem__(self, index):
        """
        Return one document (represented as a tensor, with each row being the embedding for one word in that document),
        and its label (whether the patient described has or does not have some phenotype)
        """
        return (self.x[index, :, :], self.y[index])

In [85]:
labelled_corpus_df["Depression"]

0       1
1       0
2       1
3       1
4       0
       ..
1336    0
1337    0
1338    0
1339    0
1340    0
Name: Depression, Length: 1341, dtype: int64

In [86]:
depression_dataset = CustomDatasetEmbedded(study_corpus_tensor, labelled_corpus_df["Depression"])

In [87]:
train_dataset, test_dataset = torch.utils.data.random_split(depression_dataset, [0.8, 0.2])

In [88]:
len(train_dataset)

1073

In [89]:
len(test_dataset)

268

In [90]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 32, shuffle = True) # Batch size?
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 32)

In [92]:
torch.cuda.is_available()

False

In [None]:
# Not sure of correctness!!!?!?!?!?!????
class CNN_1_gram(nn.Module):
    def __init__(self):
        super(CNN_1_gram, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 1,
                               out_channels = 100,
                               kernel_size = (1, embedding_vector_size)
                               stride = 1,
                               padding = 0)
        self.do = nn.Dropout(p = 0.5) 
        
        # each kernel's feature map is condensed to a single value
        conv1_output_height = study_corpus_tensor.shape[1] + 1 - 1
        self.pool1 = nn.MaxPool2d(kernel_size = (conv1_output_height, embedding_vector_size))
        self.fc= nn.Linear(100, 2) # Input size???? 100, for 100 filters????
        
    def forward(self, x):
        x1 = torch.relu(self.conv1(x))
        x = self.pool(x1)
        x = self.do(x) # WHERE DOES THIS GO????? Onto output of pooling???
        x = torch.flatten(x, start_dim = 1) # check that start_dim is right for our input????
        
        # Can I feed x into multiple convolutional layers in parallel????
        
        # Right????
        # local output = nn.LogSoftMax()(linear(nn.Dropout(opt.dropout_p)(last_layer)))
        x = nn.LogSoftmax(self.fc(x)) 
        # print(x.shape)
        # raise NotImplementedError
        return x