In [119]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from collections import Counter
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [76]:
# Load the four source tables from the local Data/ folder into pandas dataframes.
admissionsDf = pd.read_csv("Data/ADMISSIONS.csv")
diagnosesDf = pd.read_csv("Data/DIAGNOSES_ICD.csv")
patientsDf = pd.read_csv("Data/PATIENTS.csv")
# CHARTTIME and STORETIME are read explicitly as pandas strings instead of
# being left to type inference.
eventsDf = pd.read_csv(
    "Data/NOTEEVENTS.csv",
    dtype={"CHARTTIME": "string", "STORETIME": "string"},
)

### Data Cleaning
- Drop records for patients under 15 years of age - DONE!
- Use patient's first admission to ICU only - DONE!
- Segregate and deduplicate patient records - DONE!

In [79]:
# Keep only each patient's first visit: order the admissions by patient and
# admit time, then retain the first row seen for every SUBJECT_ID.
admissionsDf.sort_values(by=["SUBJECT_ID", "ADMITTIME"], ascending=True, inplace=True)
admissionsDf.drop_duplicates(subset=["SUBJECT_ID"], keep="first", inplace=True)

# Attach each patient's date of birth to their first admission.
dobDf = patientsDf.loc[:, ["SUBJECT_ID", "DOB"]]
admissionsDf1 = admissionsDf.merge(dobDf, how="left", on="SUBJECT_ID")

# Parse both timestamp columns so they can be subtracted from one another.
for date_col in ("ADMITTIME", "DOB"):
    admissionsDf1[date_col] = pd.to_datetime(admissionsDf1[date_col])

# Age at admission in whole years: elapsed days floor-divided by 365
# (leap days are ignored).
# NOTE(review): the subtraction is done on the raw numpy arrays; confirm that
# very large ADMITTIME - DOB gaps cannot overflow the underlying resolution.
elapsed = admissionsDf1["ADMITTIME"].values - admissionsDf1["DOB"].values
admissionsDf1["AGE"] = elapsed / np.timedelta64(1, "D") // 365


In [82]:
# Collect the SUBJECT_ID of every patient whose absolute age at admission is below 15.
is_under15 = abs(admissionsDf1["AGE"]) < 15.0
under15 = admissionsDf1.loc[is_under15]
under15Patients = list(under15["SUBJECT_ID"])  # 7,875 patients under 15 years

# note: conflicting information in the paper about age filter (page 1155 indicates >= 15 years while page 1156 indicates
# > 15 years). We are using >= 15 years as candidate patients

In [83]:
# Remove every under-15 patient from the admissions, diagnoses, events and
# patients tables by excluding rows whose SUBJECT_ID is in under15Patients.
admissionsFiltered = admissionsDf1.loc[~admissionsDf1["SUBJECT_ID"].isin(under15Patients)]
diagnosesFiltered = diagnosesDf.loc[~diagnosesDf["SUBJECT_ID"].isin(under15Patients)]
eventsFiltered = eventsDf.loc[~eventsDf["SUBJECT_ID"].isin(under15Patients)]
# 38,645 adult (>=15 years) patients remain
patientsFiltered = patientsDf.loc[~patientsDf["SUBJECT_ID"].isin(under15Patients)]

#Note: the FarSight paper incorrectly states there are 7,704 distinct patients (page 1155)

In [85]:
# Discard note events flagged as erroneous (ISERROR == 1). Rows where ISERROR
# is missing compare unequal to 1 and are therefore kept.
eventsNoError = eventsFiltered.loc[eventsFiltered.ISERROR != 1]
# Remove rows that are exact duplicates across every column.
eventsNoDuplicate = eventsNoError.drop_duplicates()
# Sorted, unique SUBJECT_IDs of the patients that still have at least one
# usable (error-free) event. This must be derived from the error-filtered
# frame: building it from eventsFiltered would also list patients whose only
# notes are flagged as errors.
patientsNoErrors = sorted(eventsNoDuplicate.SUBJECT_ID.unique())


In [89]:
# Restrict diagnoses and events to each patient's retained admission by
# left-merging them onto the (SUBJECT_ID, HADM_ID) pairs of the admissions table.
merge_keys = ["SUBJECT_ID", "HADM_ID"]
patientHadmCode = admissionsFiltered[merge_keys]
diagnosesFiltered1 = patientHadmCode.merge(diagnosesFiltered, how="left", on=merge_keys)
eventsFiltered1 = patientHadmCode.merge(eventsNoDuplicate, how="left", on=merge_keys)
# Admissions without a matching note get an empty string instead of NaN, so the
# texts can later be joined as strings.
eventsFiltered1[["TEXT"]] = eventsFiltered1[["TEXT"]].fillna("")

In [90]:
# Build the final dataframes used by the rest of the analysis: each one keeps
# only the rows whose SUBJECT_ID appears in patientsNoErrors.
admissions = admissionsFiltered.loc[admissionsFiltered["SUBJECT_ID"].isin(patientsNoErrors)]
patients = patientsFiltered.loc[patientsFiltered["SUBJECT_ID"].isin(patientsNoErrors)]

# Diagnoses are additionally ordered by patient.
diagnoses = (
    diagnosesFiltered1
    .loc[diagnosesFiltered1["SUBJECT_ID"].isin(patientsNoErrors)]
    .sort_values(by="SUBJECT_ID")
)

# Events are ordered by patient and then by chart date.
events = (
    eventsFiltered1
    .loc[eventsFiltered1["SUBJECT_ID"].isin(patientsNoErrors)]
    .sort_values(by=["SUBJECT_ID", "CHARTDATE"])
)


### Data Aggregation (using FarSight approach)
- Concatenate the chronological notes for each patient - DONE!
- Create a set of all ICD-9 codes for each patient - DONE!

In [93]:
# Gather, for every patient, the set of all ICD9 codes recorded for them.
# diag_grouped has one row per SUBJECT_ID with the set in "Code_groups".
codes_by_patient = diagnoses[["SUBJECT_ID", "ICD9_CODE"]].groupby(by="SUBJECT_ID")
diag_grouped = codes_by_patient["ICD9_CODE"].apply(set).reset_index(name="Code_groups")
# Plain Python list of those sets, in the row order of diag_grouped.
icd9_grouped = list(diag_grouped["Code_groups"])

In [104]:
# Build one string per patient by joining all of that patient's note texts
# (in the row order of `events`) with a single space.
texts_by_patient = events[["SUBJECT_ID", "TEXT"]].groupby(by="SUBJECT_ID")
events_grouped = texts_by_patient["TEXT"].apply(list).reset_index(name="text_groups")
# One list of note texts per patient.
notes_grouped = list(events_grouped["text_groups"])
# One concatenated string per patient.
notes_list_all = list(map(" ".join, notes_grouped))

### Data Preprocessing
- Remove multiple spaces and special characters - DONE!
- Tokenization (using NLTK) - DONE!
- Stopword removal from generated tokens (using the NLTK English stopword corpus) - DONE!
- Remove punctuation marks except hyphens and slashes - DONE!
- Remove references to images - DONE!
- Perform character case folding - DONE!
- Perform medical concept normalization through disambiguation of abbreviations (into long form) using CARD - FOLLOW UP!
- Perform suffix stripping through stemming - DONE!
- Convert stripped tokens into their respective base forms by lemmatization - DONE!
- Discard tokens appearing in less than 10 nursing notes - DONE!

In [107]:
# !!! Use a subset of the data for faster testing.
# Set N_TEST_PATIENTS to None for the final submission to process every
# patient. Do not comment this cell out: later cells need notes_list to exist.
N_TEST_PATIENTS = 10
notes_list = notes_list_all[:N_TEST_PATIENTS]  # slicing with [:None] keeps the full list

In [108]:
# Helper that strips special characters, newlines and repeated spaces from a note.
def clean_notes(note):
    """Return `note` as a string containing only letters, digits, '-', '/' and single spaces.

    The input is converted with str() first, so non-string values are accepted.
    Substitutions are applied in order:
      1. every run of characters other than a-z, A-Z, 0-9, '-' and '/' becomes one space;
      2. any remaining newline is removed;
      3. runs of spaces are collapsed to a single space.
    Leading or trailing spaces are not trimmed.
    """
    text = str(note)
    substitutions = (
        ("[^a-zA-Z0-9-/]+", " "),  # special characters
        ("\n", ""),                # newlines
        (" +", " "),               # extra spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Run every patient's concatenated note text through clean_notes.
notes_list = [clean_notes(note) for note in notes_list]

In [109]:
# Tokenize the nursing note text and normalize every token.
tokens = []  # one list of normalized tokens per entry of notes_list
stops = set(stopwords.words('english'))  # English stopwords corpus
pStem = PorterStemmer()  # stemmer used for suffix stripping (the less aggressive one)
lStem = LancasterStemmer()  # alternative stemmer; use either this or pStem
wLemma = WordNetLemmatizer()  # lemmatizer applied after stemming
img1=".jpeg"; img2=".jpg"; img3=".png"; img4=".tiff"; img5=".bmp"  # image references for lookup
image_markers = (img1, img2, img3, img4, img5)
# NOTE(review): notes_list has already been through clean_notes, which replaces
# '.' with a space, so these dotted markers can probably never match at this
# point. Confirm, and move the image-reference removal before cleaning if needed.

for txt in notes_list:
    note_tokens = []
    for word in nltk.word_tokenize(txt):
        # Fold case BEFORE the stopword lookup: the stopword list is lowercase,
        # so testing the raw token would let "The", "And", ... slip through.
        folded = word.casefold()
        if folded in stops:
            continue
        if any(marker in folded for marker in image_markers):
            continue
        # Stem first, then lemmatize the stem as a verb.
        note_tokens.append(wLemma.lemmatize(pStem.stem(folded), pos="v"))
    tokens.append(note_tokens)
    

In [110]:
# Eliminate tokens appearing in fewer than MIN_NOTE_COUNT nursing texts.
MIN_NOTE_COUNT = 10

# Document frequency: each token is counted at most once per note (via set), so
# the count is the number of notes that contain it, not its total number of
# occurrences across the corpus.
token_counts = Counter(token for note in tokens for token in set(note))
under_10_tokens = [t for (t, c) in token_counts.items() if c < MIN_NOTE_COUNT]
# NOTE: when only a small test subset of notes is loaded (e.g. 10), a token has
# to occur in nearly every note to survive this filter.

# Use a set for the membership test so the filtering stays linear in the corpus size.
rare_tokens = set(under_10_tokens)
for idx, note in enumerate(tokens):
    tokens[idx] = [token for token in note if token not in rare_tokens]

### NOTE!!!
At this point we have:
- patientsNoErrors: which is a list of patient IDs (age >= 15) arranged in ascending order
- icd9_grouped: which is a list of sets of icd9 codes where each set contains the icd9 codes for an individual patient. Each set of icd9 codes is for the patient at the corresponding index in patientsNoErrors. 
- tokens: which is a list of lists where each list contains the normalized tokens for an individual patient, generated from the concatenated nursing notes for that patient. Each list of tokens is for the patient at the corresponding index in patientsNoErrors.

### Clinical feature modeling - VECTOR SPACE MODELING OF CLINICAL NOTES
- Obtain the Doc2Vec style features from the normalized nursing note tokens. Utilize the implementation in the Python Gensim package, with an embedding size of 500 (trained for 25 epochs), determined empirically using grid-search as per the original FarSight paper. - DONE!

In [134]:
# Wrap each patient's token list in a TaggedDocument tagged with its position
# in `tokens`, then train a Doc2Vec model on those documents.
documents = [
    TaggedDocument(words=doc, tags=[position])
    for position, doc in enumerate(tokens)
]

# 500-dimensional embeddings, 25 training epochs, dm=1.
d2vModel = Doc2Vec(vector_size=500, epochs=25, dm=1)
d2vModel.build_vocab(documents)  # the vocabulary must be built before training
d2vModel.train(
    documents,
    total_examples=d2vModel.corpus_count,
    epochs=d2vModel.epochs,
)

In [139]:
# Turn every patient's token list into a Doc2Vec feature vector by inferring it
# with the trained model (one vector per entry of `tokens`, in the same order).
d2v_tokens = list(map(d2vModel.infer_vector, tokens))

### Clinical feature modeling - TOPIC MODELING OF CLINICAL NOTES
- Obtain the Doc2Vec style features from the normalized nursing note tokens. Utilize the implementation in the Python Gensim package, with an embedding size of 500 (trained for 25 epochs), determined empirically using grid-search as per the original Farsight paper. 