In [None]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the raw notes export; the path is relative to the notebook's working directory.
# NOTE(review): the truncated message in this cell's output looks like the tail of a
# pandas DtypeWarning (mixed-type columns) — confirm, and consider passing dtype= explicitly.
df = pd.read_csv("noteevents_raw.csv")
# Preview the first rows to check the column layout.
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [3]:
# Preprocessing
def preprocess(df):
    """Return a new frame in which every column is cast to ``str`` and lower-cased.

    The input frame is left untouched. Because every column goes through
    ``astype(str)``, non-string values come back as their string form.
    """
    def _lowercase(column):
        # Cast first so the .str accessor is available on numeric columns too.
        return column.astype(str).str.lower()

    # Stripping bracketed/parenthesised (redacted) substrings from TEXT is
    # currently disabled:
    #     df['TEXT'] = df['TEXT'].str.replace(r"[\(\[].*?[\)\]]", "")
    return df.apply(_lowercase)

In [4]:
# Create features  
def createFeats(df):
    """Pull the named sections out of each note in ``df['TEXT']`` into new columns.

    A section is found by its header followed by a newline (e.g. ``allergies:``)
    and runs, non-greedily, up to the first blank line. Notes that lack a header
    get NaN in the corresponding column. The columns are added to ``df`` in
    place and the same ``df`` is returned.
    """
    # (output column, pattern). Where a pattern has two capture groups, the
    # first holds the header alternatives and the last holds the section body.
    section_patterns = (
        ('allergy', '(?s)allergies:\n(.+?)\n\n'),
        ('chief complaint', '(?s)chief complaint:\n(.+?)\n\n'),
        ('history of present illness', '(?s)(history of present illness|hpi):\n(.+?)\n\n'),
        ('past medical history', '(?s)past medical history:\n(.+?)\n\n'),
        ('past procedure', '(?s)invasive procedure:\n(.+?)\n\n'),
        ('social history', '(?s)social history:\n(.+?)\n\n'),
        ('family history', '(?s)(family history|family hx):\n(.+?)\n\n'),
        ('initial exam', '(?s)(admission labs|physical exam):\n(.+?)\n\n'),
        ('admission medications', '(?s)(admission medications|meds on admission|medications on admission):\n(.+?)\n\n'),
        ('pertinent results', '(?s)pertinent results:\n(.+?)\n\n'),
        ('discharge medication', '(?s)(discharge med|discharge medicine|discharge medication|discharge medications|meds on discharge):\n(.+?)\n\n'),
    )

    notes = df['TEXT']
    for column, pattern in section_patterns:
        captured = notes.str.extract(pattern, expand=True)
        # The section body is always the last capture group of the pattern.
        df[column] = captured.iloc[:, -1]
    return df

In [5]:
# Lower-case every column, then extract the note sections into feature columns.
df_preprocessed = preprocess(df)
# createFeats adds its columns in place and returns the same frame, so
# df_feats and df_preprocessed refer to one object at this point.
df_feats = createFeats(df_preprocessed)

# Drop the names that are no longer needed (presumably to release the raw frame's memory).
# Gotcha: once `df` is deleted, re-running this cell requires re-running the read cell first.
del df
del df_preprocessed

In [6]:
# Which categories are useful?
# Count distinct values per column within each CATEGORY; a category whose
# extracted-section columns are all 0 contains none of the sections we parse.
df_feats.groupby('CATEGORY').nunique()

Unnamed: 0_level_0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,...,chief complaint,history of present illness,past medical history,past procedure,social history,family history,initial exam,admission medications,pertinent results,discharge medication
CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
case management,967,576,620,867,907,967,1,88,27,2,...,0,0,0,0,0,0,0,0,0,0
consult,98,49,50,66,65,98,1,22,24,1,...,0,0,0,0,0,0,0,0,0,0
discharge summary,59652,41127,52726,28282,1,1,1,2,1,1,...,19485,38391,39394,24807,33634,19611,36312,36821,35523,37183
ecg,209051,35366,44186,37519,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
echo,45794,22316,23586,23427,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
general,8301,2908,3171,5714,6952,8301,1,269,590,2,...,4,0,0,0,0,0,0,0,0,0
nursing,223556,7704,9071,24407,84295,223543,1,51,486,2,...,2,2,0,0,0,0,0,0,0,0
nursing/other,822497,30005,34891,37316,783197,800994,1,1,1146,1,...,0,26,3,0,0,0,829,0,0,0
nutrition,9418,2823,3168,6377,7296,9418,1,4,41,2,...,0,0,0,0,0,0,0,0,0,0
pharmacy,103,67,69,90,91,103,1,45,12,2,...,0,0,0,0,0,0,0,0,0,0


- Only the "discharge summary" category contains the sections we need, including discharge medications, so all other categories are removed.  

In [7]:
# Remove all other categories
def removeCategories(df):
    """Keep only the rows whose CATEGORY contains "discharge summary".

    Prints the number of rows kept and returns the filtered frame; the
    original row labels are preserved.
    """
    is_discharge_summary = df['CATEGORY'].str.contains("discharge summary")
    kept = df.loc[is_discharge_summary]
    print(f'{len(kept)} instances')
    return kept
    
# Rebind df_feats to the discharge-summary rows only; every later cell works on this subset.
df_feats = removeCategories(df_feats)

59652 instances


In [8]:
# Create hypertension drugs multi-label encoding
hypertension_drugs = ['metoprolol','furosemide','lisinopril','amlodipine','atenolol','hydrochlorothiazide','diltiazem','carvedilol']

def createDrugFeats(df):
    """Add one 0/1 indicator column per hypertension drug and keep the rows with at least one.

    For every name in ``hypertension_drugs`` a column of that name is set to 1
    when the name occurs as a substring of the row's 'discharge medication'
    text, else 0. A 'sum' column holds the number of drugs found, and rows
    with a count of 0 are dropped. The number of remaining rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a text column named 'discharge medication'. Missing
        values in that column count as "no drug found".

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. The indicator columns and
        'sum' are integers.
    """
    # Work on a copy: the caller passes a filtered slice, and writing columns
    # into it would mutate the caller's frame and raise SettingWithCopyWarning.
    df = df.copy()
    medications = df['discharge medication']

    # label 1 for each column with specific discharge medication
    for drug in hypertension_drugs:
        # regex=False: drug names are plain literals, not patterns.
        # na=False: a missing medication section yields 0 instead of NaN,
        # which keeps the indicator (and therefore 'sum') an integer.
        df[drug] = medications.str.contains(drug, regex=False, na=False).astype(int)

    # count number of hypertension drugs for each instance
    df['sum'] = df[hypertension_drugs].sum(axis=1)

    # keep only rows where count != 0
    df = df[df['sum'] != 0]
    print(f'{len(df)} instances')

    return df

In [9]:
# Label each discharge summary with the hypertension drugs found in its
# discharge medications; summaries mentioning none of them are dropped.
df_drugs = createDrugFeats(df_feats)

19710 instances


In [10]:
# post-processing
def postProcess(df):
    """Assemble the final table from the drug-labelled discharge summaries.

    Keeps the note text, the extracted sections and the drug labels; replaces
    missing values with "" and newlines with spaces; builds 'admission_notes'
    by joining the ten admission-time sections with single spaces; renames
    'TEXT' to 'discharge_notes'; and returns the columns in a fixed order.
    The number of rows is printed.

    Columns are selected by name rather than by position, so the result does
    not depend on how many metadata columns precede 'TEXT' or on their order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TEXT', the extracted section columns, the per-drug
        indicator columns and 'sum'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified). Row labels are converted to
        strings, as in the previous implementation.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    # Admission-time sections, in the order they are joined into 'admission_notes'.
    admission_cols = ['allergy',
                      'chief complaint',
                      'history of present illness',
                      'past medical history',
                      'past procedure',
                      'social history',
                      'family history',
                      'initial exam',
                      'admission medications',
                      'pertinent results']
    # Discharge medication text plus the multi-label encoding derived from it.
    label_cols = ['discharge medication',
                  'metoprolol',
                  'furosemide',
                  'lisinopril',
                  'amlodipine',
                  'atenolol',
                  'hydrochlorothiazide',
                  'diltiazem',
                  'carvedilol',
                  'sum']

    required = ['TEXT'] + admission_cols + label_cols
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f'postProcess: missing required columns: {missing}')

    # Select by name (drops the metadata columns), fill nan with empty string,
    # then replace new lines with space.
    cleaned = df[required].fillna("").replace("\n", " ", regex=True)

    # create full admission notes with existing features
    admission_notes = [' '.join(row.astype(str)) for row in cleaned[admission_cols].values]

    final_order = ['discharge_notes', 'admission_notes'] + admission_cols + label_cols
    result = (
        cleaned
        .assign(admission_notes=admission_notes)
        # index=str also converts the row labels to strings; kept so the
        # returned index matches what callers received before.
        .rename(index=str, columns={'TEXT': 'discharge_notes'})
        .reindex(final_order, axis="columns")
    )

    print(f'{len(result)} instances')
    return result

In [11]:
# Clean the text, build the combined admission notes and put the columns in their final order.
df_final = postProcess(df_drugs)

19710 instances


In [12]:
# Inspect the final table before writing it out.
df_final

Unnamed: 0,discharge_notes,admission_notes,allergy,chief complaint,history of present illness,past medical history,past procedure,social history,family history,initial exam,...,discharge medication,metoprolol,furosemide,lisinopril,amlodipine,atenolol,hydrochlorothiazide,diltiazem,carvedilol,sum
1,admission date: [**2118-6-2**] discharg...,1. copd. last pulmonary function tests in ...,,,,1. copd. last pulmonary function tests in [**...,,,,,...,1. levothyroxine 75 mcg p.o. q.d. 2. citalopra...,0,0,0,0,0,0,1,0,1.0
3,admission date: [**2124-7-21**] ...,amlodipine copd exacerbation/shortness of brea...,amlodipine,copd exacerbation/shortness of breath,"87 yo f with h/o chf, copd on 5 l oxygen at ba...","# copd flare fev1 40% in [**2120**], on 5l oxy...",intubation arterial line placement picc line p...,the patient is married and worked as a clinica...,"(+) fhx cad; father with an mi in his 40's, di...",on admission vitals: t: bp: 116/46 p: 92 o2: 1...,...,1. metoprolol succinate 50 mg tablet sustained...,1,0,0,0,0,0,0,0,1.0
11,admission date: [**2192-4-19**] ...,lisinopril diarrhea mrs. [**known lastname **]...,lisinopril,diarrhea,mrs. [**known lastname **] is an 84 yo f h/o c...,recent admission to [**hospital1 18**] from [*...,none,living alone independently prior to last hospi...,noncontributory.,"tm 95.7, bp 108/50, p 93, r 25, 98% ra perrl. ...",...,1. acetaminophen 325 mg tablet sig: 1-2 tablet...,1,1,0,0,0,0,0,0,2.0
15,admission date: [**2203-11-3**] ...,"penicillins shortness of breath, chest pain 41...",penicillins,"shortness of breath, chest pain","41 y/o m w/ h/o hiv/aids (hiv dx 83, aids 92, ...","- hiv/aids: hiv dignosed in '[**79**], aids di...",pericardiocentesis egd with biopsy,pt was most recently living at [**hospital1 **...,nc,"on admission: vs: t 99.3, bp 132/72 , hr 105 ,...",...,"1. methadone 40 mg tablet, soluble sig: two (2...",1,1,0,0,0,0,0,0,2.0
18,admission date: [**2195-4-14**] ...,atorvastatin / penicillins / codeine / oxycodo...,atorvastatin / penicillins / codeine / oxycodone,left leg swelling/edema,"71f with history of chf, cad, afib on coumadin...",- hypertension - hyperlpidemia - ventricular t...,none,-former tobacco [**12-1**] pack per day x 25 y...,"[**name (ni) 2280**], mother with 'heart trou...",vitals: t: 97.6 bp: 91/57 p: 65 r: 26 o2: 98% ...,...,1. senna-gen 8.6 mg tablet sig: two (2) tablet...,0,1,0,0,0,0,0,0,1.0
19,admission date: [**2130-2-3**] d...,penicillins bright red blood per rectum 69 yo ...,penicillins,bright red blood per rectum,69 yo m presents 8 days s/p anal seton placeme...,"1. crohn's dz, found in [**2125**] on colonosc...",none,married for 46 years and lives with wife. 3 c...,father died at 62 from mi mother died at 52 of...,at time of discharge:,...,"1. protonix 40 mg tablet, delayed release (e.c...",0,0,1,0,1,0,0,0,2.0
20,admission date: [**2131-6-28**] ...,penicillins mr. [**known lastname 2302**] is a...,penicillins,mr. [**known lastname 2302**] is a 70 y.o. mal...,patient had felt well during the last 2 weeks ...,"1. crohn's dz, found in [**2125**] on colonosc...",colonoscopy w/ multiple bx,married for 46 years and lives with wife. 3 c...,father died at 62 from mi mother died at 52 of...,"exam afebrile, bp 100s/60s, 98%ra, hr 70s ...",...,1. acetaminophen-codeine 300-30 mg tablet sig:...,0,0,1,0,1,0,0,0,2.0
21,admission date: [**2131-7-5**] d...,penicillins new diagnosis colon cancer mr. [**...,penicillins,new diagnosis colon cancer,mr. [**known lastname 2302**] is a 70 y.o. mal...,"1. crohn's dz, found in [**2125**] on colonosc...",[**2131-7-11**]: ercp with sphincterotomy [**2...,married for 46 years and lives with wife. 3 c...,father died at 62 from mi mother died at 52 of...,t-96.8 bp-140/70 p-96 rr-20 sats-95% on ra gen...,...,1. atenolol 50 mg tablet sig: one (1) tablet p...,0,1,0,0,1,0,0,0,2.0
22,admission date: [**2190-2-11**] ...,no known allergies / adverse drug reactions dy...,no known allergies / adverse drug reactions,dyspnea,57 yo f with pmhx of htn who presents with sho...,-htn -vitiligo -uterine fibroids,bronchoscopy,"she works for the [**company 2318**], currentl...","mother had [**name (ni) 2320**], died at 74 fr...",vitals: t:98.8 bp:105/66 p: 91 r: 28 o2: 94% ...,...,1. home oxygen 4-5 l continuous oxygen for por...,0,0,0,0,1,0,0,0,1.0
24,admission date: [**2198-4-23**] ...,patient recorded as having no known allergies ...,patient recorded as having no known allergies ...,fever,this is a 75 year old man who is a retired ane...,# coronary artery disease status post cabg x4 ...,1. exploratory laparotomy. 2. pancreatic deb...,"a retired anesthesiologist, worked in pain man...",father had a mi at age 70.,vitals: 102.2 112/p 68 16 93%-ra gen: a+ox...,...,1. ipratropium bromide 0.02 % solution sig: on...,0,0,0,0,0,0,0,1,1.0


In [13]:
# Write the final table to CSV in the working directory. With to_csv's defaults the
# row index is written too, as the first (unnamed) column.
df_final.to_csv("discharge_notes_with_medication_full_text_18APR.csv")