In [69]:
import pandas as pd
import numpy as np
import argparse

## Reading in the data files

In [None]:
# Input locations that are kept in named variables.
notes_file = "../data/NOTEEVENTS.csv"
d2_file = "../data/d2_mechvent_cohort08Oct19.csv"

# Note categories to retain in the final output.
# NOTE(review): the trailing spaces in "Physician " and "Respiratory " presumably
# mirror the raw CATEGORY values in the notes table -- confirm before stripping them.
categories = ["Physician ", "Nursing", "Nursing/other", "Respiratory "]

# Load the full cohort, the notes table and the d2 cohort.
cohort = pd.read_csv("../data/entire_mechvent_cohort_starttimes15Oct19.csv")
notes_df = pd.read_csv(notes_file)
d2 = pd.read_csv(d2_file)

In [71]:
# Command-line interface for running this code as a standalone script.
# It is disabled here by wrapping it in a string literal: under a Jupyter kernel
# `parser.parse_args()` rejects the kernel's own `-f <connection file>` argument
# and exits with SystemExit(2) (see the recorded output of this cell).
# NOTE(review): if this is re-enabled, `--categories` and `--verbosity` declare no
# `type=`/`nargs=`, so values given on the command line arrive as plain strings
# (a category list becomes one string, and "0" is truthy) -- confirm downstream
# code handles that.
'''
parser = argparse.ArgumentParser()
parser.add_argument("--cohort_file", help="path to the cohort information")
parser.add_argument("--notes_file", help="path to the notes file (NOTEVENTS)")
parser.add_argument("--save_path", help="path to save file including filename")
parser.add_argument("--categories", help="list of note categories to be included. Eg, ['physician', 'nursing']")
parser.add_argument("--verbosity", help="set to 1 (default) for verbose descriptions of code, else set to 0", default=1)
args = parser.parse_args()
'''

usage: ipykernel_launcher.py [-h] [--cohort_file COHORT_FILE]
                             [--notes_file NOTES_FILE] [--save_path SAVE_PATH]
                             [--categories CATEGORIES] [--verbosity VERBOSITY]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/2019/pnair6/.local/share/jupyter/runtime/kernel-ed4ea206-e17f-4a63-8387-37b3f8cb68b2.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
# Script-mode alternative to the hard-coded data-loading cell: it would read the
# cohort and notes tables, the output path, the categories and the verbosity
# from the parsed command-line `args`.
# Disabled (string literal) because `args` is only created by the argparse block,
# which is itself disabled in this notebook.
'''
cohort = pd.read_csv(args.cohort_file)
notes_df = pd.read_csv(args.notes_file)
output_path = args.save_path
categories = args.categories
verbosity = args.verbosity
'''

  interactivity=interactivity, compiler=compiler, result=result)


In [72]:
# function that calculates the difference between the start of mechanical ventilation and the chart time of notes
# Input : Data table which is a merge between the selected cohort and the notes
# Output: List of time difference between the charttime and start of mechanical ventilation

def get_time_diff(merged):
    """Return, per row, the hours elapsed from ventilation start to the note's chart time.

    The computation is vectorized: both timestamp columns are parsed once with
    ``pd.to_datetime`` and subtracted as whole columns, instead of parsing every
    row individually in a Python loop.

    Parameters
    ----------
    merged : pandas.DataFrame
        Table with ``CHARTTIME`` and ``FIRST_VENT_STARTTIME`` columns. Values are
        expected to be strings formatted as ``'%Y-%m-%d %H:%M:%S'``; columns that
        are already datetime-typed are accepted as well.

    Returns
    -------
    list of float
        One value per row of ``merged``, in row order: ``CHARTTIME`` minus
        ``FIRST_VENT_STARTTIME`` expressed in hours. The value is negative when
        the note was charted before ventilation started, and NaN when either
        timestamp is missing.

    Raises
    ------
    ValueError
        If a timestamp string does not match the expected format.
    """
    FMT = '%Y-%m-%d %H:%M:%S'

    chart_time = pd.to_datetime(merged['CHARTTIME'], format=FMT)
    vent_start = pd.to_datetime(merged['FIRST_VENT_STARTTIME'], format=FMT)

    # Dividing a timedelta column by a one-hour Timedelta yields float hours.
    hours_from_vent = (chart_time - vent_start) / pd.Timedelta(hours=1)

    # Callers expect a plain Python list (it is assigned as a new column).
    return hours_from_vent.tolist()

In [73]:
# This function carries out the preprocessing of the data. It uses a pipeline as follows:
#        -- Consider only the first ICU stay of patients
#        -- Remove all rows from the notes table with invalid CHARTTIME and HADM_ID
#        -- Merge the cohort table with the notes table on the HADM_ID
#        -- Find the difference between chart time of notes and start of mechanical ventilation
#        -- Extract notes within the first 48 hours of mechanical ventilation
#        -- From this, take only those notes within the required categories

# Input: cohort table, notes table, list of categories, verbosity value
# Output: the pre-processed dataframe following the above pipeline

def pre_process(cohort, notes, categories, verbosity=1, window_hours=48):
    """Select the notes written shortly after the start of mechanical ventilation.

    Pipeline:
      1. keep cohort rows whose ``FIRST_ICU_STAY`` equals ``'t'``;
      2. drop notes with a null ``CHARTTIME`` or ``HADM_ID``;
      3. merge notes with the cohort on ``HADM_ID``, keep rows that have a
         ``FIRST_VENT_STARTTIME`` and keep the first row per ``ROW_ID``;
      4. add ``TIME_FROM_VENT`` (hours from ventilation start to chart time,
         computed by ``get_time_diff``);
      5. keep notes with ``0 <= TIME_FROM_VENT <= window_hours``;
      6. keep notes whose ``CATEGORY`` is in ``categories``.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Cohort table; must provide ``FIRST_ICU_STAY``, ``HADM_ID``,
        ``ICUSTAY_ID`` and ``FIRST_VENT_STARTTIME``.
    notes : pandas.DataFrame
        Notes table; must provide ``ROW_ID``, ``HADM_ID``, ``CHARTTIME`` and
        ``CATEGORY``.
    categories : list
        ``CATEGORY`` values to retain (matched exactly via ``isin``).
    verbosity : int, optional
        Truthy to print progress and size summaries, falsy to stay silent.
    window_hours : int or float, optional
        Length, in hours, of the window after ventilation start from which
        notes are kept. Defaults to 48.

    Returns
    -------
    pandas.DataFrame
        The filtered notes, with ``ICUSTAY_ID``, ``FIRST_VENT_STARTTIME`` and
        ``TIME_FROM_VENT`` columns appended. Neither input frame is modified.
    """
    if verbosity:
        print("Initial size of cohort: " + str(cohort.shape) + "\n")
        print("Initial size of notes table: " + str(notes.shape) + "\n")
        print("\n...............\n")
        print("Obtaining records corresponding to first ICU stay only\n")
        print("Removing rows with invalid admission IDs and null chart time of notes")
        print("Merging cohort with notes\n")

    # getting patient records of first icu stay only
    first_icu_stay = cohort[cohort.FIRST_ICU_STAY == 't']

    # remove rows without CHARTTIME and HADM_ID values
    notes_df_filtered = notes[notes.CHARTTIME.notnull() & notes.HADM_ID.notnull()]

    # merging cohort with notes
    merged = pd.merge(
        notes_df_filtered,
        first_icu_stay[['HADM_ID', 'ICUSTAY_ID', 'FIRST_VENT_STARTTIME']],
        on=['HADM_ID'],
        how='left',
    )
    # Keep only notes that matched a cohort admission, one row per note.
    # Non-mutating calls are used so that no in-place operation runs on a
    # filtered slice (which can trigger pandas' SettingWithCopyWarning).
    merged = merged[merged.FIRST_VENT_STARTTIME.notnull()].drop_duplicates(
        subset='ROW_ID', keep='first'
    )
    if verbosity:
        print("Size of merged table: " + str(merged.shape))
        print("Number of unique admissions " + str(len(merged.HADM_ID.unique())) + "\n")
        print("\n.................\n")

    # calculate the difference between chart time of notes and start of mechanical ventilation
    if verbosity:
        # This progress message is now silenced by verbosity=0 like all the others.
        print("Calculating the difference between chart time of notes and start of mechanical ventilation \n")
    merged = merged.assign(TIME_FROM_VENT=get_time_diff(merged))
    if verbosity:
        print(merged.columns)

    # getting notes from only the first `window_hours` hours (both bounds inclusive)
    merged_window = merged[merged.TIME_FROM_VENT.between(0, window_hours)]
    if verbosity:
        print("Take notes only from first " + str(window_hours) + " hours of mechanical ventilation \n")
        print("Number of unique admissions " + str(len(merged_window.HADM_ID.unique())) + "\n")

    # retaining only notes of required categories
    res_notes = merged_window[merged_window.CATEGORY.isin(categories)]
    if verbosity:
        print("Retain notes of only required categories " + str(categories) + "\n")
        print("Final cohort size: " + str(res_notes.shape) + "\n")
        print("Final number of unique admissions " + str(len(res_notes.HADM_ID.unique())) + "\n")

    return res_notes

In [68]:
# Run the preprocessing pipeline on the d2 cohort (verbose output left at its default).
d2_notes = pre_process(cohort=d2, notes=notes_df, categories=categories)

Initial size of cohort: (20124, 47)

Initial size of notes table: (2083180, 11)


...............

Obtaining records corresponding to first ICU stay only

Removing rows with invalid admission IDs and null chart time of notes
Merging cohort with notes

Size of merged table: (597494, 13)
Number of unique admissions 8586


.................

Calculating the mechanical ventilation duration 



HBox(children=(IntProgress(value=0, max=597494), HTML(value='')))


Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
       'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT',
       'ICUSTAY_ID', 'FIRST_VENT_STARTTIME', 'TIME_FROM_VENT'],
      dtype='object')
Take notes only from first 48 hours of mechanical ventilation 

Number of unique admissions 8507

Retain notes of only required categories ['Physician ', 'Nursing', 'Nursing/other', 'Respiratory ']

Final cohort size: (78695, 14)

Final number of unique admissions 6831



In [57]:
# Write the preprocessed d2 notes to disk without the DataFrame index.
d2_notes.to_csv(
    "../data/d2_cohort_phy_nurse_resp_Nov1.csv",
    index=False,
)

In [None]:
# Run the same pipeline on the entire cohort, then write the result to disk
# without the DataFrame index.
whole_cohort_notes_preprocessed = pre_process(
    cohort=cohort,
    notes=notes_df,
    categories=categories,
)
whole_cohort_notes_preprocessed.to_csv(
    "../data/all_cohort_notes_phy_nurse_resp_Nov1.csv",
    index=False,
)

In [None]:
# Script-mode entry point: would run the pipeline with the command-line settings
# and write the result to `output_path`.
# Disabled (string literal): in the visible notebook, `verbosity` and `output_path`
# are only assigned inside the args-unpacking block, which is itself disabled.
'''
preprocessed_notes = pre_process(cohort, notes_df, categories, verbosity)

preprocessed_notes.to_csv(output_path, index=False)
'''