In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import pandas as pd

In [3]:
mimic_iv_path = "/cis/home/charr165/Documents/physionet.org/mimiciv/2.2"
mm_dir = "/cis/home/charr165/Documents/multimodal"

output_dir = os.path.join(mm_dir, "preprocessing")
os.makedirs(output_dir, exist_ok=True)

In [4]:
f_path = os.path.join(mimic_iv_path, "hosp", "admissions.csv")
admissions_df = pd.read_csv(f_path, low_memory=False)
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])

icustays_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "icustays.csv"), low_memory=False)
icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])
icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])

procedureevents_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "procedureevents.csv"), low_memory=False)
procedureevents_df['starttime'] = pd.to_datetime(procedureevents_df['starttime'])
procedureevents_df['endtime'] = pd.to_datetime(procedureevents_df['endtime'])
procedureevents_df['storetime'] = pd.to_datetime(procedureevents_df['storetime'], format='mixed')

chartevents_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "chartevents.csv"), low_memory=False)
chartevents_df['charttime'] = pd.to_datetime(chartevents_df['charttime'])
chartevents_df['storetime'] = pd.to_datetime(chartevents_df['storetime'])

In [5]:
hosp_lab_events = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "labevents.csv"), low_memory=False)
hosp_lab_events['charttime'] = pd.to_datetime(hosp_lab_events['charttime'])
hosp_lab_events['storetime'] = pd.to_datetime(hosp_lab_events['storetime'])

# Drop hosp_lab_events where hadm_id is nan
hosp_lab_events = hosp_lab_events.dropna(subset=['hadm_id'])

In [6]:
d_lab_items_df = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "d_labitems.csv"), low_memory=False)

# Drop rows with missing values
d_lab_items_df = d_lab_items_df.dropna()

# Search labels for something that looks like ph
ph_labels = d_lab_items_df[d_lab_items_df['label'].str.contains('Glucose', case=False)]
print(ph_labels)

      itemid                 label                fluid    category
7      50809               Glucose                Blood   Blood Gas
40     50842      Glucose, Ascites              Ascites   Chemistry
129    50931               Glucose                Blood   Chemistry
210    51022  Glucose, Joint Fluid          Joint Fluid   Chemistry
222    51034   Glucose, Body Fluid     Other Body Fluid   Chemistry
241    51053      Glucose, Pleural              Pleural   Chemistry
272    51084        Glucose, Urine                Urine   Chemistry
638    51478               Glucose                Urine  Hematology
908    51790          Glucose, CSF  Cerebrospinal Fluid   Chemistry
1034   51941        Glucose, Stool                Stool   Chemistry
1074   51981               Glucose                Urine   Chemistry
1120   52027  Glucose, Whole Blood                Blood   Blood Gas
1528   52569               Glucose                Blood   Chemistry


In [7]:
d_items_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "d_items.csv"), low_memory=False)
# d_items_df = d_items_df[d_items_df['category'] == "Labs"]
# ph_labels = d_items_df[d_items_df['label'].str.contains('pressure', case=False)]
# print(ph_labels)

def get_procedures_of_interest(df):
    df = df.copy()

    event_list = ['Foley Catheter', 'PICC Line', 'Intubation', 'Peritoneal Dialysis', 
                            'Bronchoscopy', 'EEG', 'Dialysis - CRRT', 'Dialysis Catheter', 
                            'Chest Tube Removed', 'Hemodialysis']
    event_links_df = pd.DataFrame()
    for event in event_list:
        curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)
    return df

def get_labs_of_interest(df):
    df = df.copy()

    event_list = ['Glucose', 'Potassium', 'Sodium', 'Chloride', 'Creatinine',
           'Urea Nitrogen', 'Bicarbonate', 'Anion Gap', 'Hemoglobin', 'Hematocrit',
           'Magnesium', 'Platelet Count', 'Phosphate', 'White Blood Cells',
           'Calcium, Total', 'MCH', 'Red Blood Cells', 'MCHC', 'MCV', 'RDW', 
                      'Platelet Count', 'Neutrophils', 'Vancomycin'
                  ]

    event_links_df = pd.DataFrame()
    for event in event_list:
        # print(event)
        curr_event_item_id = d_lab_items_df[d_lab_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)

    return df

def get_vitals_of_interest(df):
    df = df.copy()

    event_list = [ #CHART EVENTS
                  'Heart Rate','Non Invasive Blood Pressure systolic',
                    'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
                    'Respiratory Rate','O2 saturation pulseoxymetry', 
                    'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response']

    event_links_df = pd.DataFrame()
    for event in event_list:
        # print(event)
        curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)

    rename_dict = {
        'Non Invasive Blood Pressure systolic': 'Systolic BP',
        'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
        'Non Invasive Blood Pressure mean': 'Mean BP',
        'O2 saturation pulseoxymetry': 'O2 Saturation'
    }

    df['event'] = df['event'].replace(rename_dict)
    
    return df


# procedureevents_df = get_procedures_of_interest(procedureevents_df)
labevents_df = get_labs_of_interest(chartevents_df)
vitals_df = get_vitals_of_interest(chartevents_df)
labevents_df = labevents_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# procedureevents_df = procedureevents_df[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'storetime', 'value', 'event']]

# labs_df = get_labs_of_interest(hosp_lab_events)
# vitals_df = get_vitals_of_interest(chartevents_df)
# labs_vitals_df = labs_vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]

In [8]:
del chartevents_df, hosp_lab_events

In [None]:
# d_items_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "d_items.csv"), low_memory=False)
# # d_items_df = d_items_df[d_items_df['category'] == "Labs"]

# def get_procedures_of_interest(df):
#     df = df.copy()

#     event_list = ['Foley Catheter', 'PICC Line', 'Intubation', 'Peritoneal Dialysis', 
#                             'Bronchoscopy', 'EEG', 'Dialysis - CRRT', 'Dialysis Catheter', 
#                             'Chest Tube Removed', 'Hemodialysis']
#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)
#     return df

# def get_labs_of_interest(df):
#     df = df.copy()

#     event_list = [  #LAB EVENTS
#                   'Glucose (serum)', 'Glucose (whole blood)',
#                   'Potassium (serum)', 'Potassium (whole blood)', 
#                   'Sodium (serum)', 'Sodium (whole blood)',
#                   'Chloride (serum)', 'Chloride (whole blood)',
#                   'Creatinine (serum)', 'Creatinine (whole blood)',
#                   'BUN', #   'Urea Nitrogen', 
#                   'HCO3 (serum)', #   'Bicarbonate', 
#                   'Anion gap', 
#                   'Hemoglobin', 
#                   'Hematocrit (serum)', 'Hematocrit (whole blood - calc)',
#                   'Magnesium', 
#                   'Platelet Count', 
#                   'Alkaline Phosphate', 
#                   'WBC', #'White Blood Cells',
#                   'Calcium non-ionized', 'Ionized Calcium', #'Calcium, Total', 
#                 #   'MCH', 
#                 #   'Red Blood Cells', 
#                 #   'MCHC', 
#                 #   'MCV', 
#                 #   'RDW', 
#                   'Absolute Neutrophil Count', #  'Neutrophils', 
#                   'Vancomycin (Peak)', 'Vancomycin (Random)', 'Vancomycin (Trough)',
#                   # NEW
#                   'PH (Arterial)', 'PH (dipstick)', 'PH (SOFT)', 'PH (Venous)',
#                   'Capillary Refill R', 'Capillary Refill L',
#                   'Temperature Celsius',
#                   'Daily Weight', 'Admission Weight (Kg)',
#                   'Inspired O2 Fraction'
#                   ]

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         # print(event)
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Glucose (serum)': 'Glucose',
#         'Glucose (whole blood)': 'Glucose',
#         'Potassium (serum)': 'Potassium',
#         'Potassium (whole blood)': 'Potassium',
#         'Sodium (serum)': 'Sodium',
#         'Sodium (whole blood)': 'Sodium',
#         'Chloride (serum)': 'Chloride',
#         'Chloride (whole blood)': 'Chloride',
#         'Creatinine (serum)': 'Creatinine',
#         'Creatinine (whole blood)': 'Creatinine',
#         'BUN': 'Urea Nitrogen',
#         'HCO3 (serum)': 'Bicarbonate',
#         'Hematocrit (serum)': 'Hematocrit',
#         'Hematocrit (whole blood - calc)': 'Hematocrit',
#         'Calcium non-ionized': 'Calcium',
#         'Ionized Calcium': 'Calcium',
#         'Vancomycin (Peak)': 'Vancomycin',
#         'Vancomycin (Random)': 'Vancomycin',
#         'Vancomycin (Trough)': 'Vancomycin',
#         'PH (Arterial)': 'PH',
#         'PH (dipstick)': 'PH',
#         'PH (SOFT)': 'PH',
#         'PH (Venous)': 'PH',
#         'Capillary Refill R': 'Capillary Refill',
#         'Capillary Refill L': 'Capillary Refill',
#         'Temperature Celsius': 'Temperature',
#         'Daily Weight': 'Weight',
#         'Admission Weight (Kg)': 'Weight',
#         'Inspired O2 Fraction': 'Inspired O2 Fraction'
#     }

#     df['event'] = df['event'].replace(rename_dict)

#     return df

# def get_vitals_of_interest(df):
#     df = df.copy()

#     event_list = [ #CHART EVENTS
#                   'Heart Rate','Non Invasive Blood Pressure systolic',
#                     'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
#                     'Respiratory Rate','O2 saturation pulseoxymetry', 
#                     'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response']

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         # print(event)
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Non Invasive Blood Pressure systolic': 'Systolic BP',
#         'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
#         'Non Invasive Blood Pressure mean': 'Mean BP',
#         'O2 saturation pulseoxymetry': 'O2 Saturation'
#     }

#     df['event'] = df['event'].replace(rename_dict)
    
#     return df

# def get_labs_vitals(df):
#     df = df.copy()

#     event_list = [ #LAB EVENTS
#                   'Glucose (serum)', 'Glucose (whole blood)',
#                   'Potassium (serum)', 'Potassium (whole blood)', 
#                   'Sodium (serum)', 'Sodium (whole blood)',
#                   'Chloride (serum)', 'Chloride (whole blood)',
#                   'Creatinine (serum)', 'Creatinine (whole blood)',
#                   'BUN', #   'Urea Nitrogen', 
#                   'HCO3 (serum)', #   'Bicarbonate', 
#                   'Anion gap', 
#                   'Hemoglobin', 
#                   'Hematocrit (serum)', 'Hematocrit (whole blood - calc)',
#                   'Magnesium', 
#                   'Platelet Count', 
#                   'Alkaline Phosphate', 
#                   'WBC', #'White Blood Cells',
#                   'Calcium non-ionized', 'Ionized Calcium', #'Calcium, Total', 
#                 #   'MCH', 
#                 #   'Red Blood Cells', 
#                 #   'MCHC', 
#                 #   'MCV', 
#                 #   'RDW', 
#                   'Absolute Neutrophil Count', #  'Neutrophils', 
#                   'Vancomycin (Peak)', 'Vancomycin (Random)', 'Vancomycin (Trough)',
                  
#                   # NEW
#                   'PH (Arterial)', 'PH (dipstick)', 'PH (SOFT)', 'PH (Venous)',
#                   'Capillary Refill R', 'Capillary Refill L',
#                   'Temperature Celsius',
#                   'Daily Weight', 'Admission Weight (Kg)',
#                   'Inspired O2 Fraction',

#                   #CHART EVENTS
#                   'Heart Rate','Non Invasive Blood Pressure systolic',
#                     'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
#                     'Respiratory Rate','O2 saturation pulseoxymetry', 
#                     'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response'
#                     ]

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Glucose (serum)': 'Glucose',
#         'Glucose (whole blood)': 'Glucose',
#         'Potassium (serum)': 'Potassium',
#         'Potassium (whole blood)': 'Potassium',
#         'Sodium (serum)': 'Sodium',
#         'Sodium (whole blood)': 'Sodium',
#         'Chloride (serum)': 'Chloride',
#         'Chloride (whole blood)': 'Chloride',
#         'Creatinine (serum)': 'Creatinine',
#         'Creatinine (whole blood)': 'Creatinine',
#         'BUN': 'Urea Nitrogen',
#         'HCO3 (serum)': 'Bicarbonate',
#         'Hematocrit (serum)': 'Hematocrit',
#         'Hematocrit (whole blood - calc)': 'Hematocrit',
#         'Calcium non-ionized': 'Calcium',
#         'Ionized Calcium': 'Calcium',
#         'Vancomycin (Peak)': 'Vancomycin',
#         'Vancomycin (Random)': 'Vancomycin',
#         'Vancomycin (Trough)': 'Vancomycin',
#         'PH (Arterial)': 'PH',
#         'PH (dipstick)': 'PH',
#         'PH (SOFT)': 'PH',
#         'PH (Venous)': 'PH',
#         'Capillary Refill R': 'Capillary Refill',
#         'Capillary Refill L': 'Capillary Refill',
#         'Temperature Celsius': 'Temperature',
#         'Daily Weight': 'Weight',
#         'Admission Weight (Kg)': 'Weight',
#         'Inspired O2 Fraction': 'Inspired O2 Fraction',

#         'Non Invasive Blood Pressure systolic': 'Systolic BP',
#         'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
#         'Non Invasive Blood Pressure mean': 'Mean BP',
#         'O2 saturation pulseoxymetry': 'O2 Saturation'
#     }

#     df['event'] = df['event'].replace(rename_dict)

#     return df



# # procedureevents_df = get_procedures_of_interest(procedureevents_df)
# labevents_df = get_labs_of_interest(chartevents_df)
# vitals_df = get_vitals_of_interest(chartevents_df)
# # labevents_df = labevents_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# # vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# # procedureevents_df = procedureevents_df[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'storetime', 'value', 'event']]



# # labs_vitals_df = get_labs_vitals(chartevents_df)
# # labs_vitals_df = labs_vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]

In [9]:
from tqdm import tqdm

def calc_time_delta_hrs(icu_intime, charttime):
    return (charttime - icu_intime).total_seconds() / 3600



def add_time_delta(df):
    df = df.copy()

    if 'stay_id' in df.columns:
        stay_id_in_cols = True
    else:
        stay_id_in_cols = False
        df['stay_id'] = None
        
    df['icu_time_delta'] = None
    df['hosp_time_delta'] = None

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        if 'charttime' in row:
            ref_time = row['charttime']
        elif 'storetime' in row:
            ref_time = row['storetime']

        curr_admission = admissions_df[(admissions_df['subject_id'] == row['subject_id']) & (admissions_df['hadm_id'] == row['hadm_id'])]

        df.loc[index, 'hosp_time_delta'] = calc_time_delta_hrs(curr_admission['admittime'].iloc[0], ref_time)

        if stay_id_in_cols:
            curr_icu_stay = icustays_df[(icustays_df['subject_id'] == row['subject_id']) & (icustays_df['stay_id'] == row['stay_id'])]
            df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(curr_icu_stay['intime'].iloc[0], ref_time)
        else:
            curr_pts_icustays = icustays_df[icustays_df['subject_id'] == row['subject_id']]

            for icu_index, icu_row in curr_pts_icustays.iterrows():
                if icu_row['intime'] <= ref_time <= icu_row['outtime']:
                    df.loc[index, 'stay_id'] = icu_row['stay_id']
                    df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(icu_row['intime'], ref_time)
            

    df = df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'])
    return df


# procedureevents_df = add_time_delta(icustays_df, procedureevents_df)
labevents_df = add_time_delta(icustays_df, labevents_df)
vitals_df = add_time_delta(icustays_df, vitals_df)

# labs_df = add_time_delta(labs_df)
# labs_df = labs_df[['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta', 'charttime', 'storetime', 'event', 'valuenum']]
# labs_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)
# vitals_df = add_time_delta(vitals_df)
# vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta', 'charttime', 'storetime', 'event', 'valuenum']]
# vitals_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)

100%|██████████| 33094639/33094639 [12:17:31<00:00, 747.87it/s]  
100%|██████████| 36259441/36259441 [11:34:27<00:00, 870.21it/s]  


In [None]:
concat_df = pd.concat([labevents_df, vitals_df], axis=0, ignore_index=True)

In [None]:
def convert_events_table_to_ts_array(df):
    # Ensure 'valuenum' or 'value' columns exist
    value_column = 'valuenum' if 'valuenum' in df.columns else 'value'

    # Create a pivot table
    pivot_df = df.pivot_table(index=['hadm_id', 'hosp_time_delta'], 
                              columns='event', 
                              values=value_column, 
                              aggfunc='first').reset_index()

    # Join with the original DataFrame to get other required columns
    keys = ['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta']
    merged_df = pd.merge(df[keys].drop_duplicates(), pivot_df, on=['hadm_id', 'hosp_time_delta'])

    # Reorder the columns
    cols = merged_df.columns.tolist()
    cols = [col for col in keys if col in cols] + [col for col in cols if col not in keys]
    merged_df = merged_df[cols]

    # Sort the DataFrame
    merged_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)

    return merged_df

# procedureevents_ts_df = convert_events_table_to_ts_array(procedureevents_df)
labevents_ts_df = convert_events_table_to_ts_array(labevents_df)
vitals_ts_df = convert_events_table_to_ts_array(vitals_df)

concat_df = convert_events_table_to_ts_array(concat_df)

In [None]:
mm_dir = "/cis/home/charr165/Documents/multimodal"
output_dir = os.path.join(mm_dir, "preprocessing")

# procedureevents_ts_df.to_pickle(os.path.join(output_dir, "ts_procedureevents_icu.pkl"))
labevents_ts_df.to_pickle(os.path.join(output_dir, "ts_labs_icu.pkl"))
vitals_ts_df.to_pickle(os.path.join(output_dir, "ts_vitals_icu.pkl"))

concat_df.to_pickle(os.path.join(output_dir, "ts_labs_vitals.pkl"))