In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from tqdm import tqdm

In [3]:
mimic_iv_notes_parent = "/cis/home/charr165/Documents/physionet.org/files/mimic-iv-note/2.2/note"
mimic_iv_path = "/cis/home/charr165/Documents/physionet.org/mimiciv/2.2"

rad_notes_f_path = os.path.join(mimic_iv_notes_parent, "radiology.csv")
rad_notes_df = pd.read_csv(rad_notes_f_path, low_memory=False)
rad_notes_df['charttime'] = pd.to_datetime(rad_notes_df['charttime'])
rad_notes_df['storetime'] = pd.to_datetime(rad_notes_df['storetime'])

icustays_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "icustays.csv"), low_memory=False)
icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])
icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])

admissions_df = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "admissions.csv"), low_memory=False)
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])

In [6]:
rad_notes_df['hadm_id'] = None
rad_notes_df['stay_id'] = None
rad_notes_df['icu_time_delta'] = None
rad_notes_df['hosp_time_delta'] = None

def calc_time_delta_hrs(icu_intime, charttime):
    return (charttime - icu_intime).total_seconds() / 3600

for index, row in tqdm(rad_notes_df.iterrows(), total=rad_notes_df.shape[0]):
    curr_pts_icustays = icustays_df[icustays_df['subject_id'] == row['subject_id']]
    
    for icu_index, icu_row in curr_pts_icustays.iterrows():
        if icu_row['intime'] <= row['charttime'] <= icu_row['outtime']:
            rad_notes_df.loc[index, 'stay_id'] = icu_row['stay_id']
            rad_notes_df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(icu_row['intime'], row['charttime'])
    
    curr_pts_admissions = admissions_df[admissions_df['subject_id'] == row['subject_id']]

    for hosp_index, hosp_row in curr_pts_admissions.iterrows():
        if hosp_row['admittime'] <= row['charttime'] <= hosp_row['dischtime']:
            rad_notes_df.loc[index, 'hadm_id'] = hosp_row['hadm_id']
            rad_notes_df.loc[index, 'hosp_time_delta'] = calc_time_delta_hrs(hosp_row['admittime'], row['charttime'])

  0%|          | 0/2321355 [00:00<?, ?it/s]

100%|██████████| 2321355/2321355 [42:41<00:00, 906.40it/s] 


In [7]:
mm_dir = "/cis/home/charr165/Documents/multimodal"
output_dir = os.path.join(mm_dir, "preprocessing")

rad_notes_df.to_pickle(os.path.join(output_dir, "notes_text.pkl"))
print(rad_notes_df.shape)

(2321355, 11)


In [9]:
rad_notes_df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,stay_id,icu_time_delta,hosp_time_delta
0,10000032-RR-14,10000032,,RR,14,2180-05-06 21:19:00,2180-05-06 23:32:00,EXAMINATION: CHEST (PA AND LAT)\n\nINDICATION...,,,
1,10000032-RR-15,10000032,22595853,RR,15,2180-05-06 23:00:00,2180-05-06 23:26:00,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...,,,0.616667
2,10000032-RR-16,10000032,22595853,RR,16,2180-05-07 09:55:00,2180-05-07 11:15:00,"INDICATION: ___ HCV cirrhosis c/b ascites, hi...",,,11.533333
3,10000032-RR-18,10000032,,RR,18,2180-06-03 12:46:00,2180-06-03 14:01:00,EXAMINATION: Ultrasound-guided paracentesis.\...,,,
4,10000032-RR-20,10000032,,RR,20,2180-07-08 13:18:00,2180-07-08 14:15:00,EXAMINATION: Paracentesis\n\nINDICATION: ___...,,,
...,...,...,...,...,...,...,...,...,...,...,...
2321350,19999987-RR-17,19999987,23865745,RR,17,2145-11-02 22:37:00,2145-11-03 18:55:00,"HISTORY: ___, with left occipital bleeding. ...",,,0.983333
2321351,19999987-RR-18,19999987,23865745,RR,18,2145-11-03 04:35:00,2145-11-03 10:46:00,INDICATION: ___ female intubated for head ble...,36195440,5.6,6.95
2321352,19999987-RR-19,19999987,23865745,RR,19,2145-11-03 16:40:00,2145-11-04 08:36:00,HISTORY: ___ woman with left occipital hemorr...,36195440,17.683333,19.033333
2321353,19999987-RR-20,19999987,23865745,RR,20,2145-11-04 05:10:00,2145-11-04 08:58:00,PORTABLE CHEST OF ___\n\nCOMPARISON: ___ radi...,36195440,30.183333,31.533333
