In [2]:
import pandas as pd
from tqdm import tqdm, trange

import sys, os
sys.path.append(os.path.abspath("../src/"))
from data_helper import get_days, load_config

config = load_config('../config/mimic_file.yaml')
# MIMIC_parent_path = "/data/padmalab_external/special_project/physionet.org/files/mimiciv/3.0/"
MIMIC_parent_path = config['project']['mimic_parent_math']
patents_df = pd.read_csv(MIMIC_parent_path+'hosp/patients.csv.gz', compression='gzip')
death_df = patents_df[~patents_df['dod'].isna()][['subject_id', 'dod']]
death_df['dod'] = pd.to_datetime(death_df['dod'])

ed_df = pd.read_csv(MIMIC_parent_path+'ed/edstays.csv.gz', compression='gzip')
ed_df['outtime'] = pd.to_datetime(ed_df['outtime'])
ed_df = ed_df.sort_values(by=['subject_id', 'outtime'])
latest_ed_times = ed_df.groupby('subject_id')['outtime'].max().reset_index()

# Hospital Latest date
admission_df = pd.read_csv(MIMIC_parent_path+'hosp/admissions.csv.gz', compression='gzip')
admission_df['dischtime'] = pd.to_datetime(admission_df['dischtime'])
admission_df = admission_df.sort_values(by=['subject_id', 'dischtime'])
latest_admission_times = admission_df.groupby('subject_id')['dischtime'].max().reset_index()

ecg_measurements_df = pd.read_csv('/data/padmalab_external/special_project/MIMIC-IV_ECG/physionet.org/files/mimic-iv-ecg/1.0/machine_measurements.csv')
ecg_measurements_df['ecg_time'] = pd.to_datetime(ecg_measurements_df['ecg_time'])
ecg_measurements_df = ecg_measurements_df.sort_values(by=['subject_id', 'ecg_time'])
latest_ecg_times = ecg_measurements_df.loc[ecg_measurements_df.groupby('subject_id')['ecg_time'].idxmax()]

# print the summary of the data
print ('# patients in ECG:\t', ecg_measurements_df['subject_id'].nunique())
print ('# patients in patents_df:\t', patents_df['subject_id'].nunique())
print ('# patients in admission_df:\t', admission_df['subject_id'].nunique())
print ('# patients in ed_df:\t', ed_df['subject_id'].nunique())

subject_ids = patents_df['subject_id'].unique().tolist()
censor_death_dict = {
    'subject_id': [],
    'censor_death_date': [],
    'death_event': []
}
with tqdm(total=len(subject_ids)) as pbar:
    for subject_id in subject_ids:
        censor_death_dict['subject_id'].append(subject_id)
        temp_patents_df = death_df[death_df['subject_id'] == subject_id]
        event = temp_patents_df.shape[0] > 0
        censor_death_date = pd.NaT
        if event:
            censor_death_date = temp_patents_df.iloc[0]['dod']
        else:
            temp_latest_ecg_times = latest_ecg_times[latest_ecg_times['subject_id'] == subject_id]
            if temp_latest_ecg_times.shape[0] > 0:
                dates = [d for d in [censor_death_date, temp_latest_ecg_times.iloc[0]['ecg_time']] if pd.notna(d)]
                censor_death_date = max(dates) if dates else pd.NaT

            temp_latest_ed_times = latest_ed_times[latest_ed_times['subject_id'] == subject_id]
            if temp_latest_ed_times.shape[0] > 0:
                dates = [d for d in [censor_death_date, temp_latest_ed_times.iloc[0]['outtime']] if pd.notna(d)]                
                censor_death_date = max(dates) if dates else pd.NaT

            temp_latest_admission_times = latest_admission_times[latest_admission_times['subject_id'] == subject_id]
            if temp_latest_admission_times.shape[0] > 0:
                dates = [d for d in [censor_death_date, temp_latest_admission_times.iloc[0]['dischtime']] if pd.notna(d)]                                
                censor_death_date = max(dates) if dates else pd.NaT
                
        censor_death_dict['death_event'].append(event)
        censor_death_dict['censor_death_date'].append(censor_death_date)
        pbar.update(1)
        
death_ISD_df = pd.DataFrame.from_dict(censor_death_dict)
death_ISD_df.to_csv('/data/padmalab_external/special_project/multi_event_data/MIMIC_IV_censor_death_date.csv')        

# patients in ECG:	 161352
# patients in patents_df:	 364627
# patients in admission_df:	 223452
# patients in ed_df:	 205504


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 364627/364627 [09:39<00:00, 629.61it/s]


In [17]:
# len(set(patents_df['subject_id'].unique()).difference(set(ecg_measurements_df['subject_id'].unique())))
# set(ecg_measurements_df['subject_id'].unique()).difference(set(patents_df['subject_id'].unique()))
# len(set(admission_df['subject_id'].unique()).difference(set(patents_df['subject_id'].unique())))
# len(set(ed_df['subject_id'].unique()).difference(set(patents_df['subject_id'].unique())))


0