Step 1 : Download the dataset from [https://physionet.org/content/mimic-iv-demo/2.2/](https://physionet.org/content/mimic-iv-demo/2.2/)  
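Step 2 : Move the ZIP file to the same folder in which you downloaded the Jupyter Notebook and extract it there, so that the `hosp/` and `icu/` folders sit next to the notebook (the notebook reads uncompressed `.csv` files from these two folders)  
Step 3 : Run all the cells in the Jupyter Notebook to generate the JSON files (one file per patient, written to the `patients_new/` folder)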

In [20]:
import sys
import pandas as pd
import numpy as np
import json
import os

In [21]:
## hosp modules
# Every table is read from a sub-folder of base_path; the folder prefixes are
# built once so each read below only has to name its file.

base_path = "./"
hosp_path = f'{base_path}hosp/'

admissions = pd.read_csv(hosp_path + 'admissions.csv')
d_hcpcs = pd.read_csv(hosp_path + 'd_hcpcs.csv')
d_icd_diagnoses = pd.read_csv(hosp_path + 'd_icd_diagnoses.csv')
d_icd_procedures = pd.read_csv(hosp_path + 'd_icd_procedures.csv')
d_labitems = pd.read_csv(hosp_path + 'd_labitems.csv')
diagnoses_icd = pd.read_csv(hosp_path + 'diagnoses_icd.csv')
drgcodes = pd.read_csv(hosp_path + 'drgcodes.csv')
# low_memory=False makes pandas read this file in a single pass instead of chunks
emar_detail = pd.read_csv(hosp_path + 'emar_detail.csv', low_memory=False)
emar = pd.read_csv(hosp_path + 'emar.csv')
hcpcsevents = pd.read_csv(hosp_path + 'hcpcsevents.csv')
labevents = pd.read_csv(hosp_path + 'labevents.csv')
microbiologyevents = pd.read_csv(hosp_path + 'microbiologyevents.csv')
omr = pd.read_csv(hosp_path + 'omr.csv')
patients = pd.read_csv(hosp_path + 'patients.csv')
pharmacy = pd.read_csv(hosp_path + 'pharmacy.csv')
poe_detail = pd.read_csv(hosp_path + 'poe_detail.csv')
poe = pd.read_csv(hosp_path + 'poe.csv')
prescriptions = pd.read_csv(hosp_path + 'prescriptions.csv')
procedures_icd = pd.read_csv(hosp_path + 'procedures_icd.csv')
provider = pd.read_csv(hosp_path + 'provider.csv')
services = pd.read_csv(hosp_path + 'services.csv')
transfers = pd.read_csv(hosp_path + 'transfers.csv')

## icu modules

icu_path = f'{base_path}icu/'

caregiver = pd.read_csv(icu_path + 'caregiver.csv')
chartevents = pd.read_csv(icu_path + 'chartevents.csv')
d_items = pd.read_csv(icu_path + 'd_items.csv')
datetimeevents = pd.read_csv(icu_path + 'datetimeevents.csv')
icustays = pd.read_csv(icu_path + 'icustays.csv')
ingredientevents = pd.read_csv(icu_path + 'ingredientevents.csv')
inputevents = pd.read_csv(icu_path + 'inputevents.csv')
outputevents = pd.read_csv(icu_path + 'outputevents.csv')
procedureevents = pd.read_csv(icu_path + 'procedureevents.csv')

In [22]:
# Normalise the admission timestamps to 'YYYY-MM-DD HH:MM:SS' strings.
# Values that cannot be parsed are coerced to missing instead of raising.
date_time_columns = ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime']
for col in date_time_columns:
    if col not in admissions.columns:
        continue
    admissions[col] = (
        pd.to_datetime(admissions[col], errors='coerce')
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )

# Folder that receives one JSON file per subject (created if it does not exist yet)
output_dir = 'patients_new'
os.makedirs(output_dir, exist_ok=True)

# Function for creating JSON
def save_subject_json(subject_data, subject_id):
    """Write one subject's rows to '<output_dir>/<subject_id>.json'.

    The rows are serialised as a JSON array with one object per row
    (orient='records'). The target folder is taken from the module-level
    ``output_dir`` variable.
    """
    target_path = f'{output_dir}/{subject_id}.json'
    payload = subject_data.to_json(orient='records', date_format='iso')

    with open(target_path, 'w') as handle:
        handle.write(payload)

# Write one JSON file per subject, containing all admissions of that subject
unique_subject_ids = admissions['subject_id'].dropna().unique()
for subject_id in unique_subject_ids:
    # Keep only the admission rows that belong to this subject
    subject_data = admissions.loc[admissions['subject_id'].eq(subject_id)]
    save_subject_json(subject_data, subject_id)

print(f'JSON files created in {output_dir}')


JSON files created in patients_new


In [23]:
# Directory holding the per-subject JSON files; the cells below list this
# directory and rewrite the files in place as more tables are attached.
jfiles_dir = 'patients_new/'

In [24]:
# Function to update the json files according to subject_id
def update_json_sub(events_df, dir, event_key):
    """Attach subject-level rows to every admission of each subject's JSON file.

    Args:
    - events_df (pd.DataFrame): Table with a 'subject_id' column. All rows of a
      subject are attached as a list of records (one dict per row).
    - dir (str): Directory containing the '<subject_id>.json' files; each file
      holds a list of admission dicts. (The name shadows the ``dir`` builtin
      but is kept so existing keyword calls continue to work.)
    - event_key (str): Key under which the records are stored on each admission.

    Admissions whose subject has no rows in ``events_df`` receive a placeholder
    dict with the same columns and every value set to None, so all files share
    the same keys. Files are rewritten in place and a summary is printed.
    Returns None.
    """
    # Iterate over the groups instead of using ``groupby.apply``: the group
    # frames always include 'subject_id', whereas ``apply`` is deprecated for
    # operating on the grouping column and drops it in newer pandas releases.
    events_by_subject = {
        subject: rows.to_dict(orient='records')
        for subject, rows in events_df.groupby('subject_id')
    }

    # Placeholder for subjects without rows; empty when the table has no groups.
    events_structure = dict.fromkeys(events_df.columns) if events_by_subject else {}

    l_updated = []
    l_failed = []

    # Read files in the directory sequentially
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".json"):
            continue

        file_subject_id = int(filename.split('.')[0])  # the filename is <subject_id>.json
        jfile_path = os.path.join(dir, filename)

        with open(jfile_path, 'r') as file:
            patient_data = json.load(file)

        updated = False
        for admission in patient_data:
            # Prefer the id stored on the admission; fall back to the filename
            # so a record without 'subject_id' no longer raises KeyError.
            subject_id = admission.get('subject_id', file_subject_id)
            if subject_id in events_by_subject:
                admission[event_key] = events_by_subject[subject_id]
            else:
                # Each admission gets its own copy of the placeholder
                admission[event_key] = dict(events_structure)
            updated = True

        # A file is only left untouched when it contains no admissions at all
        if updated:
            with open(jfile_path, 'w') as file:
                json.dump(patient_data, file, indent=0)
            l_updated.append(file_subject_id)
        else:
            l_failed.append(file_subject_id)

    print(f'[{event_key}]: Updated data for {len(l_updated)} subjects')
    print(f'[{event_key}]: No data updated for {len(l_failed)} subjects')
    print(f"[{event_key}]: Failed for {l_failed}")
    print("")

In [25]:
# Attach each subject's rows from the patients table to every admission in
# that subject's JSON file, under the 'patients' key.
update_json_sub(patients, jfiles_dir, 'patients')

# Normalise the omr chart dates to 'YYYY-MM-DD' strings (unparseable values are
# coerced to missing) before attaching the table under the 'omr' key.
omr['chartdate'] = pd.to_datetime(omr['chartdate'], errors='coerce').dt.strftime('%Y-%m-%d')
update_json_sub(omr, jfiles_dir, 'omr')

[patients]: Updated data for 100 subjects
[patients]: No data updated for 0 subjects
[patients]: Failed for []

[omr]: Updated data for 100 subjects
[omr]: No data updated for 0 subjects
[omr]: Failed for []



In [26]:
# Pre-process emar and emar_detail into a single frame that can be attached.
# The detail-level pharmacy_id is renamed so the detail records expose it as
# 'pharmacy_id_detail'.
emar_detail = emar_detail.rename(columns={'pharmacy_id': 'pharmacy_id_detail'})

# Normalise the three emar timestamps to 'YYYY-MM-DD HH:MM:SS' strings
for emar_time_col in ('charttime', 'scheduletime', 'storetime'):
    emar[emar_time_col] = (
        pd.to_datetime(emar[emar_time_col], errors='coerce')
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )

# One list of detail records per emar_id
grouped_emar_detail = emar_detail.groupby(['emar_id']).apply(
    lambda detail_rows: detail_rows.to_dict(orient='records')
)
emar_pd = emar

# A Series indexed by emar_id has to become a frame with an 'emar_id' column
# before it can be merged.
# NOTE(review): after reset_index the column holding the record lists appears
# to be labelled 0 rather than given a descriptive name - confirm downstream use.
if not isinstance(grouped_emar_detail, pd.DataFrame):
    grouped_emar_detail = grouped_emar_detail.reset_index()

# Default (inner) merge: only emar rows that have detail records are kept
emar_merged = pd.merge(emar_pd, grouped_emar_detail, on='emar_id')

In [27]:
# Attach the merged emar records to each admission, keyed by (subject_id, hadm_id)
grouped_emar = emar_merged.groupby(['subject_id', 'hadm_id']).apply(
    lambda emar_rows: emar_rows.to_dict(orient='records')
)

# Placeholder for admissions without emar rows: the keys of the first
# non-empty record list, every value None
if grouped_emar.empty:
    emar_structure = {}
else:
    for record in grouped_emar:
        if record:
            emar_structure = dict.fromkeys(record[0].keys())
            break

l_updated = []
l_failed = []

for filename in os.listdir(jfiles_dir):
    if not filename.endswith(".json"):
        continue

    subject_id = int(filename.split('.')[0])
    jfile_path = os.path.join(jfiles_dir, filename)

    with open(jfile_path, 'r') as file:
        patient_data = json.load(file)

    updated = False
    for admission in patient_data:
        hadm_id = admission.get('hadm_id')
        if (subject_id, hadm_id) not in grouped_emar.index:
            # No emar rows for this admission: keep the keys consistent
            admission['emar'] = emar_structure
        else:
            emar_data = grouped_emar.loc[(subject_id, hadm_id)]
            # Normalise whatever was looked up into a list of records
            if isinstance(emar_data, str):
                emar_data = json.loads(emar_data)
            elif not isinstance(emar_data, list):
                emar_data = [emar_data]
            admission['emar'] = emar_data
        updated = True

    # Files without any admission are reported as failed and left untouched
    if not updated:
        l_failed.append(subject_id)
        continue

    with open(jfile_path, 'w') as file:
        json.dump(patient_data, file, indent=0)
    l_updated.append(subject_id)

print(f'[emar]: Updated data for {len(l_updated)} subjects')
print(f'[emar]: No data updated for {len(l_failed)} subjects')
print(f"[emar]: Failed for {l_failed}")
print("")

[emar]: Updated data for 100 subjects
[emar]: No data updated for 0 subjects
[emar]: Failed for []



In [28]:
def update_all_json_files_once(events_dict, dir):
    """
    Update JSON files for all events specified in the events_dict in a single read and write operation per file.

    Each table is grouped by (subject_id, hadm_id) and the matching rows are stored on the
    admission under the table's event key as a list of records. Admissions without rows for a
    table receive an empty list under that key (an existing value is left as it is).

    Neither ``events_dict`` nor the DataFrames it contains are modified, so the same dictionary
    can be passed again.

    Args:
    - events_dict (dict): A dictionary where keys are event keys and values are the corresponding DataFrames.
      Every DataFrame needs 'subject_id' and 'hadm_id' columns.
    - dir (str): Directory where the JSON files are located.

    A file is counted as updated when at least one admission received event rows; files that
    only received empty placeholders are still written, but are reported in the failure list.
    Returns None.
    """
    datetime_fields = ['chartdate', 'charttime', 'storetime', 'entertime', 'starttime', 'stoptime', 'ordertime', 'transfertime', 'intime', 'outtime']
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Preprocess all DataFrames into {event_key: {(subject_id, hadm_id): [record, ...]}}.
    # The result lives in a separate dict so the caller's events_dict keeps its DataFrames.
    grouped_by_event = {}
    for event_key, events_df in events_dict.items():
        converted = {
            field: pd.to_datetime(events_df[field], errors='coerce').dt.strftime(timestamp_format)
            for field in datetime_fields
            if field in events_df.columns
        }
        # assign() returns a new frame, leaving the caller's DataFrame untouched
        prepared = events_df.assign(**converted)
        prepared = prepared.where(pd.notnull(prepared), None)

        # Iterating the groups keeps the grouping columns inside every record, which the
        # deprecated ``groupby.apply`` behaviour does not guarantee on newer pandas.
        # Rows with a missing subject_id or hadm_id belong to no group and are skipped.
        grouped_by_event[event_key] = {
            key: rows.to_dict(orient='records')
            for key, rows in prepared.groupby(['subject_id', 'hadm_id'])
        }

    l_updated = []
    l_failed = []

    # Read and update JSON files
    for filename in os.listdir(dir):
        if not filename.endswith(".json"):
            continue

        subject_id = int(filename.split('.')[0])
        jfile_path = os.path.join(dir, filename)
        with open(jfile_path, 'r') as file:
            patient_data = json.load(file)

        updated = False  # real event rows were attached
        changed = False  # anything was modified, placeholders included
        for admission in patient_data:
            hadm_id = admission.get('hadm_id')
            for event_key, grouped_events in grouped_by_event.items():
                records = grouped_events.get((subject_id, hadm_id))
                if records is not None:
                    admission[event_key] = records
                    updated = changed = True
                elif event_key not in admission:
                    # Initialize the key so every admission carries all event keys
                    admission[event_key] = []
                    changed = True

        # Write back whenever the content changed, so placeholders are persisted too
        if changed:
            with open(jfile_path, 'w') as file:
                json.dump(patient_data, file, indent=4)

        if updated:
            l_updated.append(subject_id)
        else:
            l_failed.append(subject_id)

    print(f'Updated data for {len(l_updated)} subjects.')
    if l_failed:
        print(f"Update failed for {len(l_failed)} subjects: {l_failed}.")
    print("")
    print("Update complete for all JSON files.")

In [29]:
# Tables that are attached per admission: the function groups each of them by
# (subject_id, hadm_id) and stores the rows under the dictionary key.
# 'icustays' is attached here as well; the ICU event cell further down nests
# its data inside admission['icustays'].
events_dict = {
    'diagnoses_icd': diagnoses_icd,
    'drgcodes': drgcodes,
    'hcpcsevents': hcpcsevents,
    'labevents': labevents,
    'microbiologyevents': microbiologyevents,
    'pharmacy': pharmacy,
    'poe': poe,
    'prescriptions': prescriptions,
    'procedures_icd': procedures_icd,
    'services': services,
    'transfers': transfers,
    'icustays': icustays
}

# Call the function with the prepared dictionary
update_all_json_files_once(events_dict, jfiles_dir)

Updated data for 100 subjects.

Update complete for all JSON files.


In [30]:
def update_jsons_for_all_events_once(events_dict, dir, datetime_value_keys=('datetimeevents',)):
    """
    Update JSON files with event data from multiple DataFrames, handling each file once.
    Groups data by subject_id, hadm_id, and stay_id before updating.

    The rows of each table are stored on the matching entry of admission['icustays'] under the
    table's event key. ICU stays without rows for a table receive an empty list under that key
    (an existing value is left as it is). Admissions without an 'icustays' key are skipped.

    Neither ``events_dict`` nor the DataFrames it contains are modified.

    Args:
    - events_dict (dict): Dictionary mapping event keys to their respective DataFrames.
      Every DataFrame needs 'subject_id', 'hadm_id' and 'stay_id' columns.
    - dir (str): Directory containing the JSON files to be updated.
    - datetime_value_keys (iterable of str): Event keys whose 'value' column holds timestamps and
      is therefore normalised like the other datetime columns. For every other table 'value' is
      left untouched: parsing numeric or textual measurements as datetimes would replace them
      with epoch dates or missing values. Defaults to ('datetimeevents',), which is presumably
      the only MIMIC-IV ICU table that stores a date in 'value' - verify against the MIMIC-IV docs.

    A file is counted as updated when at least one ICU stay received event rows; files that only
    received empty placeholders are still written, but are reported in the failure list.
    Returns None.
    """
    base_datetime_fields = ['charttime', 'storetime', 'starttime', 'endtime']
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    datetime_value_keys = set(datetime_value_keys)

    # Process each DataFrame to group data by subject_id, hadm_id, and stay_id
    grouped_events_dict = {}
    for event_key, events_df in events_dict.items():
        datetime_fields = list(base_datetime_fields)
        if event_key in datetime_value_keys:
            datetime_fields.append('value')

        # Convert datetime columns to string format on a new frame (assign() does not
        # modify the caller's DataFrame)
        converted = {
            field: pd.to_datetime(events_df[field], errors='coerce').dt.strftime(timestamp_format)
            for field in datetime_fields
            if field in events_df.columns
        }
        prepared = events_df.assign(**converted)
        prepared = prepared.where(pd.notnull(prepared), None)

        # Iterating the groups keeps the grouping columns inside every record, which the
        # deprecated ``groupby.apply`` behaviour does not guarantee on newer pandas.
        grouped_events_dict[event_key] = {
            key: rows.to_dict(orient='records')
            for key, rows in prepared.groupby(['subject_id', 'hadm_id', 'stay_id'])
        }

    l_updated = []
    l_failed = []
    # Iterate over JSON files once, updating each with all relevant event data
    for filename in os.listdir(dir):
        if not filename.endswith(".json"):
            continue

        subject_id = int(filename.split('.')[0])
        jfile_path = os.path.join(dir, filename)

        with open(jfile_path, 'r') as file:
            patient_data = json.load(file)

        updated = False  # real event rows were attached
        changed = False  # anything was modified, placeholders included
        for admission in patient_data:
            if 'icustays' not in admission:
                continue
            hadm_id = admission.get('hadm_id')
            for icustay in admission['icustays']:
                key = (subject_id, hadm_id, icustay.get('stay_id'))
                for event_key, grouped_events in grouped_events_dict.items():
                    records = grouped_events.get(key)
                    if records is not None:
                        icustay[event_key] = records
                        updated = changed = True
                    elif event_key not in icustay:
                        # Initialize empty event data structure if the key does not exist
                        icustay[event_key] = []
                        changed = True

        # Write back whenever the content changed, so placeholders are persisted too
        if changed:
            with open(jfile_path, 'w') as file:
                json.dump(patient_data, file, indent=4)

        if updated:
            l_updated.append(subject_id)
        else:
            l_failed.append(subject_id)

    print(f'Updated data for {len(l_updated)} subjects.')
    if l_failed:
        print(f"Update failed for {len(l_failed)} subjects: {l_failed}.")
    print("")

In [31]:
# ICU tables that are attached per ICU stay: the function groups each of them
# by (subject_id, hadm_id, stay_id) and stores the rows on the matching entry
# of admission['icustays'] under the dictionary key.
events_dict = {
    'chartevents': chartevents,
    'datetimeevents': datetimeevents,
    'inputevents': inputevents,
    'outputevents': outputevents,
    'procedureevents': procedureevents,
    'ingredientevents': ingredientevents,
}

update_jsons_for_all_events_once(events_dict, jfiles_dir)

Updated data for 100 subjects.



In [32]:
# Replace the non-standard JSON token NaN with an empty string in every file.
# The files are parsed instead of patched as plain text, so the characters
# "NaN" inside a string value are left alone; only real NaN values change.
files = os.listdir(jfiles_dir)

for file in files:
    # Skip anything that is not a patient file (e.g. checkpoint folders)
    if not file.endswith(".json"):
        continue

    path = os.path.join(jfiles_dir, file)

    with open(path, "r") as f:
        # parse_constant is only called for the bare tokens NaN, Infinity and
        # -Infinity; the two infinities keep their float value.
        data = json.load(
            f,
            parse_constant=lambda token: "" if token == "NaN" else float(token),
        )

    # Re-serialised with the same indent the event update functions use
    with open(path, "w") as f:
        json.dump(data, f, indent=4)