In [1]:

# Import Libraries
import datetime as dt
import json
import os


import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

import src.data_processing.MIMIC.test_functions as tests

# LOAD CONFIGURATION
# Read the default processing configuration from JSON. The `with` block closes the
# file on exit, so no explicit `close()` is needed.
with open("src/data_processing/MIMIC/MIMIC_PROCESSING_DEFAULT_VARS.json", "r") as f:
    DEFAULT_CONFIG = json.load(f)

# Make sure the save folder exists. `exist_ok=True` replaces the separate
# exists-then-create check, which could race with another process creating the folder.
os.makedirs(DEFAULT_CONFIG["SAVE_FD"], exist_ok=True)

In [5]:
# Fail fast when the intermediate admissions/vitals CSVs loaded later in this notebook
# are missing. The check uses an explicit `raise` instead of `assert` statements so it
# still runs under `python -O`; AssertionError is kept so the failure type is unchanged.
required_files = [
    DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv",
    DEFAULT_CONFIG["SAVE_FD"] + "vitals_intermediate.csv",
]
missing_files = [path for path in required_files if not os.path.exists(path)]
if missing_files:
    raise AssertionError(
        "Missing intermediate file(s): " + ", ".join(missing_files)
    )

In [7]:
# Announce the processing stage
print("\n\n ======== PROCESSING OUTCOMES ======== \n\n")

# Read back the intermediate admissions table, parsing its timestamp columns
adm_proc = pd.read_csv(
    DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv",
    header=0,
    index_col=0,
    parse_dates=["intime", "outtime", "intime_next", "outtime_next", "deathtime"],
)

# Read back the intermediate vitals table and turn the saved index into a regular column
vit_proc = pd.read_csv(
    DEFAULT_CONFIG["SAVE_FD"] + "vitals_intermediate.csv",
    header=0,
    index_col=0,
    parse_dates=DEFAULT_CONFIG["VITALS_TIME_VARS"],
).reset_index(drop=False)

# pandas does not parse timedelta columns on load, so convert this one explicitly
vit_proc = vit_proc.assign(
    sampled_time_to_end=pd.to_timedelta(vit_proc["sampled_time_to_end"])
)

# Sanity-check both tables before building anything on top of them
tests.test_admissions_processed_correctly(adm_proc)
tests.test_vitals_processed_correctly(vit_proc, config_dic=DEFAULT_CONFIG)






Testing admissions processed correctly...

Testing outtime is after intime.
Test passed!

Testing next transfer information is consistent.
Test passed!

Testing admission times are before death (if exists).
Test passed!

Testing ids are unique for params ('subject_id', 'hadm_id', 'stay_id', 'transfer_id_next')
Test passed for variable  subject_id!
Test passed for variable  hadm_id!
Test passed for variable  stay_id!
Test passed for variable  transfer_id_next!

Testing ids are complete for params ('subject_id', 'stay_id', 'intime', 'outtime')
Test passed for variable subject_id!
Test passed for variable stay_id!
Test passed for variable intime!
Test passed for variable outtime!
Test passed!
Admissions correctly computed! Safe to go ahead.

Testing vitals were processed correctly and make sense.

Testing ids are complete for params ('subject_id', 'stay_id', 'sampled_time_to_end')
Test passed for variable subject_id!
Test passed for variable stay_id!
Test passed for variable sampled_

100%|████████████████████████████| 8364/8364 [00:25<00:00, 332.38it/s]


Test passed!

Testing resampling data is linear from min to max per patient.


100%|███████████████████████████| 8364/8364 [00:03<00:00, 2695.11it/s]

Test passed!
Vitals seem correctly processed!





In [8]:
# Read the raw core tables (transfers and admissions) from the data folder
core_fd = DEFAULT_CONFIG["DATA_FD"] + "core/"

transfers_core = pd.read_csv(
    core_fd + "transfers.csv",
    header=0,
    index_col=None,
    parse_dates=["intime", "outtime"],
)

admissions_core = pd.read_csv(
    core_fd + "admissions.csv",
    header=0,
    index_col=None,
    parse_dates=["admittime", "dischtime", "deathtime", "edregtime", "edouttime"],
)



In [141]:

"""
Step 1: Subset the set of transfers/admissions_core to the already processed cohort.

We do this by merging. 
"""

def _shared_non_death_columns(left, right):
    """Return the columns of `left` that also exist in `right`, in `left`'s order.

    Columns whose name contains "death" are excluded from the result.
    """
    right_cols = set(right.columns)
    return [col for col in left.columns if col in right_cols and "death" not in col]


def _subset_to_cohort(core_df, cohort_df, on, sort_by):
    """Inner-merge `core_df` with `cohort_df` and tidy up the result.

    Params:
    - core_df: table to subset (left side of the merge).
    - cohort_df: cohort rows to keep (right side of the merge).
    - on: list of column names used as merge keys.
    - sort_by: list of column names used to sort the result (ascending).

    Returns:
    - The merged table with rows lacking an `hadm_id` removed, sorted by `sort_by`.
    """
    return (
        core_df
        .merge(cohort_df, how="inner", on=on)
        .dropna(subset=["hadm_id"])      # rows with no hadm_id can't be compared across tables
        .sort_values(by=sort_by, ascending=True)
    )


# Define ids for merging. We leave out deathtime as one database registers only the date,
# while the other registers everything (i.e. up to the second).
tr_merge_ids = _shared_non_death_columns(vit_proc, transfers_core)
hadm_merge_ids = _shared_non_death_columns(vit_proc, admissions_core)
merge_ids = ["subject_id", "hadm_id", "stay_id"]         # Useful simplification

# One row per (subject_id, hadm_id, stay_id) is enough for merging, so deduplicate once
# and reuse the result for both merges.
cohort_ids = vit_proc.drop_duplicates(subset=merge_ids)

# Inner merge for transfers core
transfers_S1 = _subset_to_cohort(transfers_core, cohort_ids, on=tr_merge_ids, sort_by=merge_ids)

# Inner merge for admissions core
admissions_S1 = _subset_to_cohort(admissions_core, cohort_ids, on=hadm_merge_ids, sort_by=merge_ids)

# Check processing and correctness.
# `merge_ids` already contains "stay_id", so it is not passed a second time.
tests.test_ids_subset_of_cohort(transfers_S1, vit_proc, *merge_ids)
tests.test_ids_subset_of_cohort(admissions_S1, vit_proc, *merge_ids)
tests.test_is_complete_ids(transfers_S1, *merge_ids)
tests.test_is_complete_ids(admissions_S1, *merge_ids)

# Save the subset transfers table.
# NOTE(review): admissions_S1 is not written to disk here - confirm this is intentional.
transfers_S1.to_csv(DEFAULT_CONFIG["SAVE_FD"] + "transfers_S1.csv", header=True, index=True)



Testing ('subject_id', 'hadm_id') are subset of cohort data.
Test passed!

Testing ('subject_id', 'hadm_id') are subset of cohort data.
Test passed!

Testing ids are complete for params ('subject_id', 'hadm_id', 'stay_id')
Test passed for variable subject_id!
Test passed for variable hadm_id!
Test passed for variable stay_id!

Testing ids are complete for params ('subject_id', 'hadm_id', 'stay_id')
Test passed for variable subject_id!
Test passed for variable hadm_id!
Test passed for variable stay_id!


In [142]:
# Display the cohort-restricted transfers table (pandas truncates the rendered rows/columns).
transfers_S1

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime,index,sampled_time_to_end,TEMP,...,transfer_id_next,eventtype_next,careunit_next,intime_next,outtime_next,gender,age,ESI,deathtime,charttime_ub
12469,10000084,23052089.0,35203156,ED,Emergency Department,2160-11-20 20:36:00,2160-11-21 03:20:00,29105,0 days 00:00:00,98.00,...,,,,NaT,NaT,1,72,2.0,NaT,2160-11-21 03:20:00
12470,10000084,23052089.0,35203156,ED,Emergency Department,2160-11-20 20:36:00,2160-11-21 03:20:00,29106,0 days 01:00:00,,...,,,,NaT,NaT,1,72,2.0,NaT,2160-11-21 02:20:00
12471,10000084,23052089.0,35203156,ED,Emergency Department,2160-11-20 20:36:00,2160-11-21 03:20:00,29107,0 days 02:00:00,98.00,...,,,,NaT,NaT,1,72,2.0,NaT,2160-11-21 01:20:00
12472,10000084,23052089.0,35203156,ED,Emergency Department,2160-11-20 20:36:00,2160-11-21 03:20:00,29108,0 days 03:00:00,,...,,,,NaT,NaT,1,72,2.0,NaT,2160-11-21 00:20:00
12473,10000084,23052089.0,35203156,ED,Emergency Department,2160-11-20 20:36:00,2160-11-21 03:20:00,29109,0 days 04:00:00,,...,,,,NaT,NaT,1,72,2.0,NaT,2160-11-20 23:20:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25257,19999204,29046609.0,32654415,ED,Emergency Department,2146-05-30 14:10:00,2146-05-30 20:09:00,14938,0 days 01:00:00,98.10,...,36277445.0,admit,Med/Surg,2146-05-30 20:09:00,2146-06-08 20:27:30,1,61,3.0,NaT,2146-05-30 19:09:00
25258,19999204,29046609.0,32654415,ED,Emergency Department,2146-05-30 14:10:00,2146-05-30 20:09:00,14939,0 days 02:00:00,,...,36277445.0,admit,Med/Surg,2146-05-30 20:09:00,2146-06-08 20:27:30,1,61,3.0,NaT,2146-05-30 18:09:00
25259,19999204,29046609.0,32654415,ED,Emergency Department,2146-05-30 14:10:00,2146-05-30 20:09:00,14940,0 days 03:00:00,98.10,...,36277445.0,admit,Med/Surg,2146-05-30 20:09:00,2146-06-08 20:27:30,1,61,3.0,NaT,2146-05-30 17:09:00
25260,19999204,29046609.0,32654415,ED,Emergency Department,2146-05-30 14:10:00,2146-05-30 20:09:00,14941,0 days 04:00:00,,...,36277445.0,admit,Med/Surg,2146-05-30 20:09:00,2146-06-08 20:27:30,1,61,3.0,NaT,2146-05-30 16:09:00
