In [1]:

# Import Libraries
import datetime as dt
import json
import os

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import src.data_processing.MIMIC.test_functions as tests

# LOAD CONFIGURATION
# Read the default processing parameters from the project's JSON config file.
# The `with` block closes the file handle on exit, so no explicit close() is needed.
with open("src/data_processing/MIMIC/MIMIC_PROCESSING_DEFAULT_VARS.json", "r") as f:
    DEFAULT_CONFIG = json.load(f)

# Create the save folder if it is missing. exist_ok=True keeps the cell idempotent
# on re-run and avoids the race between checking for the folder and creating it.
os.makedirs(DEFAULT_CONFIG["SAVE_FD"], exist_ok=True)

In [2]:
# The outcome processing below reads both intermediate files from the save folder,
# so stop here (with an AssertionError naming the missing files) if any is absent.
required_files = ["admissions_intermediate.csv", "vitals_intermediate.csv"]
missing_files = [
    file_name
    for file_name in required_files
    if not os.path.exists(DEFAULT_CONFIG["SAVE_FD"] + file_name)
]
assert not missing_files, (
    f"Missing intermediate files in {DEFAULT_CONFIG['SAVE_FD']}: {missing_files}"
)

In [3]:
# Re-execute the test helper module so that edits made to it since the first
# import are picked up without restarting the kernel.
from importlib import reload
reload(tests)

<module 'src.data_processing.MIMIC.test_functions' from '/home/hq-boss31/Projects/Github/VarPhenClustering/src/data_processing/MIMIC/test_functions.py'>

In [5]:
# Announce the start of the outcome-processing stage
print("\n\n ======== PROCESSING OUTCOMES ======== \n\n")

# Locations of the previously processed (intermediate) files
adm_csv_path = DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv"
vit_csv_path = DEFAULT_CONFIG["SAVE_FD"] + "vitals_intermediate.csv"

# Admissions: first column is the index; the listed columns are parsed as datetimes
adm_time_cols = ["intime", "outtime", "intime_next", "outtime_next", "deathtime"]
adm_proc = pd.read_csv(adm_csv_path, index_col=0, header=0, parse_dates=adm_time_cols)

# Vitals: same layout, with the datetime columns taken from the configuration.
# The index read from the file is moved back into a regular column.
vit_proc = pd.read_csv(
    vit_csv_path,
    index_col=0,
    header=0,
    parse_dates=DEFAULT_CONFIG["VITALS_TIME_VARS"],
).reset_index(drop=False)

# pandas does not parse timedelta columns on load, so convert this one explicitly
vit_proc = vit_proc.assign(
    sampled_time_to_end=pd.to_timedelta(vit_proc["sampled_time_to_end"])
)

# Sanity-check the loaded admissions and vitals before going any further
tests.test_admissions_processed_correctly(adm_proc)
tests.test_vitals_processed_correctly(vit_proc, config_dic=DEFAULT_CONFIG)






Testing admissions processed correctly...

Testing outtime is after intime.
Test passed!

Testing next transfer information is consistent.
Test passed!

Testing admission times are before death (if exists).
Test passed!

Testing ids are unique for params ('subject_id', 'hadm_id', 'stay_id', 'transfer_id_next')
Test passed!

Testing ids are complete for params ('subject_id', 'stay_id', 'intime', 'outtime')
Test passed!
Admissions correctly computed! Safe to go ahead.

Testing vitals were processed correctly and make sense.

Testing ids are complete for params ('subject_id', 'stay_id', 'sampled_time_to_end')

Testing next transfer information is consistent.
Test passed!

Testing stays have sufficient data based on info dic parameters.
Test passed!

Testing last observation is AT MOST td_window before admission outtime.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8364/8364 [00:27<00:00, 309.21it/s]


Test passed!

Testing resampling data is linear from min to max per patient.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8364/8364 [00:03<00:00, 2296.00it/s]

Test passed!
Vitals seem correctly processed!





In [6]:
# Load the core transfers and admissions tables from the data folder
transfers_csv_path = DEFAULT_CONFIG["DATA_FD"] + "core/transfers.csv"
admissions_csv_path = DEFAULT_CONFIG["DATA_FD"] + "core/admissions.csv"

# Columns to be parsed as datetimes in each table
transfers_time_cols = ["intime", "outtime"]
admissions_time_cols = ["admittime", "dischtime", "deathtime", "edregtime", "edouttime"]

# No index column is read from either file (index_col=None); the first row is the header
transfers_core = pd.read_csv(
    transfers_csv_path, index_col=None, header=0, parse_dates=transfers_time_cols
)
admissions_core = pd.read_csv(
    admissions_csv_path, index_col=None, header=0, parse_dates=admissions_time_cols
)



In [7]:

"""
Step 1: Subset the set of transfers/admissions_core to the already processed cohort.

We do this by merging. 
"""
merge_ids = ["subject_id", "hadm_id"]

# Unique id combinations present in the processed vitals. Duplicates are dropped
# because only the ids are needed here, not every observation row.
cohort_ids = vit_proc[merge_ids + ["stay_id"]].drop_duplicates()

# Keep only the core transfers whose ids appear in the cohort (inner merge),
# then discard any row that has no hadm_id.
transfers_S1 = (
    transfers_core
    .merge(cohort_ids, how="inner", on=merge_ids)
    .dropna(subset=["hadm_id"])
)

# Same subsetting for the core admissions table
admissions_S1 = (
    admissions_core
    .merge(cohort_ids, how="inner", on=merge_ids)
    .dropna(subset=["hadm_id"])
)

# Check both subsets: first that the ids are contained in the cohort,
# then that the id columns are complete.
for subset_frame in (transfers_S1, admissions_S1):
    tests.test_ids_subset_of_cohort(subset_frame, vit_proc, *merge_ids)
for subset_frame in (transfers_S1, admissions_S1):
    tests.test_is_complete_ids(subset_frame, *merge_ids, "stay_id")

# Save the subsetted transfers (only transfers_S1 is written to disk in this cell)
transfers_S1.to_csv(DEFAULT_CONFIG["SAVE_FD"] + "transfers_S1.csv", header=True, index=True)


Testing ('subject_id', 'hadm_id') are subset of cohort data.
Test passed!

Testing ('subject_id', 'hadm_id') are subset of cohort data.
Test passed!

Testing ids are complete for params ('subject_id', 'hadm_id', 'stay_id')

Testing ids are complete for params ('subject_id', 'hadm_id', 'stay_id')
