In [2]:

# Import Libraries
import datetime as dt
import json
import os


import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import src.data_processing.MIMIC.test_functions as tests

# LOAD CONFIGURATION 
with open("src/data_processing/MIMIC/MIMIC_PROCESSING_DEFAULT_VARS.json", "r") as f:
    DEFAULT_CONFIG = json.load(f)
    f.close()

if not os.path.exists(DEFAULT_CONFIG["SAVE_FD"]):
    os.makedirs(DEFAULT_CONFIG["SAVE_FD"])

In [3]:
try:
    assert os.path.exists(DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv")
    assert os.path.exists(DEFAULT_CONFIG["SAVE_FD"] + "vitals_intermediate.csv")

except AssertionError as e:
    raise e

In [None]:
# Print Information
print("\n\n ======== PROCESSING OUTCOMES ======== \n\n")

# Load previously processed data
adm_proc = pd.read_csv(
    DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv",
    index_col=0, 
    header=0, 
    parse_dates=["intime", "outtime", "intime_next", "outtime_next", "deathtime"]
)
vit_proc = (
    pd.read_csv(
    DEFAULT_CONFIG["SAVE_FD"] + "vitals_intermediate.csv", 
    index_col=0, 
    header=0, 
    parse_dates=DEFAULT_CONFIG["VITALS_TIME_VARS"]
    )
    .reset_index(drop=False)
    .assign(sampled_time_to_end=lambda x: pd.to_timedelta(x["sampled_time_to_end"]))  # pd does not load timedelta automatically
)


# Check correct computation of admissions and vitals
tests.test_admissions_processed_correctly(adm_proc)
tests.test_vitals_processed_correctly(vit_proc, config_dic=DEFAULT_CONFIG)

In [5]:
# Load core info
transfers_core = pd.read_csv(
    DEFAULT_CONFIG["DATA_FD"] + "core/transfers.csv", 
    index_col=None, 
    header=0, 
    parse_dates=["intime", "outtime"]
)
admissions_core = pd.read_csv(
    DEFAULT_CONFIG["DATA_FD"] + "core/admissions.csv",
    index_col=None,
    header=0,
    parse_dates=["admittime", "dischtime", "deathtime", "edregtime", "edouttime"]
)



In [26]:

"""
Step 1: Subset the set of transfers/admissions_core to the already processed cohort.

We do this by merging. 
"""

# Define Id for merging. We separate deathtime as one database registers only date, while the other
# registers everyting (i.e. up to second)
# tr_merge_ids = [
#     col for 
#     col in vit_proc.columns.tolist() if
#     col in transfers_core.columns.tolist() and
#     "death" not in col
# ]
hadm_merge_ids = [
    col for
    col in vit_proc.columns.tolist() if
    col in admissions_core.columns.tolist() and
    "death" not in col
]
merge_ids = ["subject_id", "hadm_id", "stay_id"]         # Useful simplication

# # Inner merge for transfers core
# transfers_S1 = (
#     transfers_core
#     .merge(
#         vit_proc.drop_duplicates(subset=merge_ids),   # Drop duplicates as we don't need all the rows
#         how="inner",
#         on=tr_merge_ids
#     )
#     .dropna(subset=["hadm_id"])                 # Drop rows with no hadm_id as we can't compare with transfers
#     .sort_values(by=merge_ids, ascending=True) # Sort by subject_id and stay_id
# )

# Inner merge for admissions core
admissions_S1 = (
    admissions_core
    .merge(
        vit_proc.drop_duplicates(subset=merge_ids), # only want one obvs per admission for merging
        how="inner",
        on=hadm_merge_ids,
        suffixes=("", "_ed")
    )
    .dropna(subset=["hadm_id"])            # Drop rows with no hadm_id as we can't compare with transfers
    .sort_values(by=merge_ids, ascending=True) # Sort by subject_id and stay_id
)

# Testing and save
# tests.test_ids_subset_of_cohort(transfers_S1, vit_proc, *merge_ids)
tests.test_ids_subset_of_cohort(admissions_S1, vit_proc, *merge_ids)
# tests.test_is_complete_ids(transfers_S1, *merge_ids, "stay_id")
tests.test_is_complete_ids(admissions_S1, *merge_ids, "stay_id")

# Check processing and correctdeness
# transfers_S1.to_csv(DEFAULT_CONFIG["SAVE_FD"] + "transfers_S1.csv", header=True, index=True)



Testing ('subject_id', 'hadm_id', 'stay_id') are subset of cohort data.
Test passed!

Testing ids are complete for params ('subject_id', 'hadm_id', 'stay_id', 'stay_id')
Test passed for variable subject_id!
Test passed for variable hadm_id!
Test passed for variable stay_id!
Test passed for variable stay_id!


In [27]:
admissions_S2 = (
    admissions_S1
    .query("intime <= admittime")                            # admissions to hospital after ED admissions
    .query("intime_next >= admittime | intime_next.isna()")  # admissions to hospital before next ED transfer
    .query("outtime <= edouttime")                           # transfer outtime before ed exit time
    .query("intime <= edregtime")                            # transfer intmie before ed registration time
    .query("dischtime - outtime_next >= @pd.Timedelta('-6h') | outtime_next.isna()")
    # discharge time not earlier than outtime_next (added -6 hours due to some potential delays)
)

In [None]:

def _select_outcome(vitals, transfers, window):
    """
    Determine outcome of admission given dataset with vital and static information, 
    and data for transfers.
    output order is [D, I, W, Disc].
    """

    # Load static information from vitals
    try:
        ed_outtime, dod = vitals[["outtime", "deathtime"]].iloc[0, :]
    except IndexError:
        ed_outtime, dod = vitals[["outtime", "deathtime"]].iloc[:]
    

    # Check deathtime first
    if dod != np.nan:
        
        # Get time to death
        time_to_death = dod - ed_outtime
        if time_to_death <= window:
            return [1, 0, 0, 0]       
        
    # If there is no death, or patient died after time window
    lower_bound, upper_bound = ed_outtime, ed_outtime + window
    transfers_within_window = (
        transfers
        .query("intime >= @lower_bound")
        .query("intime <= @upper_bound")
    )
    
    # Identify ICUs
    has_icus = (
        transfers_within_window.careunit.str.contains("(?i)ICU", na=False) |
        transfers_within_window.careunit.str.contains("(?i)Neuro Stepdown", na=False)
    )

    # If ICU admission
    if has_icus.sum() > 0:
        return [0, 1, 0, 0]
    
    # Check to see transfers contain discharge
    has_discharge = transfers_within_window.eventtype.str.contains("discharge", na=False)
    if has_discharge.sum() > 0:
        return [0, 0, 0, 1]
    
    else:
        return [0, 0, 1, 0]

In [61]:
def _test(x):
    return type(x)

# Compute outcome
test = (
    admissions_S2
    .assign(outcome_24h=)
)

In [62]:
test.outcome_24h

1869    NaN
3294    NaN
5413    NaN
2565    NaN
2857    NaN
       ... 
85      NaN
4740    NaN
4901    NaN
5547    NaN
3900    NaN
Name: outcome_24h, Length: 8140, dtype: object