# Testing Processing For MIMIC

In [1]:
import json
import os

import pandas as pd

# Required for pandas progress bars (enables `progress_apply` via `tqdm.pandas()`)
from tqdm import tqdm

tqdm.pandas()

# Test functions to check processing
import src.data_processing.MIMIC.test_functions as tests

In [25]:
# Load the default processing configuration. The `with` block closes the file
# on exit, so no explicit close() call is needed.
with open("src/data_processing/MIMIC/MIMIC_PROCESSING_DEFAULT_VARS.json", "r") as f:
    DEFAULT_CONFIG = json.load(f)

    

In [21]:
# Fail fast, naming the missing path, if the intermediate admissions file
# is not present in the configured save folder.
assert os.path.exists(DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv"), (
    "Missing intermediate admissions file: "
    + DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv"
)

In [29]:
# Intermediate admissions table: the first CSV column becomes the index and
# the configured time columns are parsed as datetimes.
adm_inter = pd.read_csv(
    DEFAULT_CONFIG["SAVE_FD"] + "admissions_intermediate.csv",
    header=0,
    index_col=0,
    parse_dates=DEFAULT_CONFIG["VITALS_TIME_VARS"],
)

# ED vital-sign table: no index column is read from the file and
# `charttime` is parsed as a datetime.
vital_signs_ed = pd.read_csv(
    DEFAULT_CONFIG["DATA_FD"] + "ed/vitalsign.csv",
    header=0,
    index_col=None,
    parse_dates=["charttime"],
    low_memory=False,
)

In [30]:
# Preview the loaded ED vital-sign table.
vital_signs_ed

Unnamed: 0,subject_id,stay_id,charttime,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,pain
0,16113983,37539106,2116-06-10 00:32:00,98.20,82.0,15.0,,106.0,72.0,,
1,15128994,30058281,2167-08-29 02:25:00,98.30,79.0,20.0,97.0,126.0,73.0,,0
2,15128994,30058281,2167-08-29 04:51:00,97.60,79.0,20.0,98.0,126.0,73.0,,0
3,15128994,30058281,2167-08-29 05:35:00,98.30,76.0,18.0,,123.0,68.0,,0/10
4,18019452,37300626,2148-12-19 12:34:00,98.10,100.0,16.0,98.0,129.0,86.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
1651114,14212258,31798078,2161-08-27 14:08:00,98.70,67.0,18.0,99.0,139.0,90.0,,0
1651115,18569084,30638322,2153-02-13 15:18:00,,,,,,,,4
1651116,18569084,30638322,2153-02-13 15:19:00,97.90,62.0,16.0,,108.0,57.0,,4
1651117,18569084,30638322,2153-02-13 17:55:00,,57.0,15.0,95.0,110.0,54.0,,0


In [35]:
# Sanity checks from the project's test helpers, run before further processing.
# NOTE(review): presumably the first call validates the intermediate admissions
# table and the second checks the subject_id / stay_id columns of the vitals
# table — confirm against src.data_processing.MIMIC.test_functions.
tests.admissions_processed_correctly(adm_inter)
tests.test_is_complete_ids(vital_signs_ed, "subject_id", "stay_id")

Admissions correctly computed! Safe to go ahead.


In [49]:
# Join the vitals to the admissions, apply the configured column renaming, and
# keep only observations charted inside the stay window [intime, outtime].
test = (
    vital_signs_ed
    # Inner join on (subject_id, stay_id): keeps only stays present in adm_inter
    .merge(right=adm_inter, how="inner", on=["subject_id", "stay_id"])
    # Rename columns according to the configuration
    .rename(columns=DEFAULT_CONFIG["VITALS_RENAMING_DIC"])
    # Drop observations charted before intime
    .query("charttime >= intime")
    # Drop observations charted after outtime
    .query("charttime <= outtime")
)
	

In [119]:
def _resample(x, time_feats, resampling_rule):
	"Resample the dataset given time to end column based on the resampling rule."

	# Check dataframe is sorted
	assert x["time_to_end"].is_monotonic_decreasing

	# Create intermediate output to resample
	_x = x[["time_to_end"] + time_feats]
	resampled_x = _x.resample(on="time_to_end", rule=resampling_rule, closed="left", label="left").mean()

	# Add resampling array
	resampled_x.index.name = "sampled_time_to_end"
	resampled_x.reset_index(drop=False, inplace=True)

	# Now add all other vars
	static_vars = [col for col in x.columns if col not in time_feats + ["time_to_end"]]
	pat_static_info = x[static_vars].iloc[0, :]

	# Add to resampled data
	resampled_x[static_vars] = pat_static_info

	return resampled_x

In [130]:
# NOTE(review): `vitals_S2` is not defined in any cell visible in this notebook
# — confirm it exists when running on a fresh kernel.
vitals_S3 = (
    vitals_S2
    # Time remaining until outtime for every observation
    .assign(time_to_end=lambda df: df["outtime"] - df["charttime"])
    # Within each stay, largest time_to_end first (the order _resample asserts)
    .sort_values(by=["stay_id", "time_to_end"], ascending=[True, False])
    .groupby("stay_id", as_index=False)
    # Resample each stay separately, showing a tqdm progress bar
    .progress_apply(
        lambda grp: _resample(
            grp,
            time_feats=["HR", "RR", "SBP"],
            resampling_rule=DEFAULT_CONFIG["RESAMPLING_RULE"],
        )
    )
    .reset_index(drop=True)
)

# Persist the resampled vitals to the configured save folder
vitals_S3.to_csv(DEFAULT_CONFIG["SAVE_FD"] + "vitals_S3.csv", index=True, header=True)



  3%|██                                                                                | 730/28668 [00:10<06:35, 70.63it/s]


KeyboardInterrupt: 

In [122]:
# NOTE(review): the output below contains resampled columns (e.g.
# `sampled_time_to_end`), so `test` appears to have been reassigned by a cell
# not shown here — confirm.
test

Unnamed: 0,sampled_time_to_end,HR,RR,SBP,subject_id,stay_id,charttime,TEMP,SPO2,DBP,...,outtime_next,transfer_id,eventtype,careunit,intime,outtime,gender,age,ESI,deathtime
0,0 days 01:47:00,119.0,19.0,133.0,18732792,30003172,2136-08-23 10:30:00,99.9,97.0,85.0,...,2136-08-24 19:25:15,30003172,ED,Emergency Department,2136-08-23 09:57:00,2136-08-23 13:23:00,M,25,2.0,NaT
1,0 days 02:47:00,123.0,18.0,144.0,18732792,30003172,2136-08-23 10:30:00,99.9,97.0,85.0,...,2136-08-24 19:25:15,30003172,ED,Emergency Department,2136-08-23 09:57:00,2136-08-23 13:23:00,M,25,2.0,NaT
2,0 days 02:01:00,70.0,22.0,119.0,15641146,30008481,2120-02-10 16:20:00,,94.0,68.0,...,2120-02-16 18:00:15,30008481,ED,Emergency Department,2120-02-10 14:55:00,2120-02-10 23:21:00,F,87,2.0,NaT
3,0 days 03:01:00,77.0,20.0,110.0,15641146,30008481,2120-02-10 16:20:00,,94.0,68.0,...,2120-02-16 18:00:15,30008481,ED,Emergency Department,2120-02-10 14:55:00,2120-02-10 23:21:00,F,87,2.0,NaT
4,0 days 04:01:00,,,,15641146,30008481,2120-02-10 16:20:00,,94.0,68.0,...,2120-02-16 18:00:15,30008481,ED,Emergency Department,2120-02-10 14:55:00,2120-02-10 23:21:00,F,87,2.0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21144,0 days 04:09:00,64.0,14.0,122.0,19344208,39994809,2128-11-29 17:41:00,,100.0,79.0,...,NaT,39994809,ED,Emergency Department,2128-11-29 14:39:00,2128-11-29 22:36:00,F,53,2.0,NaT
21145,0 days 01:01:00,79.5,17.0,130.0,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT
21146,0 days 02:01:00,,,,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT
21147,0 days 03:01:00,,,,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT


In [123]:

# Keep only stays that have at least MIN_NUM_OBSERVS rows and whose count of
# missing values, in every renamed vital column, does not exceed
# NA_PROP_THRESH times the number of rows of the stay.
vitals_S4 = (
    vitals_S3
    .groupby("stay_id", as_index=True)
    .filter(
        lambda grp: (
            len(grp) >= DEFAULT_CONFIG["MIN_NUM_OBSERVS"]
            and (
                grp[list(DEFAULT_CONFIG["VITALS_RENAMING_DIC"].values())].isna().sum()
                <= len(grp) * DEFAULT_CONFIG["NA_PROP_THRESH"]
            ).all()
        )
    )
    .reset_index(drop=True)
)

In [128]:
# Keep only stays whose smallest sampled time-to-end (in seconds) is at most
# LAST_OBVS_TIME_TO_EXIT, which is multiplied by 3600 to convert hours to seconds.
vitals_S5 = (
    vitals_S4
    .groupby("stay_id", as_index=True)
    .filter(
        lambda grp: grp["sampled_time_to_end"].dt.total_seconds().min()
        <= DEFAULT_CONFIG["LAST_OBVS_TIME_TO_EXIT"] * 3600  # hours -> seconds
    )
)
    

In [129]:
# Preview the stays that passed the last-observation filter.
vitals_S5

Unnamed: 0,sampled_time_to_end,HR,RR,SBP,subject_id,stay_id,charttime,TEMP,SPO2,DBP,...,outtime_next,transfer_id,eventtype,careunit,intime,outtime,gender,age,ESI,deathtime
0,0 days 00:22:15,95.0,18.0,138.0,15266492,30010983,2160-08-22 17:12:00,98.1,99.0,105.0,...,NaT,30010983,ED,Emergency Department,2160-08-22 17:10:00,2160-08-22 22:52:15,M,31,2.0,NaT
1,0 days 01:22:15,,,,15266492,30010983,2160-08-22 17:12:00,98.1,99.0,105.0,...,NaT,30010983,ED,Emergency Department,2160-08-22 17:10:00,2160-08-22 22:52:15,M,31,2.0,NaT
2,0 days 02:22:15,97.0,18.0,137.0,15266492,30010983,2160-08-22 17:12:00,98.1,99.0,105.0,...,NaT,30010983,ED,Emergency Department,2160-08-22 17:10:00,2160-08-22 22:52:15,M,31,2.0,NaT
3,0 days 03:22:15,,,,15266492,30010983,2160-08-22 17:12:00,98.1,99.0,105.0,...,NaT,30010983,ED,Emergency Department,2160-08-22 17:10:00,2160-08-22 22:52:15,M,31,2.0,NaT
4,0 days 04:22:15,,,,15266492,30010983,2160-08-22 17:12:00,98.1,99.0,105.0,...,NaT,30010983,ED,Emergency Department,2160-08-22 17:10:00,2160-08-22 22:52:15,M,31,2.0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6425,0 days 20:50:00,66.0,16.0,123.0,16044228,39987975,2185-02-18 18:36:00,98.2,100.0,81.0,...,NaT,39987975,ED,Emergency Department,2185-02-18 18:33:00,2185-02-19 16:19:00,F,75,3.0,NaT
6426,0 days 01:01:00,79.5,17.0,130.0,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT
6427,0 days 02:01:00,,,,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT
6428,0 days 03:01:00,,,,19919388,39999434,2163-12-03 19:37:00,99.0,100.0,93.0,...,NaT,39999434,ED,Emergency Department,2163-12-03 19:36:00,2163-12-04 00:35:00,M,47,3.0,NaT
