In [5]:
import duckdb
import pathlib as pl
import numpy as np
import pandas as pd
from IPython.display import display

# All file locations are expressed relative to the notebook's working directory,
# so the notebook runs unchanged for anyone who clones the repo with the same
# layout. The MIMIC_ED data folder sits two levels above this directory.
ROOT = pl.Path("..") / ".."
DATA = ROOT / "MIMIC_ED"

# Derive every path from DATA so there is a single source of truth.
# (RAW used to be assigned twice; the second, different literal silently won.)
RAW = DATA / "raw" / "mimicel.csv"
CLEAN = DATA / "cleaned" / "mimicel_clean.csv"

pd.set_option("display.max_columns", None)  # show every column when rendering frames
pd.set_option("display.width", 2000)        # widen the text repr so rows do not wrap

In [10]:
# Open a default DuckDB connection and let DuckDB parse the raw CSV, then
# materialise the whole result as a pandas DataFrame.
con = duckdb.connect()

load_raw_sql = f"""
    SELECT *
    FROM read_csv_auto('{RAW}', HEADER=TRUE)
"""
raw_result = con.execute(load_raw_sql)
df = raw_result.df()

# (rows, columns) of the loaded event log
df.shape

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

(7568824, 31)

In [11]:
# Peek at the first five rows of the raw event log.
df.head(n=5)

Unnamed: 0,stay_id,subject_id,hadm_id,timestamps,activity,gender,race,arrival_transport,disposition,seq_num,icd_code,icd_version,icd_title,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
0,30000012,11714491,21562392,2126-02-14 20:22:00,Vital sign check,,,,,,,,,98.8,96.0,18.0,93.0,160.0,54.0,0.0,,,,,,,,,,,
1,30000012,11714491,21562392,2126-02-14 20:22:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,,,
2,30000012,11714491,21562392,2126-02-14 20:22:01,Triage in the ED,,,,,,,,,98.8,96.0,18.0,93.0,160.0,54.0,0.0,2.0,CHANGE IN MENTAL STATUS,,,,,,,,,
3,30000012,11714491,21562392,2126-02-14 22:21:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,,rifaximin,66295.0,54868615400.0,1.0,5844.0,Rifamycins and Related Derivative Antibiotics,,
4,30000012,11714491,21562392,2126-02-14 22:21:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,,"multivitamin,tx-minerals",2510.0,10267073710.0,1.0,704.0,Multivitamin and Mineral Combinations,,


In [41]:
# Per-stay summary flags.
# The flags are computed as vectorised boolean columns and reduced with the
# built-in groupby `any`, instead of calling a Python lambda once per stay_id,
# which is slow on a multi-million-row frame. The result is the same frame:
# indexed by stay_id, with boolean columns `has_hadm` and `ever_admitted`.
stay_flags = pd.DataFrame(
    {
        "stay_id": df["stay_id"],
        # True on rows that carry a hospital admission id
        "has_hadm": df["hadm_id"].notna(),
        # True on rows whose disposition is exactly 'ADMITTED' (missing -> False)
        "ever_admitted": df["disposition"].eq("ADMITTED"),
    }
)
by_stay = (
    stay_flags
    .groupby("stay_id", dropna=True)[["has_hadm", "ever_admitted"]]
    .any()
)

# 1. Stays that were ADMITTED but have NO hadm_id
admitted_no_hadm = by_stay[by_stay["ever_admitted"] & ~by_stay["has_hadm"]]

# 2. Stays that were NEVER ADMITTED but DO have a hadm_id
not_admitted_with_hadm = by_stay[~by_stay["ever_admitted"] & by_stay["has_hadm"]]

print("Total unique stay_ids:", len(by_stay))
print("Admitted but missing hadm_id:", len(admitted_no_hadm))
print("Not admitted but has hadm_id:", len(not_admitted_with_hadm))

Total unique stay_ids: 425028
Admitted but missing hadm_id: 384
Not admitted but has hadm_id: 45390


In [17]:
# Frequency of each activity type in the event log (top 20 at most).
activity_counts = df["activity"].value_counts()
activity_counts.head(20)

activity
Medicine reconciliation    2953118
Medicine dispensations     1441839
Vital sign check           1423734
Discharge from the ED       900077
Enter the ED                425028
Triage in the ED            425028
Name: count, dtype: int64

In [26]:
# Work on a separate frame so the raw `df` stays untouched, and make sure the
# `timestamps` column is a proper datetime before any time arithmetic.
events = df.assign(timestamps=pd.to_datetime(df["timestamps"]))
events

Unnamed: 0,stay_id,subject_id,hadm_id,timestamps,activity,gender,race,arrival_transport,disposition,seq_num,icd_code,icd_version,icd_title,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,rhythm,name,gsn,ndc,etc_rn,etccode,etcdescription,med_rn,gsn_rn
0,30000012,11714491,21562392,2126-02-14 20:22:00,Vital sign check,,,,,,,,,98.8,96.0,18.0,93.0,160.0,54.0,0,,,,,,,,,,,
1,30000012,11714491,21562392,2126-02-14 20:22:00,Enter the ED,F,WHITE,AMBULANCE,,,,,,,,,,,,,,,,,,,,,,,
2,30000012,11714491,21562392,2126-02-14 20:22:01,Triage in the ED,,,,,,,,,98.8,96.0,18.0,93.0,160.0,54.0,0,2.0,CHANGE IN MENTAL STATUS,,,,,,,,,
3,30000012,11714491,21562392,2126-02-14 22:21:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,,rifaximin,066295,54868615400,1,00005844,Rifamycins and Related Derivative Antibiotics,,
4,30000012,11714491,21562392,2126-02-14 22:21:00,Medicine reconciliation,,,,,,,,,,,,,,,,,,,"multivitamin,tx-minerals",002510,10267073710,1,00000704,Multivitamin and Mineral Combinations,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7568819,39999964,16479007,29821286,2130-06-06 18:27:00,Discharge from the ED,,,,HOME,2,R45851,10,Suicidal ideations,,,,,,,,,,,,,,,,,,
7568820,39999965,14733226,27008423,2125-09-14 00:46:00,Enter the ED,F,BLACK/AFRICAN AMERICAN,WALK IN,,,,,,,,,,,,,,,,,,,,,,,
7568821,39999965,14733226,27008423,2125-09-14 00:46:01,Triage in the ED,,,,,,,,,97.5,65.0,16.0,100.0,132.0,77.0,0,2.0,Labor,,,,,,,,,
7568822,39999965,14733226,27008423,2125-09-14 21:22:50,Discharge from the ED,,,,TRANSFER,1,64893,9,OTH CURR COND-ANTEPARTUM,,,,,,,,,,,,,,,,,,


In [28]:
def first_not_null(s: pd.Series):
    """Return the first non-missing value of ``s``, or ``pd.NA`` if there is none.

    Used as a groupby aggregator to pull stay-level attributes (race, gender,
    etc.) out of a column where only some rows are populated, rather than
    blindly taking the value from the first row.

    Parameters
    ----------
    s : pd.Series
        Values to scan, in their existing order.

    Returns
    -------
    The first element of ``s`` that is not null, or ``pd.NA`` when ``s`` is
    empty or entirely null.
    """
    present = s[s.notna()]
    return pd.NA if present.empty else present.iloc[0]

## Four-Table Architecture for the ED Simulation

| Table Name       | Granularity                | Purpose / Contents                                      | Why It Matters |
|------------------|-----------------------------|----------------------------------------------------------|----------------|
| **ed_stays**     | 1 row per **ED visit**      | High-level summary: arrival, triage, depart timestamps; demographics; transport; disposition | Defines the population of stays; used for LOS, arrival modeling, routing |
| **ed_activities**| Many rows per **stay_id**   | Detailed event timeline: vitals, meds, reconciliation, etc. | Feeds the DES with activity sequences and durations |
| **ed_diagnoses** | 1 row per **ICD code** per stay | Diagnosis profile: icd_code, icd_version, icd_title | Describes medical conditions impacting flow or resource needs |
| **ed_medications** | 1 row per **medication record** per stay | Medication details: name, gsn, ndc, med_rn, etc.; may include timestamp | Supports medication-driven event timing and resource modeling |

### Summary
- **ed_stays** = stay-level summary  
- **ed_activities** = event-level timeline  
- **ed_diagnoses** = diagnosis-level detail  
- **ed_medications** = medication-level detail  

This normalization is intentional and crucial for building a clean, efficient Discrete-Event Simulation (DES).


In [29]:
def _stay_event_time(event_log: pd.DataFrame, activity: str, how: str, name: str) -> pd.Series:
    """Reduce the timestamps of one activity type to a single value per stay.

    Parameters
    ----------
    event_log : pd.DataFrame
        Event table with `activity`, `stay_id` and `timestamps` columns.
    activity : str
        Exact value of the `activity` column to keep.
    how : str
        Name of the groupby reduction to apply to `timestamps` (e.g. "min", "max").
    name : str
        Name given to the resulting Series.

    Returns
    -------
    pd.Series
        Indexed by `stay_id`; stays without the given activity are absent.
    """
    return (
        event_log.loc[event_log["activity"] == activity]
        .groupby("stay_id")["timestamps"]
        .agg(how)
        .rename(name)
    )


# Earliest "Enter the ED" event per stay.
arrival_time = _stay_event_time(events, "Enter the ED", "min", "arrival_time")

# Earliest "Triage in the ED" event per stay.
triage_time = _stay_event_time(events, "Triage in the ED", "min", "triage_time")

# Latest "Discharge from the ED" event per stay: `max` because a stay can have
# several discharge rows.
depart_time = _stay_event_time(events, "Discharge from the ED", "max", "depart_time")

In [31]:
# Collapse the event log to one row per stay_id.
# Each entry maps output column -> (source column, aggregator). The ids use
# pandas' built-in "first"; the sparsely populated attributes use the
# `first_not_null` helper so a value is picked from whichever row carries it.
# NOTE(review): pandas' built-in "first" is documented to skip nulls by default,
# which may make the helper unnecessary here — confirm before swapping, since
# the missing-value sentinel and resulting dtypes could differ.
stay_level_spec = {
    "subject_id": ("subject_id", "first"),
    "hadm_id": ("hadm_id", "first"),
    "gender": ("gender", first_not_null),
    "race": ("race", first_not_null),
    "arrival_transport": ("arrival_transport", first_not_null),
    "disposition": ("disposition", first_not_null),
    "acuity": ("acuity", first_not_null),
    "chiefcomplaint": ("chiefcomplaint", first_not_null),
}
stay_level = events.groupby("stay_id").agg(**stay_level_spec)

In [32]:
# Attach the three per-stay timestamps to the stay-level attributes; every
# piece is indexed by stay_id, so the join aligns on the index.
stay_timestamps = [arrival_time, triage_time, depart_time]
ed_stays = stay_level.join(stay_timestamps)

In [33]:
# ED length of stay: arrival -> departure, expressed in hours.
los_delta = ed_stays["depart_time"] - ed_stays["arrival_time"]
ed_stays["ed_los_hours"] = los_delta.dt.total_seconds() / 3600

# Arrival -> triage gap, expressed in minutes.
triage_delta = ed_stays["triage_time"] - ed_stays["arrival_time"]
ed_stays["triage_delay_minutes"] = triage_delta.dt.total_seconds() / 60

In [34]:
# Promote stay_id from the index to a regular column.
# Guarded so the cell is idempotent: calling reset_index() again on an already
# reset frame would insert a spurious "index" column.
if "stay_id" not in ed_stays.columns:
    ed_stays = ed_stays.reset_index()

In [68]:
# Preview the stay-level table.
# NOTE(review): the saved output shows an `is_admitted` column, which is only
# created in a later cell — on a fresh Restart & Run All this preview will not
# include it.
ed_stays.head(n=15)

Unnamed: 0,stay_id,subject_id,hadm_id,gender,race,arrival_transport,disposition,acuity,chiefcomplaint,arrival_time,triage_time,depart_time,ed_los_hours,triage_delay_minutes,is_admitted
0,30000012,11714491,21562392.0,F,WHITE,AMBULANCE,ADMITTED,2.0,CHANGE IN MENTAL STATUS,2126-02-14 20:22:00,2126-02-14 20:22:01,2126-02-15 01:59:00,5.616667,0.016667,True
1,30000017,14230614,,M,BLACK/AFRICAN AMERICAN,WALK IN,ELOPED,3.0,"ETOH, Unable to ambulate",2185-06-18 11:51:00,2185-06-18 11:51:01,2185-06-18 15:53:00,4.033333,0.016667,False
2,30000038,13821532,26255538.0,F,WHITE,AMBULANCE,ADMITTED,3.0,Cough,2152-12-07 16:37:00,2152-12-07 16:37:01,2152-12-07 19:55:00,3.3,0.016667,True
3,30000039,13340997,23100190.0,M,WHITE,WALK IN,ADMITTED,3.0,s/p Fall,2165-10-06 11:47:00,2165-10-06 11:47:01,2165-10-06 20:18:00,8.516667,0.016667,True
4,30000055,19848164,,F,OTHER,WALK IN,HOME,3.0,L Ear pain,2155-07-18 17:03:00,2155-07-18 17:03:01,2155-07-18 21:12:00,4.15,0.016667,False
5,30000094,19862552,,F,WHITE,AMBULANCE,HOME,2.0,N,2183-09-04 16:08:00,2183-09-04 16:08:01,2183-09-05 00:12:00,8.066667,0.016667,False
6,30000112,13333760,,F,HISPANIC/LATINO - PUERTO RICAN,WALK IN,HOME,3.0,Abnormal labs,2157-12-12 12:45:00,2157-12-12 12:45:01,2157-12-12 15:22:00,2.616667,0.016667,False
7,30000177,17937834,23831044.0,M,ASIAN - SOUTH EAST ASIAN,WALK IN,ADMITTED,2.0,"Abd pain, Vomiting",2143-12-27 22:50:00,2143-12-27 22:50:01,2143-12-28 03:48:00,4.966667,0.016667,True
8,30000202,15346940,,M,BLACK/AFRICAN AMERICAN,WALK IN,HOME,2.0,ALLERGIC REACTION,2158-08-08 10:07:00,2158-08-08 10:07:01,2158-08-08 18:23:00,8.266667,0.016667,False
9,30000204,11615015,25540031.0,M,WHITE,AMBULANCE,HOME,3.0,S/P ASSAULT,2132-10-10 06:36:00,2132-10-10 06:36:01,2132-10-10 18:45:00,12.15,0.016667,False


In [67]:
# Flag stays that ended in a hospital admission, in case this table is later
# linked with the ICU MIMIC dataset.
admit_dispositions = ["ADMITTED"]
admitted_mask = ed_stays["disposition"].isin(admit_dispositions)
ed_stays["is_admitted"] = admitted_mask

# Spot-check the flag against the disposition it was derived from.
ed_stays.loc[:, ["stay_id", "disposition", "is_admitted"]].head(10)

Unnamed: 0,stay_id,disposition,is_admitted
0,30000012,ADMITTED,True
1,30000017,ELOPED,False
2,30000038,ADMITTED,True
3,30000039,ADMITTED,True
4,30000055,HOME,False
5,30000094,HOME,False
6,30000112,HOME,False
7,30000177,ADMITTED,True
8,30000202,HOME,False
9,30000204,HOME,False


## Note on Triage Timestamp Accuracy in MIMIC-IV ED

While constructing `ed_stays`, we computed a variable called **`triage_delay_minutes`**, defined as the time difference between:

- **arrival_time** (first "Enter the ED"), and  
- **triage_time** (first "Triage in the ED").

During exploration, we found that *nearly every patient* in the MIMIC-IV ED event log has:

- **arrival_time = HH:MM:SS**
- **triage_time = HH:MM:SS + 1 second**

This produces a constant value of **0.016667 minutes** (i.e., 1 second) for almost the entire dataset.

### Why This Happens
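This behavior is **not** a coding error in this notebook. It appears to reflect how timestamps are recorded or constructed in the underlying MIMIC ED event log; the points below are plausible explanations rather than verified facts: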

- Many EDs log triage and registration events in batches.
- Timestamps are often rounded or defaulted to the same second.
- The dataset uses shifted dates and harmonized event logs, which further compress timing resolution.
- The “Triage in the ED” event frequently represents when the *triage note was charted*, not when the nurse actually triaged the patient.

As a result, **triage timestamps in MIMIC-IV ED do not represent real-world triage wait times.**

### Implications for the DES Model
For our simulation:
- We cannot infer real triage wait times from MIMIC.
  The data does not contain the necessary temporal resolution.
- We should keep the `triage_time` field for ordering events, but we should not use `triage_delay_minutes` for modeling actual triage delays.
- We do plan to switch to UCSD Health data, so keeping this metric in the pipeline is still important.

### What We *Can* Still Use from MIMIC
Despite the triage-timing limitations, MIMIC-IV ED remains extremely useful for:

- **Arrival → departure time spans** (accurate ED length of stay)
- **Activity ordering** (the sequence of ED events is reliable)
- **Inter-activity timing for meds, vitals, procedures**
- **Medication and diagnosis timestamps**
- **Disposition (discharge, transfer, admission)**

These provide strong, real-world foundations for modeling ED flow.

### How to Handle Triage Delay in the Simulation
Since MIMIC cannot provide realistic triage delays, we have three options:
- Use **external literature values** (typical triage waits are 5–20 minutes)
- OR sample from a **distribution** (e.g., lognormal or gamma) based on national datasets like **NHAMCS**
- OR choose a fixed **constant delay** if the model does not focus on triage bottlenecks

We will therefore keep the triage timestamps in the dataset for completeness, but treat the actual delay as **unreliable** and **not suitable for DES parameterization**.

In [66]:
# The next few cells are sanity checks on ed_stays.
# Completeness: fraction of missing values in each key column.
key_columns = ["stay_id", "subject_id", "arrival_time", "depart_time", "disposition"]
ed_stays[key_columns].isna().mean()

stay_id         0.0
subject_id      0.0
arrival_time    0.0
depart_time     0.0
disposition     0.0
dtype: float64

In [59]:
# Timestamp ordering: count stays whose triage precedes their arrival.
ed_stays["triage_time"].lt(ed_stays["arrival_time"]).sum()

np.int64(0)

In [60]:
# Timestamp ordering: count stays whose departure precedes their arrival.
ed_stays["depart_time"].lt(ed_stays["arrival_time"]).sum()

np.int64(0)

In [62]:
# Distribution of ED length of stay (hours), to look for outliers or anything
# concerning. The bulk of the distribution looks normal.
los_percentiles = [0.01, 0.25, 0.5, 0.75, 0.99]
ed_stays["ed_los_hours"].describe(percentiles=los_percentiles)

count    425028.000000
mean          7.159524
std           6.627396
min           0.016667
1%            0.766667
25%           3.533333
50%           5.466667
75%           8.316667
99%          30.483333
max         493.069444
Name: ed_los_hours, dtype: float64

In [63]:
# Count stays with a zero or negative length of stay.
ed_stays["ed_los_hours"].le(0).sum()

np.int64(0)

In [64]:
# Demographic completeness: fraction of missing values per column.
# I am shocked at how complete this is. Very nice.
demographic_columns = ["gender", "race", "acuity", "chiefcomplaint"]
ed_stays[demographic_columns].isna().mean()

gender            0.000000
race              0.000000
acuity            0.016413
chiefcomplaint    0.000047
dtype: float64

In [65]:
# Disposition breakdown, counting missing values too; it looks reasonable.
disposition_counts = ed_stays["disposition"].value_counts(dropna=False)
disposition_counts

disposition
HOME                           241626
ADMITTED                       158010
TRANSFER                         7025
LEFT WITHOUT BEING SEEN          6154
ELOPED                           5710
OTHER                            4245
LEFT AGAINST MEDICAL ADVICE      1881
EXPIRED                           377
Name: count, dtype: int64