# MIMIC

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.special import expit

In [None]:
# fixed seed: reused for the random generator below and for DataFrame.sample later on
seed = 0
rng = np.random.default_rng(seed=seed)

#### helpers

In [None]:
def parse_datetime_cols(df, cols):
    """Convert the listed columns of ``df`` to pandas datetimes, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped. Values that
    cannot be parsed become ``NaT`` (``errors="coerce"``). The same ``df``
    object is returned.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_datetime(df[name], errors="coerce")
    return df

def calculate_age(reference_date, date_of_birth):
    """Return the age in completed years for each (reference date, birth date) pair.

    Args:
        reference_date: sequence of date strings starting with ``yyyy-mm-dd``.
        date_of_birth: sequence of date strings starting with ``yyyy-mm-dd``
            (e.g. ``yyyy-mm-dd 00:00:00``); must have the same length as
            ``reference_date``.

    Only the first ten characters of each string are read, so a trailing
    time component is ignored.

    Returns:
        A list of ints, one age per pair, in input order.

    Raises:
        ValueError: if the two inputs differ in length, or if a string does
            not start with a numeric ``yyyy-mm-dd`` date (e.g. ``"NaT"``).
    """
    if len(reference_date) != len(date_of_birth):
        raise ValueError(
            f"arrays of different length: {len(reference_date)} reference dates "
            f"vs {len(date_of_birth)} dates of birth"
        )

    ages = []
    # zip walks both inputs by position, so pandas Series with a non-default
    # index are handled the same way as plain lists
    for ref, dob in zip(reference_date, date_of_birth):
        ref_year, ref_month, ref_day = int(ref[:4]), int(ref[5:7]), int(ref[8:10])
        dob_year, dob_month, dob_day = int(dob[:4]), int(dob[5:7]), int(dob[8:10])

        years = ref_year - dob_year
        # the birthday has not been reached yet in the reference year
        if (ref_month, ref_day) < (dob_month, dob_day):
            years -= 1

        ages.append(years)

    return ages

#### read data

The source data can be downloaded from ``https://physionet.org/content/mimiciii/1.4/``.

In [None]:
# set source dir: folder from which PATIENTS.csv, ICUSTAYS.csv and CHARTEVENTS.csv are read below
source_dir = "./source_data/mimic-iii-clinical-database-1.4/"

In [None]:
# patients: id, sex and date of birth (DOB parsed to datetime, unparsable -> NaT)
patients = pd.read_csv(
    os.path.join(source_dir, "PATIENTS.csv"),
    usecols=["SUBJECT_ID", "GENDER", "DOB"],
)
patients = parse_datetime_cols(patients, ["DOB"])
patients = patients.rename(columns={"SUBJECT_ID": "subject_id", "GENDER": "sex"})

# ICU stays: ids plus admission/discharge timestamps of each stay
icustays = pd.read_csv(
    os.path.join(source_dir, "ICUSTAYS.csv"),
    usecols=["SUBJECT_ID", "ICUSTAY_ID", "INTIME", "OUTTIME"],
)
icustays = parse_datetime_cols(icustays, ["INTIME", "OUTTIME"])
icustays = icustays.rename(
    columns={
        "SUBJECT_ID": "subject_id",
        "ICUSTAY_ID": "icustay_id",
        "INTIME": "icu_intime",
        "OUTTIME": "icu_outtime",
    }
)

#### process features

In [None]:
# first ICU stay per patient: the row with the earliest icu_intime within each subject_id
icustays = icustays.sort_values(by=["subject_id", "icu_intime"])
first_icustay_idx = icustays.groupby("subject_id")["icu_intime"].idxmin()
first_icustays = icustays.loc[first_icustay_idx, :].copy()

In [None]:
# attach sex and DOB to each first ICU stay (left join keeps every stay)
df = pd.merge(first_icustays, patients, how="left", on="subject_id")

In [None]:
# age in completed years at ICU admission; keep 18 <= age < 90 only
df["age"] = calculate_age(
    df["icu_intime"].astype(str),
    df["DOB"].astype(str),
)
df = df.loc[df["age"].ge(18) & df["age"].lt(90)].copy()

In [None]:
# encode sex as an integer flag: 1 where the value is 'M', 0 otherwise
df["sex"] = df["sex"].eq('M').astype(int)

In [None]:
# reference time t0 = 3 hours after ICU admission; drop stays that ended before t0
df["t0"] = df["icu_intime"] + pd.Timedelta(hours=3)
df = df.loc[df["icu_outtime"].ge(df["t0"])].copy()

In [None]:
# restrict to identifiers, demographics and the reference time
keep_cols = ["subject_id", "icustay_id", "sex", "age", "t0"]
df = df.loc[:, keep_cols].copy()

#### process vital sign measurements

In [None]:
# read chart events: location of CHARTEVENTS.csv and the subset of its columns
# that is loaded when the file is read in chunks below
chartevents_path = os.path.join(source_dir, "CHARTEVENTS.csv")
usecols = ["SUBJECT_ID","HADM_ID","ICUSTAY_ID","ITEMID","CHARTTIME","VALUENUM","VALUEUOM"]

In [None]:
# lookup structures for the chunked read: the selected ICU stay ids and each stay's reference time
icu_set = set(df["icustay_id"].tolist())
t0_lookup = dict(zip(df["icustay_id"], df["t0"]))

In [None]:
# collector for the filtered chunks
filtered = []

# read CHARTEVENTS in chunks of 1,000,000 rows and keep only what is needed
for chunk in pd.read_csv(chartevents_path, usecols=usecols, chunksize=1000000, low_memory=False):

    # filter: lower-case the column names and keep rows of the selected ICU stays.
    # The explicit copy makes the column assignments below write to an
    # independent frame instead of a slice of the raw chunk
    # (avoids pandas' SettingWithCopyWarning).
    chunk.rename(columns=str.lower, inplace=True)
    chunk = chunk[chunk["icustay_id"].isin(icu_set)].copy()
    if chunk.empty:
        continue

    # add reference time of the stay and parse the chart time (unparsable -> NaT)
    chunk["t0"] = chunk["icustay_id"].map(t0_lookup)
    chunk["charttime"] = pd.to_datetime(chunk["charttime"], errors="coerce")

    # keep only valid measurements: known chart time not after t0, and a numeric value
    chunk = chunk[chunk["charttime"].notna() & (chunk["charttime"] <= chunk["t0"])]
    chunk = chunk[chunk["valuenum"].notna()]
    filtered.append(chunk)

# store: one frame with all retained measurements
# (pd.concat raises ValueError if no chunk contributed any rows)
measurements_df = pd.concat(filtered, ignore_index=True)

In [None]:
# map item ids to variable names
# Temperature ids are grouped by the unit the item is charted in:
# 223762 is the Celsius item, 678 / 679 / 223761 are Fahrenheit items.
# Rows tagged "temp_f" are always converted F -> C in the temperature step,
# so a Celsius item must never be listed under "temp_f".
# NOTE(review): the CareVue Celsius items (676/677) are not included - confirm
# whether that is intended.
mapping = {"temp_c":   [223762],
           "temp_f":   [678, 679, 223761],
           "hr":       [211, 220045],
           "sbp":      [51, 220179],
           "map":      [456, 220052, 220181],
           "rr":       [618, 220210],
           "spo2":     [646, 220277]}

# reverse lookup (item id -> variable name), built once so that tagging a row
# is a single dict lookup instead of a scan over every id list
_itemid_to_type = {item: name for name, ids in mapping.items() for item in ids}

# helper
def tag_type(itemid):
    """Return the variable name for ``itemid``, or ``None`` if the id is not mapped.

    ``itemid`` is converted with ``int()`` first, so integer-valued floats and
    numeric strings are accepted; values ``int()`` cannot convert raise.
    """
    return _itemid_to_type.get(int(itemid))

# tag every measurement with its variable name and drop rows whose item id is not mapped
measurements_df["vital_type"] = measurements_df["itemid"].map(tag_type)
measurements_df = measurements_df.dropna(subset=["vital_type"]).copy()

In [None]:
# process temperature: bring all temperature rows to Celsius under one label
is_temp_row = measurements_df["vital_type"].isin(["temp_c", "temp_f"])
temps = measurements_df[is_temp_row].copy()

# a reading is treated as Fahrenheit when its item is tagged temp_f, when its
# unit string contains "F", or when the unit is missing and the value is >= 79
unit_upper = temps["valueuom"].astype(str).str.upper()
tagged_fahrenheit = temps["vital_type"] == "temp_f"
unit_fahrenheit = unit_upper.str.contains('F', na=False)
value_fahrenheit = temps["valueuom"].isna() & temps["valuenum"].ge(79)
is_fahrenheit = tagged_fahrenheit | unit_fahrenheit | value_fahrenheit

# Fahrenheit -> Celsius, then a single label for all temperature rows
temps.loc[is_fahrenheit, "valuenum"] = (temps.loc[is_fahrenheit, "valuenum"] - 32.0) * (5.0/9.0)
temps["vital_type"] = "temp_c"

# readings outside [25, 45] after conversion are set to missing
plausible = temps["valuenum"].between(25.0, 45.0)
temps["valuenum"] = temps["valuenum"].where(plausible)

# store: write the converted values and labels back into the full table
measurements_df.loc[temps.index, "valuenum"] = temps["valuenum"]
measurements_df.loc[temps.index, "vital_type"] = temps["vital_type"]

In [None]:
# keep last vital sign readings before reference time
# Rows whose value was blanked to NaN (implausible temperatures) are removed
# first, so the most recent *valid* reading per stay and vital type is kept
# rather than a missing value that would later discard the whole patient.
last_vals = (measurements_df.dropna(subset=["valuenum"])
      .sort_values(["subject_id","icustay_id","vital_type","charttime"])
      .groupby(["subject_id","icustay_id", "vital_type"], as_index=False)
      .tail(1)[["subject_id","icustay_id", "vital_type","valuenum"]])

In [None]:
# long -> wide: one row per (subject_id, icustay_id), one column per vital type
last_vals = (
    last_vals
    .pivot(index=["subject_id","icustay_id"], columns="vital_type", values="valuenum")
    .reset_index()
    .rename_axis(None, axis="columns")
)

In [None]:
# rename and sort
# Rename by name rather than by position: assigning a list to .columns depends
# on the pivot producing exactly these six vital types in alphabetical order
# and would silently mislabel columns otherwise. With an explicit mapping, a
# missing vital type surfaces as a KeyError in the selection below.
last_vals = last_vals.rename(columns={"hr": "heart_rate",
                                      "map": "arterial_pressure",
                                      "rr": "respiratory_rate",
                                      "sbp": "blood_pressure",
                                      "spo2": "oxygen_saturation",
                                      "temp_c": "temperature"})
last_vals = last_vals[['subject_id', 'icustay_id', 'heart_rate', 'temperature', 'blood_pressure', 'arterial_pressure', 'respiratory_rate', 'oxygen_saturation']]

#### filter and sample dataset

In [None]:
# attach the last vital readings to each stay, then drop rows with any missing value
df = pd.merge(df, last_vals, how="left", on=["subject_id","icustay_id"])
df = df.dropna(axis=0, how="any")

In [None]:
# draw 6000 rows (seeded), keep the feature columns only and write the intermediate file
df = df.sample(n=6000, random_state=seed)
feature_cols = ['sex', 'age', 'heart_rate', 'temperature', 'blood_pressure', 'arterial_pressure', 'respiratory_rate', 'oxygen_saturation']
df = df.loc[:, feature_cols].reset_index(drop=True)
df.to_csv("./mimic_tmp.csv")

#### generate treatments and outcomes

In [None]:
# read features: reload the intermediate file written above; its first column is the row index
df = pd.read_csv("./mimic_tmp.csv", index_col=0)

In [None]:
# standardise the continuous features (subtract column mean, divide by column std);
# the binary column is left untouched
cont_cols = ['age', 'heart_rate', 'temperature', 'blood_pressure', 'arterial_pressure', 'respiratory_rate', 'oxygen_saturation']
bin_cols = ['sex']
cont_values = df[cont_cols]
df[cont_cols] = cont_values.sub(cont_values.mean()).div(cont_values.std())

In [None]:
# copy for data generation: intermediate columns are added to gen_df so that df keeps only the features
gen_df = df.copy()

In [None]:
# s(x): combination of blood pressure, arterial pressure and age,
# including an age x blood-pressure interaction
sbp_z = gen_df['blood_pressure']
map_z = gen_df['arterial_pressure']
age_z = gen_df['age']
gen_df['s'] = - 0.7 * sbp_z - 0.7 * map_z - 0.3 * age_z - 0.3 * age_z * sbp_z

In [None]:
# v(x): combination of heart rate and respiratory rate, including their interaction
hr_z = gen_df['heart_rate']
rr_z = gen_df['respiratory_rate']
gen_df['v'] = 0.5 * hr_z + 0.5 * rr_z + 0.2 * hr_z * rr_z

In [None]:
# e(x): propensity score; the sigmoid is rescaled so that e lies between 0.2 and 0.8,
# then the treatment T is drawn as Bernoulli(e)
logit_e = 0.5 * gen_df['v'] + 0.5 * gen_df['s']
propensity = 0.2 + 0.6 * expit(logit_e)
gen_df['e'] = propensity
gen_df['T'] = rng.binomial(n=1, p=gen_df['e'])

In [None]:
# mu_0(x): mean outcome without treatment, summed term by term in the order below
temp_z = gen_df['temperature']
spo2_z = gen_df['oxygen_saturation']
temp_linear = - 2.0 * temp_z
spo2_saturating = 0.5 * np.tanh(1.5 * spo2_z)
temp_spo2_interaction = 0.25 * np.tanh(temp_z) * np.tanh(spo2_z)
sex_effect = 0.15 * gen_df['sex']
age_effect = 0.15 * gen_df['age']
gen_df['M0'] = temp_linear + spo2_saturating - temp_spo2_interaction + sex_effect - age_effect

In [None]:
# tau(x): conditional treatment effect, a function of s only (sigmoid term plus linear term)
score = gen_df['s']
gen_df['cate'] = 2 * expit(1.5 * score - 0.2) + 0.6 * score

In [None]:
# mu_1(x): mean outcome under treatment = untreated mean + treatment effect
gen_df['M1'] = gen_df['M0'].add(gen_df['cate'])

In [None]:
# y(x): potential outcomes are the means plus independent Gaussian noise (sd sigma_y);
# the observed outcome is Y1 for treated rows and Y0 otherwise
sigma_y = 0.6
n_obs = len(gen_df)
noise_0 = rng.normal(loc=0, scale=sigma_y, size=n_obs)  # drawn first, as for Y0
noise_1 = rng.normal(loc=0, scale=sigma_y, size=n_obs)  # drawn second, as for Y1
gen_df['Y0'] = gen_df['M0'] + noise_0
gen_df['Y1'] = gen_df['M1'] + noise_1
treated = gen_df['T'] == 1
gen_df['Y'] = np.where(treated, gen_df['Y1'], gen_df['Y0'])

In [None]:
# copy the generated columns onto the feature table, in this column order
for col in ("T", "M0", "M1", "cate", "Y", "e", "s"):
    df[col] = gen_df[col]

In [None]:
# store: write the features together with T, M0, M1, cate, Y, e and s (row index included)
df.to_csv("./mimic.csv")