In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath("../scripts"))
import feature_engineering as fe



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Read the sepsis cohort extract and show which columns it provides.
sepsis_df = pd.read_csv(
    "/Users/kayvans/Documents/sepsis-causal-discovery/data/raw/sepsis_cohort.csv"
)
sepsis_df.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'icu_intime', 'icu_outtime',
       'sepsis_onset_time', 'sofa_score', 'aki_24h_onset',
       'aki_24h_onset_stage', 'aki_post24h', 'aki_post24h_stage',
       'mechvent_24h_onset', 'mechvent_post24h'],
      dtype='str')

In [6]:
# Drop the "icu_" prefix from the ICU timestamp columns; the onset column keeps its name.
rename_map = {
    "icu_intime": "intime",
    "icu_outtime": "outtime",
    "sepsis_onset_time": "sepsis_onset_time",
}
sepsis_df = sepsis_df.rename(columns=rename_map)

# Parse the timestamp columns; values that cannot be parsed become NaT (errors="coerce").
for col in ("intime", "outtime", "sepsis_onset_time"):
    sepsis_df[col] = pd.to_datetime(sepsis_df[col], errors="coerce")

# ICU length of stay expressed in days (seconds divided by 24 * 3600).
icu_stay_seconds = (sepsis_df["outtime"] - sepsis_df["intime"]).dt.total_seconds()
sepsis_df["ICU_length"] = icu_stay_seconds / (24 * 3600)


In [7]:
# Columns to read from each MIMIC-IV table, keyed by the table's file stem.
# Restricting the columns keeps the large event tables smaller in memory.
USECOLS = {
    # --- tables read from the hosp directory ---
    "patients": [
        "subject_id", "gender", "anchor_age", "anchor_year", "anchor_year_group",
    ],
    "admissions": [
        "subject_id", "hadm_id", "admittime", "dischtime", "race",
        "hospital_expire_flag",
    ],
    "labevents": ["subject_id", "hadm_id", "itemid", "charttime", "valuenum"],
    "d_labitems": ["itemid", "label"],
    "pharmacy": ["subject_id", "hadm_id", "medication", "starttime"],
    "diagnoses_icd": ["subject_id", "hadm_id", "icd_code", "icd_version", "seq_num"],
    "d_icd_diagnoses": ["icd_code", "long_title"],
    "procedures_icd": ["subject_id", "hadm_id", "icd_code", "icd_version"],
    "d_icd_procedures": ["icd_code", "icd_version", "long_title"],
    "omr": ["subject_id", "result_name", "result_value", "chartdate"],
    # --- tables read from the icu directory ---
    "icustays": ["subject_id", "hadm_id", "stay_id", "intime", "outtime"],
    "chartevents": [
        "subject_id", "hadm_id", "stay_id", "itemid", "charttime", "valuenum",
    ],
    "d_items": ["itemid", "label", "category"],
}
def load_all(
    path_hosp="/Users/kayvans/Documents/mimic/mimic-iv-3.1/hosp",
    path_icu="/Users/kayvans/Documents/mimic/mimic-iv-3.1/icu",
):
    """Read the MIMIC-IV tables used by this notebook from gzipped CSV files.

    Each table is read from ``{path}/{name}.csv.gz`` and restricted to the
    columns listed for it in ``USECOLS``. Every file is read in full; no row
    filtering happens here.

    Parameters
    ----------
    path_hosp : str or path-like, optional
        Directory holding the hosp tables. Defaults to the original
        hard-coded location, so ``load_all()`` behaves as before.
    path_icu : str or path-like, optional
        Directory holding the icu tables. Defaults to the original
        hard-coded location.

    Returns
    -------
    tuple of pandas.DataFrame
        Thirteen frames, in this order: patients, admissions, labevents,
        d_labitems, pharmacy, diagnoses_icd, d_icd_diagnoses, procedures_icd,
        d_icd_procedures, omr, icustays, chartevents, d_items.
    """

    def read(path, name):
        # Progress message: the event tables can take a long time to load.
        print(f"Loading {name}...")
        return pd.read_csv(
            f"{path}/{name}.csv.gz",
            usecols=USECOLS.get(name),
        )

    # The order below defines the order of the returned tuple; callers unpack
    # positionally, so it must not change.
    hosp_tables = [
        "patients",
        "admissions",
        "labevents",
        "d_labitems",
        "pharmacy",
        "diagnoses_icd",
        "d_icd_diagnoses",
        "procedures_icd",
        "d_icd_procedures",
        "omr",
    ]
    icu_tables = ["icustays", "chartevents", "d_items"]

    frames = [read(path_hosp, name) for name in hosp_tables]
    frames += [read(path_icu, name) for name in icu_tables]
    return tuple(frames)

In [8]:
# Unpack positionally, in the exact order load_all() returns the tables.
(patients, admissions, labs, d_labitems, pharmacy,
 diagnoses, d_diagnoses, procedures, d_procedures, omr,
 icustays, chartevents, d_items) = load_all()

Loading patients...
Loading admissions...
Loading labevents...
Loading d_labitems...
Loading pharmacy...
Loading diagnoses_icd...
Loading d_icd_diagnoses...
Loading procedures_icd...
Loading d_icd_procedures...
Loading omr...
Loading icustays...
Loading chartevents...
Loading d_items...


In [9]:
# Identifier sets that define the sepsis cohort.
subj_ids = set(sepsis_df["subject_id"])
hadm_ids = set(sepsis_df["hadm_id"])
stay_ids = set(sepsis_df["stay_id"])

# Subject-level tables: keep rows whose subject_id is in the cohort.
patients = patients.loc[patients["subject_id"].isin(subj_ids)]
omr = omr.loc[omr["subject_id"].isin(subj_ids)]

# Admission-level tables: keep rows whose hadm_id is in the cohort.
admissions = admissions.loc[admissions["hadm_id"].isin(hadm_ids)]
labs = labs.loc[labs["hadm_id"].isin(hadm_ids)]
pharmacy = pharmacy.loc[pharmacy["hadm_id"].isin(hadm_ids)]
diagnoses = diagnoses.loc[diagnoses["hadm_id"].isin(hadm_ids)]
procedures = procedures.loc[procedures["hadm_id"].isin(hadm_ids)]

# ICU-stay-level tables: keep rows whose stay_id is in the cohort.
icustays = icustays.loc[icustays["stay_id"].isin(stay_ids)]
chartevents = chartevents.loc[chartevents["stay_id"].isin(stay_ids)]

In [10]:
# Attach patient demographics (by subject_id) and admission details (by hadm_id)
# to every cohort row.
# validate="many_to_one" makes pandas raise MergeError if the right-hand table
# has duplicate keys, so a left join can never silently duplicate cohort rows.
base = sepsis_df.merge(
    patients[["subject_id", "gender", "anchor_age"]],
    on="subject_id",
    how="left",
    validate="many_to_one",
)
base = base.merge(
    admissions[["hadm_id", "admittime", "dischtime", "race", "hospital_expire_flag"]],
    on="hadm_id",
    how="left",
    validate="many_to_one",
)
base.shape

(32899, 20)

In [None]:
# Run the project's vitals extraction on the cohort frame and the filtered chartevents.
# NOTE(review): before/after=24 are presumably hours — confirm in feature_engineering.
testing = fe.get_vitals(
    base,
    before=24,
    after=24,
    chartevents=chartevents,
)

In [None]:
# Pass the get_vitals result and the pharmacy table to the project's medication step.
medications = fe.get_medications(
    testing,
    pharmacy=pharmacy,
)

In [None]:
# Pass the get_medications result and the lab events to the project's lab step.
# NOTE(review): this rebinds `labs` from the lab-events table to the function's
# result, so re-running this cell would feed that result back in as `labs=`.
# Consider a distinct name for the result (and updating later uses of `labs`).
labs = fe.get_labs(
    medications,
    labs=labs,
)

In [None]:
# Row-wise (axis=1) application of fe.get_max_temperature; the result is stored in
# a new "temp_max_F" column.
# NOTE(review): the column name suggests degrees Fahrenheit — confirm in feature_engineering.
labs["temp_max_F"] = labs.apply(
    fe.get_max_temperature,
    axis=1,
)