In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import cleaning as cl



In [None]:
# Load the sepsis cohort extract from CSV (presumably a BigQuery job export,
# judging by the file name) and show its column index as a quick schema check.
# Later cells read stay_id, hadm_id and subject_id from this frame.
sepsis_df = pd.read_csv(filepath_or_buffer="bquxjob_36223f92_19c1b1111d7.csv")
sepsis_df.columns

Index(['stay_id', 'aki_any', 'aki_max_stage'], dtype='object')

In [5]:
# Standardise the ICU timestamp column names. Keys that are not present in the
# frame are ignored by rename; the sepsis_onset_time entry maps the name to itself.
rename_map = {
    "icu_intime": "intime",
    "icu_outtime": "outtime",
    "sepsis_onset_time": "sepsis_onset_time",
}
sepsis_df = sepsis_df.rename(columns=rename_map)

# Parse the three timestamp columns; errors="coerce" turns unparseable values into NaT.
for col in ("intime", "outtime", "sepsis_onset_time"):
    sepsis_df[col] = pd.to_datetime(sepsis_df[col], errors="coerce")

# ICU length of stay in days: elapsed seconds divided by the seconds in a day.
icu_duration = sepsis_df["outtime"] - sepsis_df["intime"]
sepsis_df["ICU_length"] = icu_duration.dt.total_seconds() / (24 * 3600)
sepsis_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,sepsis_onset_time,sofa_score,ICU_length
0,18212223,24629597,30752269,2136-08-15 03:35:00,2136-08-15 18:38:09,2136-08-14 23:00:00,2,0.627188
1,13736311,23006559,39454408,2178-04-29 04:16:06,2178-05-04 10:04:38,2178-04-28 15:45:00,2,5.242037
2,19085966,21185316,34095671,2156-02-03 02:30:00,2156-02-06 14:03:29,2156-02-03 03:52:00,2,3.481586
3,10147182,22852948,33175266,2179-11-22 20:32:33,2179-11-24 00:53:20,2179-11-22 21:08:00,2,1.1811
4,19669410,20152616,38510130,2143-10-02 18:43:09,2143-10-03 14:31:22,2143-10-02 20:00:00,2,0.82515


In [6]:
# Columns to keep per MIMIC table, keyed by the table's file stem. load_all()
# passes USECOLS.get(name) to pd.read_csv, so a table missing from this mapping
# would be read with all of its columns.
USECOLS = {
    # --- tables read from the hosp directory ---
    "patients": [
        "subject_id", "gender", "anchor_age", "anchor_year", "anchor_year_group",
    ],
    "admissions": [
        "subject_id", "hadm_id", "admittime", "dischtime", "race",
        "hospital_expire_flag",
    ],
    "labevents": ["subject_id", "hadm_id", "itemid", "charttime", "valuenum"],
    "d_labitems": ["itemid", "label"],
    "pharmacy": ["subject_id", "hadm_id", "medication", "starttime"],
    "diagnoses_icd": ["subject_id", "hadm_id", "icd_code", "icd_version", "seq_num"],
    "d_icd_diagnoses": ["icd_code", "long_title"],
    "procedures_icd": ["subject_id", "hadm_id", "icd_code", "icd_version"],
    "d_icd_procedures": ["icd_code", "icd_version", "long_title"],
    "omr": ["subject_id", "result_name", "result_value", "chartdate"],
    # --- tables read from the icu directory ---
    "icustays": ["subject_id", "hadm_id", "stay_id", "intime", "outtime"],
    "chartevents": [
        "subject_id", "hadm_id", "stay_id", "itemid", "charttime", "valuenum",
    ],
    "d_items": ["itemid", "label", "category"],
}
def load_all(
    path_hosp="/Users/kayvans/Documents/mimic/mimic-iv-3.1/hosp",
    path_icu="/Users/kayvans/Documents/mimic/mimic-iv-3.1/icu",
    usecols=None,
):
    """Read the MIMIC tables used by this notebook from gzip-compressed CSVs.

    Each table is read in full from ``<directory>/<table>.csv.gz``, keeping only
    the columns listed for it in the column map.

    Args:
        path_hosp: Directory holding the hosp tables. Defaults to the location
            that was previously hard-coded, so ``load_all()`` behaves as before.
        path_icu: Directory holding the icu tables. Same default rule.
        usecols: Optional mapping of table name -> list of columns to read.
            When None, the module-level ``USECOLS`` mapping is used. A table
            missing from the mapping is read with all of its columns.

    Returns:
        A 13-tuple of DataFrames in this fixed order: patients, admissions,
        labevents, d_labitems, pharmacy, diagnoses_icd, d_icd_diagnoses,
        procedures_icd, d_icd_procedures, omr, icustays, chartevents, d_items.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when a table file
            does not exist under the given directory.
    """
    column_map = USECOLS if usecols is None else usecols

    # File stems in the order callers unpack the result; do not reorder.
    hosp_tables = (
        "patients",
        "admissions",
        "labevents",
        "d_labitems",
        "pharmacy",
        "diagnoses_icd",
        "d_icd_diagnoses",
        "procedures_icd",
        "d_icd_procedures",
        "omr",
    )
    icu_tables = ("icustays", "chartevents", "d_items")

    def read(path, name):
        # Progress line per table; compression is inferred from the .gz suffix.
        print(f"Loading {name}...")
        return pd.read_csv(
            f"{path}/{name}.csv.gz",
            usecols=column_map.get(name),
        )

    frames = [read(path_hosp, name) for name in hosp_tables]
    frames.extend(read(path_icu, name) for name in icu_tables)
    return tuple(frames)

In [7]:
# Unpack the 13 tables in exactly the order load_all() returns them:
# ten hosp tables first, then the three icu tables.
(patients, admissions, labs, d_labitems, pharmacy,
 diagnoses, d_diagnoses, procedures, d_procedures, omr,
 icustays, chartevents, d_items) = load_all()

Loading patients...
Loading admissions...
Loading labevents...
Loading d_labitems...
Loading pharmacy...
Loading diagnoses_icd...
Loading d_icd_diagnoses...
Loading procedures_icd...
Loading d_icd_procedures...
Loading omr...
Loading icustays...
Loading chartevents...
Loading d_items...


In [8]:
# Identifier sets describing the sepsis cohort at stay, admission and patient level.
stay_ids = set(sepsis_df["stay_id"])
hadm_ids = set(sepsis_df["hadm_id"])
subj_ids = set(sepsis_df["subject_id"])

# Restrict every loaded table to cohort rows, using the key each table is filtered on.
# -- keyed by subject_id
patients = patients.loc[patients["subject_id"].isin(subj_ids)]
omr = omr.loc[omr["subject_id"].isin(subj_ids)]
# -- keyed by hadm_id
admissions = admissions.loc[admissions["hadm_id"].isin(hadm_ids)]
labs = labs.loc[labs["hadm_id"].isin(hadm_ids)]
pharmacy = pharmacy.loc[pharmacy["hadm_id"].isin(hadm_ids)]
diagnoses = diagnoses.loc[diagnoses["hadm_id"].isin(hadm_ids)]
procedures = procedures.loc[procedures["hadm_id"].isin(hadm_ids)]
# -- keyed by stay_id
icustays = icustays.loc[icustays["stay_id"].isin(stay_ids)]
chartevents = chartevents.loc[chartevents["stay_id"].isin(stay_ids)]

In [10]:
# Attach patient demographics (by subject_id) and admission details (by hadm_id)
# to the cohort. Left joins keep every cohort row even when no match exists.
base = (
    sepsis_df
    .merge(
        patients[["subject_id", "gender", "anchor_age"]],
        on="subject_id",
        how="left",
    )
    .merge(
        admissions[["hadm_id", "admittime", "dischtime", "race", "hospital_expire_flag"]],
        on="hadm_id",
        how="left",
    )
)
base.shape

(32899, 14)

In [11]:
# Add vital-sign features from chartevents to the base cohort.
# NOTE(review): before/after are presumably hours around a reference time —
# confirm against cleaning.get_vitals.
testing = cl.get_vitals(
    base,
    chartevents=chartevents,
    before=24,
    after=24,
)

In [12]:
# Extend the vitals frame with medication features derived from the pharmacy table
# (the displayed result carries antibiotics_given and vaso_given columns).
medications = cl.get_medications(
    testing,
    pharmacy=pharmacy,
)

In [13]:
# Preview the first five rows of the vitals + medication feature frame.
medications.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,sepsis_onset_time,sofa_score,ICU_length,gender,anchor_age,...,blood_pressure_min,spO2_min,FiO2_max,temperature_max_C,temperature_max_F,gsc_motor_min,gsc_verbal_min,gsc_eye_min,antibiotics_given,vaso_given
0,18212223,24629597,30752269,2136-08-15 03:35:00,2136-08-15 18:38:09,2136-08-14 23:00:00,2,0.627188,M,43,...,70.0,96.0,,,99.1,5.0,1.0,3.0,1,0
1,13736311,23006559,39454408,2178-04-29 04:16:06,2178-05-04 10:04:38,2178-04-28 15:45:00,2,5.242037,F,69,...,56.0,87.0,50.0,,98.2,6.0,5.0,3.0,1,0
2,19085966,21185316,34095671,2156-02-03 02:30:00,2156-02-06 14:03:29,2156-02-03 03:52:00,2,3.481586,M,55,...,86.0,94.0,100.0,,98.5,1.0,1.0,1.0,1,0
3,10147182,22852948,33175266,2179-11-22 20:32:33,2179-11-24 00:53:20,2179-11-22 21:08:00,2,1.1811,M,63,...,52.0,91.0,50.0,,98.2,4.0,1.0,2.0,1,0
4,19669410,20152616,38510130,2143-10-02 18:43:09,2143-10-03 14:31:22,2143-10-02 20:00:00,2,0.82515,M,51,...,83.0,96.0,,,98.6,6.0,1.0,3.0,1,1


In [14]:
# This cell rebinds `labs` from the raw lab events to the per-stay feature table.
# Keep a separate handle on the raw events so that re-running the cell does not
# feed the feature table back into cl.get_labs. Raw events are recognised by
# their "itemid" column (loaded via USECOLS["labevents"]).
# NOTE(review): assumes the frame returned by cl.get_labs has no "itemid"
# column — confirm against cleaning.get_labs.
if "itemid" in labs.columns:
    lab_events = labs
labs = cl.get_labs(medications, labs=lab_events)

In [15]:
# Preview the first five rows of the feature frame after lab features were added.
labs.head(n=5)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,sepsis_onset_time,sofa_score,ICU_length,gender,anchor_age,...,glucose_min,pH_min,lactate_max,platelet_max,wbc_max,hemoglobin_min,ast_max,alt_max,bilirubin_max,inr_max
0,18212223,24629597,30752269,2136-08-15 03:35:00,2136-08-15 18:38:09,2136-08-14 23:00:00,2,0.627188,M,43,...,107.0,,1.9,322.0,17.3,15.1,71.0,61.0,1.0,1.4
1,13736311,23006559,39454408,2178-04-29 04:16:06,2178-05-04 10:04:38,2178-04-28 15:45:00,2,5.242037,F,69,...,90.0,7.37,,159.0,11.7,10.1,,,,
2,19085966,21185316,34095671,2156-02-03 02:30:00,2156-02-06 14:03:29,2156-02-03 03:52:00,2,3.481586,M,55,...,104.0,7.25,5.2,514.0,15.5,12.6,,,,1.2
3,10147182,22852948,33175266,2179-11-22 20:32:33,2179-11-24 00:53:20,2179-11-22 21:08:00,2,1.1811,M,63,...,77.0,7.12,2.4,538.0,21.4,7.6,,,,1.4
4,19669410,20152616,38510130,2143-10-02 18:43:09,2143-10-03 14:31:22,2143-10-02 20:00:00,2,0.82515,M,51,...,160.0,7.37,1.4,192.0,12.8,11.8,,,,1.3


In [16]:
# Derive one temperature value per row with cl.get_max_temperature and store it
# as temp_max_F on the same frame (in place).
# NOTE(review): presumably combines temperature_max_C and temperature_max_F
# into Fahrenheit — confirm against cleaning.get_max_temperature.
row_temperature = labs.apply(cl.get_max_temperature, axis="columns")
labs["temp_max_F"] = row_temperature

In [39]:
# Read the AKI label file and keep only the stay key plus the two
# 24h-onset label columns, in this order.
aki = pd.read_csv("sepsis+aki.csv").loc[
    :, ["stay_id", "aki_24h_onset", "aki_24h_onset_stage"]
]
aki

Unnamed: 0,stay_id,aki_24h_onset,aki_24h_onset_stage
0,33206685,0,0
1,31122843,0,0
2,35470765,0,0
3,33445309,0,0
4,34287204,0,0
...,...,...,...
32894,39418030,1,3
32895,36508279,1,3
32896,37933335,1,3
32897,34447242,1,3


In [41]:
# Join the feature frame with the AKI labels on stay_id. This is an inner join
# (the merge default), so stays missing from either side are dropped.
sepsis_aki = pd.merge(labs, aki, on="stay_id", how="inner")
sepsis_aki

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,sepsis_onset_time,sofa_score,ICU_length,gender,anchor_age,...,platelet_max,wbc_max,hemoglobin_min,ast_max,alt_max,bilirubin_max,inr_max,temp_max_F,aki_24h_onset,aki_24h_onset_stage
0,18212223,24629597,30752269,2136-08-15 03:35:00,2136-08-15 18:38:09,2136-08-14 23:00:00,2,0.627188,M,43,...,322.0,17.3,15.1,71.0,61.0,1.0,1.4,99.10,1,1
1,13736311,23006559,39454408,2178-04-29 04:16:06,2178-05-04 10:04:38,2178-04-28 15:45:00,2,5.242037,F,69,...,159.0,11.7,10.1,,,,,98.20,0,0
2,19085966,21185316,34095671,2156-02-03 02:30:00,2156-02-06 14:03:29,2156-02-03 03:52:00,2,3.481586,M,55,...,514.0,15.5,12.6,,,,1.2,98.50,1,2
3,10147182,22852948,33175266,2179-11-22 20:32:33,2179-11-24 00:53:20,2179-11-22 21:08:00,2,1.181100,M,63,...,538.0,21.4,7.6,,,,1.4,98.20,0,0
4,19669410,20152616,38510130,2143-10-02 18:43:09,2143-10-03 14:31:22,2143-10-02 20:00:00,2,0.825150,M,51,...,192.0,12.8,11.8,,,,1.3,98.60,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32894,17715294,28399871,37357219,2131-07-19 21:42:35,2131-08-02 01:06:23,2131-07-25 12:50:00,17,13.141528,M,70,...,45.0,21.8,8.7,98.0,196.0,19.6,3.3,98.96,1,3
32895,10827578,20980039,34763877,2130-05-04 19:42:04,2130-07-04 16:49:12,2130-05-09 08:00:00,17,60.879954,M,57,...,49.0,31.7,11.7,686.0,965.0,9.4,1.5,98.50,1,3
32896,11713814,28888207,37767813,2159-05-28 02:41:32,2159-06-06 06:51:35,2159-05-31 06:25:00,17,9.173646,F,61,...,45.0,13.9,7.5,334.0,123.0,6.6,4.3,103.00,1,3
32897,16794039,22579763,30606871,2112-01-05 18:35:12,2112-01-14 04:03:13,2112-01-10 12:29:00,20,8.394456,M,60,...,235.0,45.0,12.3,204.0,127.0,19.4,1.5,98.30,1,3


In [43]:
# Class balance of the AKI stage label in the merged dataset.
sepsis_aki.loc[:, "aki_24h_onset_stage"].value_counts()

aki_24h_onset_stage
0    11042
2    10752
1     7267
3     3838
Name: count, dtype: int64

In [44]:
# Persist the merged features and labels to CSV without writing the row index.
sepsis_aki.to_csv(path_or_buf="sepsis_aki_ready.csv", index=False)