In [1]:
import pandas as pd
import os

# Load datasets
base_path = "../data/mimic_demo"
dfs = {}

for folder, dataset_type in [("hosp", "hosp"), ("icu", "icu")]:
    path = os.path.join(base_path, folder)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(path, file), low_memory=False)
            dfs[file.replace(".csv", "")] = df

# Merge ICU stays with Admissions on hadm_id and Patients on subject_id
icu_adm = dfs['icustays'].merge(dfs['admissions'], on=['subject_id', 'hadm_id'], how='left')
icu_adm_pat = icu_adm.merge(dfs['patients'], on='subject_id', how='left')

# Convert date columns to datetime
date_cols = ['intime', 'outtime', 'admittime', 'dischtime', 'deathtime', 'dod']
for col in date_cols:
    icu_adm_pat[col] = pd.to_datetime(icu_adm_pat[col], errors='coerce')

# Categorize mortality
import numpy as np
icu_adm_pat['mortality_category'] = np.nan

# 1️⃣ Died in ICU
icu_adm_pat.loc[
    (icu_adm_pat['hospital_expire_flag'] == 1) & 
    (icu_adm_pat['deathtime'].notna()) & 
    (icu_adm_pat['deathtime'] <= icu_adm_pat['outtime']), 
    'mortality_category'
] = 'Died in ICU'

# 2️⃣ Died in hospital (after ICU discharge)
icu_adm_pat.loc[
    (icu_adm_pat['hospital_expire_flag'] == 1) & 
    (icu_adm_pat['deathtime'].notna()) & 
    (icu_adm_pat['deathtime'] > icu_adm_pat['outtime']), 
    'mortality_category'
] = 'Died in hospital'

# 3️⃣ Died after hospital discharge
icu_adm_pat.loc[
    (icu_adm_pat['hospital_expire_flag'] == 0) & 
    (icu_adm_pat['dod'].notna()), 
    'mortality_category'
] = 'Died after hospital discharge'

# 4️⃣ Still alive (not in the other categories)
icu_adm_pat.loc[icu_adm_pat['mortality_category'].isna(), 'mortality_category'] = 'Alive'

# Keep only relevant columns
final_df = icu_adm_pat[[
    'subject_id', 'hadm_id', 'stay_id', 'gender', 'anchor_age', 'race', 'insurance',
    'admittime', 'dischtime', 'intime', 'outtime', 'los', 
    'hospital_expire_flag', 'deathtime', 'dod', 'mortality_category'
]]

# Save to processed folder
processed_path = "../data/processed"
os.makedirs(processed_path, exist_ok=True)
final_df.to_csv(os.path.join(processed_path, "icu_admissions_processed.csv"), index=False)

print("\n✅ Processed dataset saved: icu_admissions_processed.csv")
print(final_df.head())



✅ Processed dataset saved: icu_admissions_processed.csv
   subject_id   hadm_id   stay_id gender  anchor_age  \
0    10018328  23786647  31269608      F          83   
1    10020187  24104168  37509585      F          63   
2    10020187  26842957  32554129      F          63   
3    10012853  27882036  31338022      F          91   
4    10020740  25826145  32145159      M          56   

                           race insurance           admittime  \
0                         WHITE     Other 2154-04-24 03:15:00   
1  HISPANIC/LATINO - SALVADORAN     Other 2169-01-15 04:04:00   
2  HISPANIC/LATINO - SALVADORAN     Other 2170-02-24 00:00:00   
3        BLACK/AFRICAN AMERICAN  Medicare 2176-11-25 21:28:00   
4                         WHITE     Other 2150-06-03 20:12:00   

            dischtime              intime             outtime       los  \
0 2154-05-03 14:00:00 2154-04-24 23:03:44 2154-05-02 15:55:21  7.702512   
1 2169-01-24 17:20:00 2169-01-15 04:56:00 2169-01-20 15:47:50  5.

  icu_adm_pat.loc[


In [2]:
final_df['mortality_category'].value_counts()


mortality_category
Alive                            91
Died after hospital discharge    29
Died in ICU                      12
Died in hospital                  8
Name: count, dtype: int64

In [3]:
unique_dead_patients = final_df[final_df['mortality_category'] != "Alive"]['subject_id'].nunique()
print(f"Unique patients who died: {unique_dead_patients}")


Unique patients who died: 31


In [4]:
final_df.groupby('subject_id')['mortality_category'].nunique().value_counts()


mortality_category
1    94
2     5
3     1
Name: count, dtype: int64

In [6]:
final_df.groupby('subject_id').agg({'mortality_category': 'first'}).value_counts()


mortality_category           
Alive                            69
Died after hospital discharge    18
Died in ICU                       9
Died in hospital                  4
Name: count, dtype: int64

In [7]:
# Define keywords for each treatment type
vasopressor_keywords = ['dopamine', 'epinephrine', 'norepinephrine', 'phenylephrine', 'vasopressin']
ventilation_keywords = ['ventilation', 'respirator', 'oxygen', 'mechanical', 'intubation']
rrt_keywords = ['renal', 'dialysis', 'RRT', 'kidney', 'hemofiltration']

# Function to filter d_items based on keywords
def filter_d_items_by_keywords(d_items_df, keywords, category_name):
    filtered = d_items_df[d_items_df['label'].str.contains('|'.join(keywords), case=False, na=False)]
    print(f"\n🔍 {category_name} - Found {len(filtered)} items")
    print(filtered[['itemid', 'label', 'category']])
    return filtered

# Load d_items.csv
d_items = dfs['d_items']

# Filter for each category
vasopressors_items = filter_d_items_by_keywords(d_items, vasopressor_keywords, "Vasopressors")
ventilation_items = filter_d_items_by_keywords(d_items, ventilation_keywords, "Mechanical Ventilation")
rrt_items = filter_d_items_by_keywords(d_items, rrt_keywords, "Renal Replacement Therapy")


🔍 Vasopressors - Found 10 items
      itemid                          label     category
2256  229789     Phenylephrine (Intubation)   Intubation
3088  229617                   Epinephrine.  Medications
3108  221662                       Dopamine  Medications
3109  222315                    Vasopressin  Medications
3121  229630         Phenylephrine (50/250)  Medications
3146  221906                 Norepinephrine  Medications
3151  221289                    Epinephrine  Medications
3167  229631  Phenylephrine (200/250)_OLD_1  Medications
3174  221749                  Phenylephrine  Medications
3198  229632        Phenylephrine (200/250)  Medications

🔍 Mechanical Ventilation - Found 83 items
      itemid                                              label  \
3     229875                            Oxygenator visible (CH)   
9     229274                          Oxygenator visible (ECMO)   
208   223985                                Respiratory Pattern   
223   223990                 

In [8]:
import pandas as pd
import os

# Load necessary datasets
base_path = "../data/mimic_demo/icu"
dfs = {}
for file in os.listdir(base_path):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(base_path, file), low_memory=False)
        dfs[file.replace(".csv", "")] = df

# Load processed ICU admissions data
processed_path = "../data/processed/icu_admissions_processed.csv"
icu_admissions = pd.read_csv(processed_path)

# Relevant Item IDs
vasopressor_ids = [229789, 229617, 221662, 222315, 229630, 221906, 221289, 229631, 221749, 229632]
ventilation_ids = [225794, 225792, 224385]
rrt_ids = [225441, 225802, 225805, 225809, 225955]

# Extract Vasopressor Usage
vasopressor_data = dfs['inputevents'][dfs['inputevents']['itemid'].isin(vasopressor_ids)]
vasopressor_data = vasopressor_data[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'itemid', 'amount', 'rate']]
vasopressor_data['vasopressor_administered'] = 1

# Extract Mechanical Ventilation Usage
ventilation_data = dfs['chartevents'][dfs['chartevents']['itemid'].isin(ventilation_ids)]
ventilation_data = ventilation_data[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid']]
ventilation_data['mechanical_ventilation'] = 1

# Extract Renal Replacement Therapy (RRT) Usage
rrt_data = dfs['procedureevents'][dfs['procedureevents']['itemid'].isin(rrt_ids)]
rrt_data = rrt_data[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'itemid']]
rrt_data['rrt_administered'] = 1

# Merge treatments with ICU admissions data
icu_adm_treatments = icu_admissions.merge(vasopressor_data, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
icu_adm_treatments = icu_adm_treatments.merge(ventilation_data, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
icu_adm_treatments = icu_adm_treatments.merge(rrt_data, on=['subject_id', 'hadm_id', 'stay_id'], how='left')

# Fill NaN values (no treatment received)
icu_adm_treatments[['vasopressor_administered', 'mechanical_ventilation', 'rrt_administered']] = icu_adm_treatments[
    ['vasopressor_administered', 'mechanical_ventilation', 'rrt_administered']].fillna(0)

# Save processed data
icu_adm_treatments.to_csv("../data/processed/icu_treatments_processed.csv", index=False)

print("\n✅ Processed dataset saved: icu_treatments_processed.csv")



✅ Processed dataset saved: icu_treatments_processed.csv


In [None]:
# Load d_items.csv
d_items_path = "../data/mimic_demo/icu/d_items.csv"
d_items = pd.read_csv(d_items_path)

# Define keywords for each treatment type
vasopressor_keywords = ['dopamine', 'epinephrine', 'norepinephrine', 'phenylephrine', 'vasopressin']
ventilation_keywords = ['ventilation', 'respirator', 'oxygen', 'mechanical', 'intubation']
rrt_keywords = ['renal', 'dialysis', 'RRT', 'kidney', 'hemofiltration']

# Function to filter d_items based on keywords
def filter_d_items_by_keywords(d_items_df, keywords, category_name):
    filtered = d_items_df[d_items_df['label'].str.contains('|'.join(keywords), case=False, na=False)].copy()
    filtered['treatment_category'] = category_name
    return filtered

# Filter for each category
vasopressors_items = filter_d_items_by_keywords(d_items, vasopressor_keywords, "Vasopressors")
ventilation_items = filter_d_items_by_keywords(d_items, ventilation_keywords, "Mechanical Ventilation")
rrt_items = filter_d_items_by_keywords(d_items, rrt_keywords, "Renal Replacement Therapy")

# Combine into one DataFrame
combined_items = pd.concat([vasopressors_items, ventilation_items, rrt_items], ignore_index=True)

# Save to processed folder
processed_path = "../data/processed"
os.makedirs(processed_path, exist_ok=True)
output_csv = os.path.join(processed_path, "filtered_d_items.csv")
combined_items.to_csv(output_csv, index=False)

print(f"✅ Processed d_items saved: {output_csv}")

✅ Processed d_items saved: ../data/processed\filtered_d_items.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['treatment_category'] = category_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['treatment_category'] = category_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['treatment_category'] = category_name
