# ARDS Cohort Definition - UPDATED CRITERIA

This notebook identifies the cohort for analyzing timing of proning and neuromuscular blockade in ARDS patients.

## NEW Inclusion Criteria:
- Adults (≥18 years)
- At least one ICU admission
- PEEP ≥ 5 within first 48 hours of ICU admission
- S/F ratio < 315 at least once (SpO2/FiO2)
- At least one radiology report

## Exclusion Criteria:
- Pregnant patients
- Patients with heart failure

In [32]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Define MIMIC data path
MIMIC_PATH = '/Users/kavenchhikara/Desktop/CLIF/MIMIC-IV-3.1/physionet.org/files'

print(f"MIMIC data path: {MIMIC_PATH}")
print(f"Analysis start time: {datetime.now()}")

MIMIC data path: /Users/kavenchhikara/Desktop/CLIF/MIMIC-IV-3.1/physionet.org/files
Analysis start time: 2025-07-20 00:20:53.279952


## Step 1: Load Core Tables

In [33]:
# Load patient demographics
print("Loading patient demographics...")
patients = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/hosp/patients.csv.gz')
print(f"Total patients: {len(patients):,}")

# Load admissions
print("\nLoading admissions...")
admissions = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/hosp/admissions.csv.gz')
print(f"Total admissions: {len(admissions):,}")

# Load ICU stays
print("\nLoading ICU stays...")
icustays = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/icu/icustays.csv.gz')
print(f"Total ICU stays: {len(icustays):,}")

Loading patient demographics...
Total patients: 364,627

Loading admissions...
Total admissions: 546,028

Loading ICU stays...
Total ICU stays: 94,458


## Step 2: Filter to Adult Patients with ICU Admissions

In [34]:
# Calculate age at admission
patients['anchor_year'] = pd.to_numeric(patients['anchor_year'])
patients['anchor_age'] = pd.to_numeric(patients['anchor_age'])

# Merge with admissions to get admission year
admissions['admittime'] = pd.to_datetime(admissions['admittime'])
admissions['admit_year'] = admissions['admittime'].dt.year

# Merge patients with admissions
patient_admissions = admissions.merge(patients[['subject_id', 'anchor_age', 'anchor_year', 'gender']], 
                                     on='subject_id', how='left')

# Calculate age at admission
patient_admissions['age_at_admission'] = (patient_admissions['anchor_age'] + 
                                         (patient_admissions['admit_year'] - patient_admissions['anchor_year']))

# Filter adults only
adult_admissions = patient_admissions[patient_admissions['age_at_admission'] >= 18].copy()
print(f"Adult admissions (≥18 years): {len(adult_admissions):,}")
print(f"Unique adult patients: {adult_admissions['subject_id'].nunique():,}")

# NEW: Filter to admissions with at least one ICU stay
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])

# Get admissions with ICU stays
admissions_with_icu = adult_admissions.merge(
    icustays[['hadm_id', 'stay_id', 'intime', 'outtime']].drop_duplicates(['hadm_id']),
    on='hadm_id',
    how='inner'
)

print(f"\nAdult admissions with ICU stays: {len(admissions_with_icu):,}")
print(f"Unique patients with ICU stays: {admissions_with_icu['subject_id'].nunique():,}")

Adult admissions (≥18 years): 546,028
Unique adult patients: 223,452

Adult admissions with ICU stays: 85,242
Unique patients with ICU stays: 65,366


In [35]:
# Vectorized approach for extracting PEEP and S/F ratio criteria
print("Loading chartevents for cohort filtering (vectorized approach)...")

# Define all itemids we need
peep_itemids = [220339, 224700, 224699]  # PEEP set, Total PEEP, Auto PEEP
spo2_itemids = [220277, 224696]  # SpO2 pulse oximetry
fio2_itemids = [220210, 223835]  # FiO2 (%) and FiO2 (fraction)
all_itemids = peep_itemids + spo2_itemids + fio2_itemids

# Get all ICU stays for our cohort
cohort_icustays = icustays[icustays['hadm_id'].isin(admissions_with_icu['hadm_id'])].copy()
cohort_stay_ids = set(cohort_icustays['stay_id'])
print(f"ICU stays for adult cohort: {len(cohort_icustays):,}")

print("Loading chartevents data (this may take a few minutes)...")
# SINGLE pass through chartevents - load all needed data at once
chartevents = pd.read_csv(
    f'{MIMIC_PATH}/mimiciv/3.1/icu/chartevents.csv.gz',
    usecols=['stay_id', 'itemid', 'charttime', 'valuenum']
)

print(f"Total chartevents loaded: {len(chartevents):,}")

# Filter to our cohort and items in one operation (vectorized)
cohort_data = chartevents[
    (chartevents['stay_id'].isin(cohort_stay_ids)) &
    (chartevents['itemid'].isin(all_itemids)) &
    (chartevents['valuenum'].notna())
].copy()

print(f"Relevant measurements for our cohort: {len(cohort_data):,}")

# Add parameter types (vectorized)
cohort_data['param_type'] = 'unknown'
cohort_data.loc[cohort_data['itemid'].isin(peep_itemids), 'param_type'] = 'peep'
cohort_data.loc[cohort_data['itemid'].isin(spo2_itemids), 'param_type'] = 'spo2'
cohort_data.loc[cohort_data['itemid'].isin(fio2_itemids), 'param_type'] = 'fio2'

# Convert times and merge with ICU stay info
cohort_data['charttime'] = pd.to_datetime(cohort_data['charttime'])
cohort_data = cohort_data.merge(
    cohort_icustays[['stay_id', 'hadm_id', 'intime']], 
    on='stay_id'
)

# Calculate hours from ICU admission (vectorized)
cohort_data['hours_from_icu'] = (
    cohort_data['charttime'] - cohort_data['intime']
).dt.total_seconds() / 3600

print("Processing PEEP criteria...")
# Process PEEP filter: PEEP ≥ 5 within first 48 hours (vectorized)
peep_first_48h = cohort_data[
    (cohort_data['param_type'] == 'peep') &
    (cohort_data['hours_from_icu'] >= 0) &
    (cohort_data['hours_from_icu'] <= 48) &
    (cohort_data['valuenum'] >= 5)
]

admissions_with_peep = set(peep_first_48h['hadm_id'].unique())
print(f"Admissions with PEEP ≥5 in first 48h: {len(admissions_with_peep):,}")

# Filter data to PEEP-qualifying admissions for efficiency
cohort_data_peep = cohort_data[cohort_data['hadm_id'].isin(admissions_with_peep)]
print(f"Measurements after PEEP filter: {len(cohort_data_peep):,}")

# Clear original chartevents to free memory
del chartevents
import gc
gc.collect()

Loading chartevents for cohort filtering (vectorized approach)...
ICU stays for adult cohort: 94,458
Loading chartevents data (this may take a few minutes)...
Total chartevents loaded: 432,997,491
Relevant measurements for our cohort: 19,758,934


KeyboardInterrupt: 

# Extract SpO2 and FiO2 data (vectorized)

In [None]:
print("Processing S/F ratio criteria...")

spo2_data = cohort_data_peep[cohort_data_peep['param_type'] == 'spo2'].copy()
fio2_data = cohort_data_peep[cohort_data_peep['param_type'] == 'fio2'].copy()

print(f"SpO2 measurements: {len(spo2_data):,}")
print(f"FiO2 measurements: {len(fio2_data):,}")

if len(spo2_data) > 0 and len(fio2_data) > 0:
    # Convert FiO2 percentages to fractions (vectorized)
    fio2_data.loc[fio2_data['valuenum'] > 1, 'valuenum'] /= 100
    
    # Sort data for merge_asof (time-based matching)
    spo2_data = spo2_data.sort_values(['hadm_id', 'charttime'])
    fio2_data = fio2_data.sort_values(['hadm_id', 'charttime'])
    
    print("Matching SpO2 and FiO2 measurements within 2-hour windows...")
    # Use pandas merge_asof for efficient time-based matching
    sf_ratios = pd.merge_asof(
        spo2_data[['hadm_id', 'charttime', 'valuenum']],
        fio2_data[['hadm_id', 'charttime', 'valuenum']],
        on='charttime',
        by='hadm_id',
        tolerance=pd.Timedelta(hours=2),
        direction='nearest',
        suffixes=('_spo2', '_fio2')
    )
    
    # Calculate S/F ratios (vectorized)
    sf_ratios = sf_ratios.dropna(subset=['valuenum_fio2'])
    sf_ratios = sf_ratios[sf_ratios['valuenum_fio2'] > 0]  # Avoid division by zero
    sf_ratios['sf_ratio'] = sf_ratios['valuenum_spo2'] / sf_ratios['valuenum_fio2']
    
    print(f"Successful S/F ratio calculations: {len(sf_ratios):,}")
    print(f"S/F ratio distribution:")
    print(sf_ratios['sf_ratio'].describe())
    
    # Filter for S/F < 315
    low_sf_ratios = sf_ratios[sf_ratios['sf_ratio'] < 315]
    admissions_with_low_sf = set(low_sf_ratios['hadm_id'].unique())
    
    print(f"\nAdmissions with S/F ratio < 315: {len(admissions_with_low_sf):,}")
    
    # Get final qualifying admissions (both PEEP and S/F criteria)
    qualifying_admissions = admissions_with_peep.intersection(admissions_with_low_sf)
    print(f"Admissions meeting both PEEP and S/F criteria: {len(qualifying_admissions):,}")
    
    # Filter cohort
    cohort_with_criteria = admissions_with_icu[
        admissions_with_icu['hadm_id'].isin(qualifying_admissions)
    ].copy()
    
    print(f"Cohort after PEEP and S/F filters: {len(cohort_with_criteria):,}")
    
else:
    print("Insufficient SpO2 or FiO2 data!")
    cohort_with_criteria = pd.DataFrame()

# Clear processed data to free memory
del cohort_data, cohort_data_peep
if 'spo2_data' in locals():
    del spo2_data, fio2_data
gc.collect()

In [None]:
# Filter for patients with radiology reports
print("Filtering for patients with radiology reports...")
radiology = pd.read_csv(f'{MIMIC_PATH}/mimic-iv-note/2.2/note/radiology.csv.gz')
print(f"Total radiology reports: {len(radiology):,}")

# Get admissions with at least one radiology report
admissions_with_radiology = set(radiology['hadm_id'].dropna().unique())
print(f"Admissions with radiology reports: {len(admissions_with_radiology):,}")

# Filter cohort to those with radiology reports
if len(cohort_with_criteria) > 0:
    cohort_with_radiology = cohort_with_criteria[
        cohort_with_criteria['hadm_id'].isin(admissions_with_radiology)
    ].copy()
    print(f"Cohort with radiology reports: {len(cohort_with_radiology):,}")
else:
    cohort_with_radiology = pd.DataFrame()
    print("No cohort data to filter for radiology!")

# Clear radiology data to free memory
del radiology
gc.collect()

# Load radiology reports

In [None]:
print("Loading radiology reports...")
radiology = pd.read_csv(f'{MIMIC_PATH}/mimic-iv-note/2.2/note/radiology.csv.gz')
print(f"Total radiology reports: {len(radiology):,}")

# Get patients with at least one radiology report
patients_with_radiology = radiology[['subject_id', 'hadm_id']].drop_duplicates()
print(f"Unique patients with radiology: {patients_with_radiology['subject_id'].nunique():,}")

# Filter cohort to those with radiology reports
cohort_with_radiology = cohort_with_sf.merge(
    patients_with_radiology[['hadm_id']].drop_duplicates(), 
    on='hadm_id', 
    how='inner'
)
print(f"\nCohort with radiology reports: {len(cohort_with_radiology):,}")

In [None]:
# Apply exclusion criteria (Heart Failure and Pregnancy)
print("Applying exclusion criteria...")
diagnoses = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/hosp/diagnoses_icd.csv.gz')

# Heart failure ICD codes (vectorized)
hf_icd9_codes = [str(x) for x in range(4280, 4290)]  # 428.0 - 428.9
hf_icd10_codes = ['I50' + str(x) for x in range(10)] + ['I50']  # I50, I50.0 - I50.9

hf_diagnoses = diagnoses[
    (diagnoses['icd_code'].str.startswith(tuple(hf_icd9_codes))) |
    (diagnoses['icd_code'].str.startswith(tuple(hf_icd10_codes)))
]
hf_hadm_ids = set(hf_diagnoses['hadm_id'].unique())
print(f"Admissions with heart failure: {len(hf_hadm_ids):,}")

# Pregnancy ICD codes (vectorized)
pregnancy_icd9_prefixes = [str(x) for x in range(630, 680)]
pregnancy_icd10_prefix = 'O'

pregnancy_diagnoses = diagnoses[
    (diagnoses['icd_code'].str[:3].isin(pregnancy_icd9_prefixes)) |
    (diagnoses['icd_code'].str.startswith(pregnancy_icd10_prefix))
]
pregnant_hadm_ids = set(pregnancy_diagnoses['hadm_id'].unique())
print(f"Admissions with pregnancy codes: {len(pregnant_hadm_ids):,}")

# Apply exclusions if we have cohort data
if len(cohort_with_radiology) > 0:
    # Mark exclusion criteria
    cohort_with_radiology['has_heart_failure'] = cohort_with_radiology['hadm_id'].isin(hf_hadm_ids)
    cohort_with_radiology['is_pregnant'] = cohort_with_radiology['hadm_id'].isin(pregnant_hadm_ids)
    
    print(f"\nCohort patients with HF: {cohort_with_radiology['has_heart_failure'].sum():,}")
    print(f"Cohort patients who are pregnant: {cohort_with_radiology['is_pregnant'].sum():,}")
    
    # Apply exclusions (vectorized)
    final_cohort = cohort_with_radiology[
        (~cohort_with_radiology['has_heart_failure']) & 
        (~cohort_with_radiology['is_pregnant'])
    ].copy()
    
    print(f"\nFinal cohort after exclusions: {len(final_cohort):,}")
    print(f"Unique patients: {final_cohort['subject_id'].nunique():,}")
    
    # Add admission and discharge times
    final_cohort['admission_dttm'] = final_cohort['admittime']
    final_cohort['discharge_dttm'] = final_cohort['dischtime']
    
else:
    final_cohort = pd.DataFrame()
    print("No cohort data to apply exclusions to!")

# Clear diagnoses data to free memory
del diagnoses
gc.collect()

## Step 7: Summary Statistics

In [None]:
# Load diagnoses
print("Loading diagnoses for heart failure identification...")
diagnoses = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/hosp/diagnoses_icd.csv.gz')

# Heart failure ICD codes
# ICD-9: 428.x
# ICD-10: I50.x
hf_icd9_codes = [str(x) for x in range(4280, 4290)]  # 428.0 - 428.9
hf_icd10_codes = ['I50' + str(x) for x in range(10)] + ['I50']  # I50, I50.0 - I50.9

# Find patients with heart failure
hf_diagnoses = diagnoses[
    (diagnoses['icd_code'].str.startswith(tuple(hf_icd9_codes))) |
    (diagnoses['icd_code'].str.startswith(tuple(hf_icd10_codes)))
]

hf_hadm_ids = set(hf_diagnoses['hadm_id'].unique())
print(f"Admissions with heart failure: {len(hf_hadm_ids):,}")

# Mark heart failure in cohort
cohort_with_vent['has_heart_failure'] = cohort_with_vent['hadm_id'].isin(hf_hadm_ids)
print(f"Cohort patients with HF: {cohort_with_vent['has_heart_failure'].sum():,}")

Loading diagnoses for heart failure identification...
Admissions with heart failure: 80,611
Cohort patients with HF: 17,000


## Step 6: Identify Pregnant Patients

In [None]:
# Pregnancy ICD codes
# ICD-9: 630-679 (pregnancy, childbirth, and the puerperium)
# ICD-10: O00-O99 (pregnancy, childbirth and the puerperium)
print("Loading diagnoses for pregnant patients...")
diagnoses = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/hosp/diagnoses_icd.csv.gz')

# 398.91|402.01|402.11|402.91|404.01|404.03|404.11|404.13|404.91|404.93|428
pregnancy_icd9_prefixes = [str(x) for x in range(630, 680)]
pregnancy_icd10_prefix = 'O'

# Find pregnant patients
pregnancy_diagnoses = diagnoses[
    (diagnoses['icd_code'].str[:3].isin(pregnancy_icd9_prefixes)) |
    (diagnoses['icd_code'].str.startswith(pregnancy_icd10_prefix))
]

pregnant_hadm_ids = set(pregnancy_diagnoses['hadm_id'].unique())
print(f"Admissions with pregnancy codes: {len(pregnant_hadm_ids):,}")

# Mark pregnancy in cohort
cohort_with_vent['is_pregnant'] = cohort_with_vent['hadm_id'].isin(pregnant_hadm_ids)
print(f"Cohort patients who are pregnant: {cohort_with_vent['is_pregnant'].sum():,}")

Loading diagnoses for pregnant patients...
Admissions with pregnancy codes: 26,549
Cohort patients who are pregnant: 322


In [None]:
# Find patients who are both pregnant and have heart failure
pregnant_hf_hadm_ids = pregnant_hadm_ids.intersection(hf_hadm_ids)
cohort_with_vent['is_pregnant_hf'] = cohort_with_vent['hadm_id'].isin(pregnant_hf_hadm_ids)
print(f"Cohort patients who are pregnant and have HF: {cohort_with_vent['is_pregnant_hf'].sum():,}")

Cohort patients who are pregnant and have HF: 19


## Step 7: Apply Exclusion Criteria

In [None]:
# Apply exclusions
final_cohort = cohort_with_vent[
    (~cohort_with_vent['has_heart_failure']) & 
    (~cohort_with_vent['is_pregnant'])
].copy()

print(f"Final cohort after exclusions: {len(final_cohort):,}")
print(f"Unique patients: {final_cohort['subject_id'].nunique():,}")

# Add admission and discharge times
final_cohort['admission_dttm'] = final_cohort['admittime']
final_cohort['discharge_dttm'] = final_cohort['dischtime']

Final cohort after exclusions: 45,506
Unique patients: 37,492


# Summary statistics for the final cohort
if len(final_cohort) > 0:
    print("=== FINAL COHORT SUMMARY ===")
    print(f"\nTotal admissions: {len(final_cohort):,}")
    print(f"Unique patients: {final_cohort['subject_id'].nunique():,}")
    
    print(f"\nAge distribution:")
    print(final_cohort['age_at_admission'].describe())
    
    print(f"\nGender distribution:")
    print(final_cohort['gender'].value_counts())
    
    print(f"\nAdmission type:")
    print(final_cohort['admission_type'].value_counts())
    
    print(f"\nAdmission location:")
    print(final_cohort['admission_location'].value_counts().head(10))
    
    print(f"\nDischarge location:")
    print(final_cohort['discharge_location'].value_counts().head(10))
    
    # Hospital mortality
    final_cohort['mortality'] = final_cohort['hospital_expire_flag']
    print(f"\nHospital mortality: {final_cohort['mortality'].sum():,} ({final_cohort['mortality'].mean()*100:.1f}%)")
    
    # ICU length of stay
    icu_los = final_cohort['outtime'] - final_cohort['intime']
    final_cohort['icu_los_days'] = icu_los.dt.total_seconds() / (24 * 3600)
    print(f"\nICU Length of Stay (days):")
    print(final_cohort['icu_los_days'].describe())
    
else:
    print("No final cohort data available for summary!")

In [None]:
## Step 8: Save Cohort

## Step 9: Save Cohort

In [None]:
# Save the final cohort
if len(final_cohort) > 0:
    # Create output directory
    import os
    output_dir = '/Users/kavenchhikara/Desktop/projects/SCCM/SCCM-Team2/ards_analysis/data'
    os.makedirs(output_dir, exist_ok=True)
    
    # Select key columns for cohort
    cohort_columns = [
        'subject_id', 'hadm_id', 'admission_dttm', 'discharge_dttm',
        'age_at_admission', 'gender', 'admission_type',
        'admission_location', 'discharge_location', 'insurance',
        'marital_status', 'mortality', 'has_heart_failure', 'is_pregnant',
        'icu_los_days', 'intime', 'outtime'
    ]
    
    # Save cohort
    cohort_file = f'{output_dir}/base_cohort_updated.parquet'
    final_cohort[cohort_columns].to_parquet(cohort_file, index=False)
    print(f"\nUpdated cohort saved to: {cohort_file}")
    print(f"File size: {os.path.getsize(cohort_file) / 1024 / 1024:.1f} MB")
    
    # Also save summary statistics
    summary_stats = {
        'total_admissions': len(final_cohort),
        'unique_patients': final_cohort['subject_id'].nunique(),
        'mean_age': final_cohort['age_at_admission'].mean(),
        'mortality_rate': final_cohort['mortality'].mean(),
        'mean_icu_los': final_cohort['icu_los_days'].mean()
    }
    
    print(f"\nCohort Summary:")
    for key, value in summary_stats.items():
        print(f"{key}: {value}")
        
else:
    print("No final cohort to save!")

In [None]:
# Create output directory
import os
output_dir = '/Users/kavenchhikara/Desktop/projects/SCCM/SCCM-Team2/ards_analysis/data'
os.makedirs(output_dir, exist_ok=True)

# Select key columns for cohort
cohort_columns = [
    'subject_id', 'hadm_id', 'admission_dttm', 'discharge_dttm',
    'age_at_admission', 'gender', 'admission_type',
    'admission_location', 'discharge_location', 'insurance',
    'marital_status', 'mortality', 'has_heart_failure', 'is_pregnant'
]

# Save cohort
cohort_file = f'{output_dir}/base_cohort.parquet'
final_cohort.to_parquet(cohort_file, index=False)
print(f"\nCohort saved to: {cohort_file}")
print(f"File size: {os.path.getsize(cohort_file) / 1024 / 1024:.1f} MB")


Cohort saved to: /Users/kavenchhikara/Desktop/projects/SCCM/SCCM-Team2/ards_analysis/data/base_cohort.parquet
File size: 3.5 MB


## Next Steps

This cohort represents all adult patients with:
- At least one ICU admission
- PEEP ≥ 5 within first 48 hours of ICU admission
- S/F ratio < 315 at least once (SpO2/FiO2)
- At least one radiology report
- No heart failure diagnosis
- Not pregnant

Next notebooks will:
1. Apply Berlin criteria to identify ARDS patients
2. Extract proning events
3. Extract neuromuscular blockade administration
4. Calculate timing metrics and outcomes