# ARDS Identification Using Berlin Criteria

This notebook identifies ARDS patients from our base cohort using the Berlin Definition:

## Berlin Criteria for ARDS:
1. **Timing**: Within 1 week of known clinical insult or new/worsening respiratory symptoms
2. **Chest imaging**: Bilateral opacities not fully explained by effusions, lobar/lung collapse, or nodules
3. **Origin of edema**: Respiratory failure not fully explained by cardiac failure or fluid overload
4. **Oxygenation** (with PEEP ≥ 5 cm H2O):
   - Mild: 200 < PaO2/FiO2 ≤ 300
   - Moderate: 100 < PaO2/FiO2 ≤ 200
   - Severe: PaO2/FiO2 ≤ 100

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import warnings
warnings.filterwarnings('ignore')

# Define paths
MIMIC_PATH = '/Users/kavenchhikara/Desktop/CLIF/MIMIC-IV-3.1/physionet.org/files'
DATA_PATH = '/Users/kavenchhikara/Desktop/projects/SCCM/SCCM-Team2/ards_analysis/data'

print(f"Analysis start time: {datetime.now()}")

Analysis start time: 2025-07-19 22:38:08.175920


## Step 1: Load Base Cohort

In [2]:
# Load the base cohort from previous notebook
base_cohort = pd.read_parquet(f'{DATA_PATH}/base_cohort.parquet')
print(f"Base cohort loaded: {len(base_cohort):,} admissions")
print(f"Unique patients: {base_cohort['subject_id'].nunique():,}")

# Convert datetime columns
base_cohort['admission_dttm'] = pd.to_datetime(base_cohort['admission_dttm'])
base_cohort['discharge_dttm'] = pd.to_datetime(base_cohort['discharge_dttm'])

Base cohort loaded: 45,506 admissions
Unique patients: 37,492


## Step 2: Extract Radiology Reports for Bilateral Infiltrates

In [3]:
# Load radiology reports
print("Loading radiology reports...")
radiology = pd.read_csv(f'{MIMIC_PATH}/mimic-iv-note/2.2/note/radiology.csv.gz')

# Filter to our cohort
cohort_radiology = radiology[radiology['hadm_id'].isin(base_cohort['hadm_id'])].copy()
print(f"Radiology reports for cohort: {len(cohort_radiology):,}")

# Convert charttime
cohort_radiology['charttime'] = pd.to_datetime(cohort_radiology['charttime'])

Loading radiology reports...
Radiology reports for cohort: 358,141


In [4]:
# Function to detect bilateral infiltrates/opacities in radiology reports
def detect_bilateral_infiltrates(text):
    if pd.isna(text):
        return False
    
    text_lower = text.lower()
    
    # Bilateral patterns
    bilateral_patterns = [
        r'bilateral.*(?:infiltrate|opacity|opacities|consolidation)',
        r'(?:infiltrate|opacity|opacities|consolidation).*bilateral',
        r'both lung.*(?:infiltrate|opacity|opacities|consolidation)',
        r'(?:infiltrate|opacity|opacities|consolidation).*both lung',
        r'diffuse.*(?:infiltrate|opacity|opacities|consolidation)',
        r'multifocal.*(?:infiltrate|opacity|opacities|consolidation)',
        r'bibasilar.*(?:infiltrate|opacity|opacities|consolidation)',
        r'bilateral.*ground.?glass',
        r'bilateral.*airspace disease',
        r'ards', # Direct ARDS mention
        r'acute respiratory distress syndrome'
    ]
    
    # Check for any bilateral pattern
    for pattern in bilateral_patterns:
        if re.search(pattern, text_lower):
            return True
    
    # Check for infiltrates in both lungs mentioned separately
    left_infiltrate = re.search(r'left.*(?:infiltrate|opacity|consolidation)', text_lower)
    right_infiltrate = re.search(r'right.*(?:infiltrate|opacity|consolidation)', text_lower)
    
    return bool(left_infiltrate and right_infiltrate)

# Apply detection
print("Detecting bilateral infiltrates in radiology reports...")
cohort_radiology['bilateral_infiltrates'] = cohort_radiology['text'].apply(detect_bilateral_infiltrates)
print(f"Reports with bilateral infiltrates: {cohort_radiology['bilateral_infiltrates'].sum():,} ({cohort_radiology['bilateral_infiltrates'].mean()*100:.1f}%)")

Detecting bilateral infiltrates in radiology reports...
Reports with bilateral infiltrates: 34,232 (9.6%)


In [5]:
# Get first occurrence of bilateral infiltrates for each admission
bilateral_infiltrates_df = cohort_radiology[cohort_radiology['bilateral_infiltrates']].groupby('hadm_id').agg({
    'charttime': 'min',  # First occurrence
    'note_id': 'first'
}).reset_index()
bilateral_infiltrates_df.rename(columns={'charttime': 'bilateral_infiltrates_time'}, inplace=True)

print(f"Admissions with bilateral infiltrates: {len(bilateral_infiltrates_df):,}")

Admissions with bilateral infiltrates: 14,557


## Step 3: Extract Ventilation Parameters (PEEP, PaO2, FiO2)

In [6]:
# Load ICU stays to link hadm_id with stay_id
icustays = pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/icu/icustays.csv.gz')
cohort_icustays = icustays[icustays['hadm_id'].isin(base_cohort['hadm_id'])].copy()
print(f"ICU stays for cohort: {len(cohort_icustays):,}")

# Define itemids for ventilation parameters
ventilation_itemids = {
    'peep': [220339, 224700],  # PEEP
    'fio2_percent': [220210],  # FiO2 (%)
    'fio2_fraction': [223835, 224698],  # FiO2 (fraction)
    'pao2': [220224, 224689],  # PaO2
    'pf_ratio': [223834]  # PaO2/FiO2 ratio (if directly recorded)
}

ICU stays for cohort: 49,755


In [10]:
# Extract ventilation parameters
print("Extracting ventilation parameters...")
all_itemids = [item for items in ventilation_itemids.values() for item in items]

# Load chartevents for our cohort's ICU stays
vent_data = []
chunk_size = 1000000

for i, chunk in enumerate(pd.read_csv(f'{MIMIC_PATH}/mimiciv/3.1/icu/chartevents.csv.gz', 
                                     chunksize=chunk_size)):
    # Filter for our cohort's stays and relevant items
    chunk_filtered = chunk[
        (chunk['stay_id'].isin(cohort_icustays['stay_id'])) &
        (chunk['itemid'].isin(all_itemids))
    ]
    
    if len(chunk_filtered) > 0:
        vent_data.append(chunk_filtered)
    
    if i % 10 == 0:
        print(f"Processed {(i+1)*chunk_size:,} rows...")

# Combine all ventilation data
vent_df = pd.concat(vent_data, ignore_index=True)
print(f"\nTotal ventilation measurements: {len(vent_df):,}")


Extracting ventilation parameters...
Processed 1,000,000 rows...
Processed 11,000,000 rows...
Processed 21,000,000 rows...
Processed 31,000,000 rows...
Processed 41,000,000 rows...
Processed 51,000,000 rows...
Processed 61,000,000 rows...
Processed 71,000,000 rows...
Processed 81,000,000 rows...
Processed 91,000,000 rows...
Processed 101,000,000 rows...
Processed 111,000,000 rows...
Processed 121,000,000 rows...
Processed 131,000,000 rows...
Processed 141,000,000 rows...
Processed 151,000,000 rows...
Processed 161,000,000 rows...
Processed 171,000,000 rows...
Processed 181,000,000 rows...
Processed 191,000,000 rows...
Processed 201,000,000 rows...
Processed 211,000,000 rows...
Processed 221,000,000 rows...
Processed 231,000,000 rows...
Processed 241,000,000 rows...
Processed 251,000,000 rows...
Processed 261,000,000 rows...
Processed 271,000,000 rows...
Processed 281,000,000 rows...
Processed 291,000,000 rows...
Processed 301,000,000 rows...
Processed 311,000,000 rows...
Processed 321,

In [12]:
cohort_icustays.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit',
       'intime', 'outtime', 'los'],
      dtype='object')

In [13]:
vent_df.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'charttime',
      dtype='object')

In [14]:
# Add hadm_id from icustays
# vent_df = vent_df.merge(cohort_icustays[['stay_id', 'hadm_id']], on='stay_id', how='left')

# Convert charttime
vent_df['charttime'] = pd.to_datetime(vent_df['charttime'])

In [16]:
def process_vent_params_vectorized(df):
    """
    Vectorized version - much faster than the loop-based approach
    """
    print("Processing ventilation parameters (vectorized)...")

    # Create a copy to avoid modifying original
    df = df.copy()

    # Add parameter type column
    df['param_type'] = 'unknown'
    df.loc[df['itemid'].isin(ventilation_itemids['peep']), 'param_type'] = 'peep'
    df.loc[df['itemid'].isin(ventilation_itemids['fio2_percent']), 'param_type'] = 'fio2_percent'
    df.loc[df['itemid'].isin(ventilation_itemids['fio2_fraction']), 'param_type'] = 'fio2_fraction'
    df.loc[df['itemid'].isin(ventilation_itemids['pao2']), 'param_type'] = 'pao2'
    df.loc[df['itemid'].isin(ventilation_itemids['pf_ratio']), 'param_type'] = 'pf_ratio'

    # Convert FiO2 percentage to fraction
    df.loc[df['param_type'] == 'fio2_percent', 'valuenum'] = df.loc[df['param_type'] == 'fio2_percent', 'valuenum'] / 100

    # Combine FiO2 types
    df.loc[df['param_type'] == 'fio2_percent', 'param_type'] = 'fio2'
    df.loc[df['param_type'] == 'fio2_fraction', 'param_type'] = 'fio2'

    # Pivot to get one row per hadm_id/charttime with columns for each parameter
    pivot_df = df.pivot_table(
        index=['hadm_id', 'charttime'],
        columns='param_type',
        values='valuenum',
        aggfunc='first'  # Take first value if multiple
    ).reset_index()

    # Flatten column names
    pivot_df.columns.name = None

    # Ensure all expected columns exist
    for col in ['peep', 'fio2', 'pao2', 'pf_ratio']:
        if col not in pivot_df.columns:
            pivot_df[col] = None

    # Calculate P/F ratio where not directly available
    mask = (pivot_df['pf_ratio'].isna()) & (pivot_df['pao2'].notna()) &(pivot_df['fio2'].notna()) & (pivot_df['fio2'] > 0)
    pivot_df.loc[mask, 'pf_ratio'] = pivot_df.loc[mask, 'pao2'] / pivot_df.loc[mask, 'fio2']

    return pivot_df[['hadm_id', 'charttime', 'peep', 'fio2', 'pao2', 'pf_ratio']]

# Use the vectorized version
vent_processed = process_vent_params_vectorized(vent_df)

print(f"Processed ventilation measurements: {len(vent_processed):,}")
print(f"Measurements with PEEP: {vent_processed['peep'].notna().sum():,}")
print(f"Measurements with P/F ratio: {vent_processed['pf_ratio'].notna().sum():,}")

Processing ventilation parameters (vectorized)...
Processed ventilation measurements: 4,637,866
Measurements with PEEP: 450,594
Measurements with P/F ratio: 748,916


## Step 4: Apply Berlin Criteria

In [None]:
# Identify ARDS cases based on Berlin criteria - CORRECTED VERSION
# Using admission time as clinical insult proxy
def identify_ards_onset_corrected(hadm_id, bilateral_infiltrates_df, vent_processed, base_cohort):
    """
    Identify ARDS onset for a given admission using CORRECT Berlin criteria timing
    Returns: dict with ARDS onset info or None
    
    Correct Berlin timing: 
    - Clinical insult = admission time
    - Bilateral infiltrates must be documented within 7 days of admission
    - ARDS onset (meeting oxygenation criteria) must be within 7 days of admission
    - ARDS onset must be AT OR AFTER bilateral infiltrates documentation
    """
    # Check if patient has bilateral infiltrates
    if hadm_id not in bilateral_infiltrates_df['hadm_id'].values:
        return None
    
    bilateral_time = bilateral_infiltrates_df[bilateral_infiltrates_df['hadm_id'] == hadm_id]['bilateral_infiltrates_time'].iloc[0]
    admission_time = base_cohort[base_cohort['hadm_id'] == hadm_id]['admission_dttm'].iloc[0]
    
    # Check if bilateral infiltrates are within 7 days of admission (clinical insult)
    if (bilateral_time - admission_time).total_seconds() / (24 * 3600) > 7:
        return None
    
    # Get ventilation data for this admission
    hadm_vent = vent_processed[vent_processed['hadm_id'] == hadm_id].copy()
    
    if len(hadm_vent) == 0:
        return None
    
    # ARDS criteria must be met within 7 days of clinical insult (admission)
    # AND at or after bilateral infiltrates documentation
    time_window_end = admission_time + timedelta(days=7)
    
    qualifying_measurements = hadm_vent[
        (hadm_vent['charttime'] >= bilateral_time) &      # Must be after/at infiltrates documented
        (hadm_vent['charttime'] >= admission_time) &      # Must be after admission
        (hadm_vent['charttime'] <= time_window_end) &     # Within 7 days of admission (clinical insult)
        (hadm_vent['peep'] >= 5) &                        # PEEP ≥ 5 cm H2O
        (hadm_vent['pf_ratio'].notna()) &
        (hadm_vent['pf_ratio'] <= 300)                    # P/F ratio ≤ 300
    ].copy()
    
    if len(qualifying_measurements) == 0:
        return None
    
    # Find the earliest qualifying measurement
    qualifying_measurements = qualifying_measurements.sort_values('charttime')
    first_qualifying = qualifying_measurements.iloc[0]
    
    # Determine ARDS severity
    pf_ratio = first_qualifying['pf_ratio']
    if pf_ratio <= 100:
        severity = 'severe'
    elif pf_ratio <= 200:
        severity = 'moderate'
    else:
        severity = 'mild'
    
    return {
        'hadm_id': hadm_id,
        'ards_onset_time': first_qualifying['charttime'],
        'bilateral_infiltrates_time': bilateral_time,
        'admission_time': admission_time,
        'initial_pf_ratio': pf_ratio,
        'initial_peep': first_qualifying['peep'],
        'initial_fio2': first_qualifying['fio2'],
        'initial_pao2': first_qualifying['pao2'],
        'ards_severity': severity
    }

# Vectorized version with admission time as clinical insult
def identify_ards_onset_vectorized_corrected(bilateral_infiltrates_df, vent_processed, base_cohort):
    """
    Vectorized ARDS identification with admission time as clinical insult proxy
    """
    print("Identifying ARDS cases (vectorized, admission time as clinical insult)...")
    
    # Merge ventilation data with bilateral infiltrates and admission info
    vent_with_bilateral = vent_processed.merge(
        bilateral_infiltrates_df[['hadm_id', 'bilateral_infiltrates_time']], 
        on='hadm_id', 
        how='inner'
    ).merge(
        base_cohort[['hadm_id', 'admission_dttm']], 
        on='hadm_id', 
        how='inner'
    )
    
    if len(vent_with_bilateral) == 0:
        return pd.DataFrame()
    
    # Calculate time differences (vectorized)
    vent_with_bilateral['hours_from_admission'] = (
        vent_with_bilateral['charttime'] - vent_with_bilateral['admission_dttm']
    ).dt.total_seconds() / 3600  # Hours from admission
    
    vent_with_bilateral['hours_from_bilateral'] = (
        vent_with_bilateral['charttime'] - vent_with_bilateral['bilateral_infiltrates_time']
    ).dt.total_seconds() / 3600  # Hours from bilateral infiltrates
    
    vent_with_bilateral['bilateral_from_admission'] = (
        vent_with_bilateral['bilateral_infiltrates_time'] - vent_with_bilateral['admission_dttm']
    ).dt.total_seconds() / 3600  # Hours from admission to bilateral infiltrates
    
    # Apply CORRECTED Berlin criteria filters
    qualifying_measurements = vent_with_bilateral[
        (vent_with_bilateral['bilateral_from_admission'] <= 168) &    # Bilateral infiltrates within 7 days of admission
        (vent_with_bilateral['hours_from_bilateral'] >= 0) &         # Must be AT OR AFTER infiltrates
        (vent_with_bilateral['hours_from_admission'] >= 0) &         # Must be after admission
        (vent_with_bilateral['hours_from_admission'] <= 168) &       # Within 7 days of admission (clinical insult)
        (vent_with_bilateral['peep'] >= 5) &                         # PEEP ≥ 5 cm H2O
        (vent_with_bilateral['pf_ratio'].notna()) &
        (vent_with_bilateral['pf_ratio'] <= 300)                     # P/F ratio ≤ 300
    ].copy()
    
    if len(qualifying_measurements) == 0:
        return pd.DataFrame()
    
    # Find earliest qualifying measurement for each patient (vectorized)
    earliest_ards = qualifying_measurements.loc[
        qualifying_measurements.groupby('hadm_id')['charttime'].idxmin()
    ].copy()
    
    # Determine ARDS severity (vectorized)
    severity_conditions = [
        earliest_ards['pf_ratio'] <= 100,
        earliest_ards['pf_ratio'] <= 200,
        earliest_ards['pf_ratio'] <= 300
    ]
    severity_choices = ['severe', 'moderate', 'mild']
    earliest_ards['ards_severity'] = np.select(severity_conditions, severity_choices, default='unknown')
    
    # Rename and select columns
    ards_df = earliest_ards[[
        'hadm_id', 'charttime', 'bilateral_infiltrates_time', 'admission_dttm',
        'pf_ratio', 'peep', 'fio2', 'pao2', 'ards_severity', 
        'hours_from_admission', 'hours_from_bilateral', 'bilateral_from_admission'
    ]].rename(columns={
        'charttime': 'ards_onset_time',
        'admission_dttm': 'admission_time',
        'pf_ratio': 'initial_pf_ratio',
        'peep': 'initial_peep',
        'fio2': 'initial_fio2',
        'pao2': 'initial_pao2'
    })
    
    return ards_df

# Use the vectorized corrected version with admission time as clinical insult
ards_df = identify_ards_onset_vectorized_corrected(bilateral_infiltrates_df, vent_processed, base_cohort)

print(f"ARDS cases identified (admission time as clinical insult): {len(ards_df):,}")
if len(ards_df) > 0:
    print(f"\nARDS severity distribution:")
    print(ards_df['ards_severity'].value_counts())
    
    print(f"\nTiming analysis:")
    print(f"Hours from admission to bilateral infiltrates:")
    print(ards_df['bilateral_from_admission'].describe())
    
    print(f"\nHours from admission to ARDS onset:")
    print(ards_df['hours_from_admission'].describe())
    
    print(f"\nHours from bilateral infiltrates to ARDS onset:")
    print(ards_df['hours_from_bilateral'].describe())
    
    print(f"\nARDS onset timing distribution:")
    print(f"Within 24 hours of admission: {(ards_df['hours_from_admission'] <= 24).sum()} ({(ards_df['hours_from_admission'] <= 24).mean()*100:.1f}%)")
    print(f"Within 72 hours of admission: {(ards_df['hours_from_admission'] <= 72).sum()} ({(ards_df['hours_from_admission'] <= 72).mean()*100:.1f}%)")
    print(f"Within 7 days of admission: {(ards_df['hours_from_admission'] <= 168).sum()} ({(ards_df['hours_from_admission'] <= 168).mean()*100:.1f}%)")
    
    print(f"\nSame day as bilateral infiltrates: {(ards_df['hours_from_bilateral'] < 24).sum()}")
    print(f"ARDS onset exactly at bilateral infiltrates time: {(ards_df['hours_from_bilateral'] == 0).sum()}")

## Step 5: Create Final ARDS Cohort

In [None]:
# Merge ARDS information with base cohort
ards_cohort = base_cohort.merge(ards_df, on='hadm_id', how='inner')

# Calculate time from admission to ARDS onset
ards_cohort['ards_onset_time'] = pd.to_datetime(ards_cohort['ards_onset_time'])
ards_cohort['hours_to_ards_onset'] = (ards_cohort['ards_onset_time'] - ards_cohort['admission_dttm']).dt.total_seconds() / 3600

print(f"Final ARDS cohort: {len(ards_cohort):,} patients")
print(f"\nTime to ARDS onset (hours):")
print(ards_cohort['hours_to_ards_onset'].describe())

## Step 6: Summary Statistics

In [None]:
print("=== ARDS COHORT SUMMARY ===")
print(f"\nTotal ARDS patients: {len(ards_cohort):,}")
print(f"Unique subjects: {ards_cohort['subject_id'].nunique():,}")

print(f"\nAge distribution:")
print(ards_cohort['age_at_admission'].describe())

print(f"\nGender distribution:")
print(ards_cohort['gender'].value_counts())

print(f"\nARDS severity:")
print(ards_cohort['ards_severity'].value_counts())
print(f"\nSeverity percentages:")
print(ards_cohort['ards_severity'].value_counts(normalize=True) * 100)

print(f"\nInitial P/F ratio by severity:")
print(ards_cohort.groupby('ards_severity')['initial_pf_ratio'].agg(['mean', 'std', 'min', 'max']))

print(f"\nHospital mortality:")
print(f"Overall: {ards_cohort['hospital_expire_flag'].mean()*100:.1f}%")
print("\nBy severity:")
print(ards_cohort.groupby('ards_severity')['hospital_expire_flag'].agg(['sum', 'mean']))

## Step 7: Save ARDS Cohort

In [None]:
# Save ARDS cohort
ards_cohort_file = f'{DATA_PATH}/ards_cohort.parquet'
ards_cohort.to_parquet(ards_cohort_file, index=False)
print(f"\nARDS cohort saved to: {ards_cohort_file}")

# Also save the ventilation data for later use
vent_file = f'{DATA_PATH}/ventilation_parameters.parquet'
vent_processed.to_parquet(vent_file, index=False)
print(f"Ventilation parameters saved to: {vent_file}")

print(f"\nAnalysis completed at: {datetime.now()}")

## Next Steps

We have successfully identified ARDS patients using Berlin criteria:
- Bilateral infiltrates detected from radiology reports
- PEEP ≥ 5 cm H2O
- P/F ratio ≤ 300
- Within 7-day time window

Next notebooks will:
1. Extract proning events from nursing documentation
2. Extract neuromuscular blockade administration
3. Calculate timing from ARDS onset
4. Analyze outcomes (mortality, LOS, extubation)