# Exploring ARDS in MIMIC-IV Structured Data

This notebook explores methods to identify ARDS patients using structured data:
- ICD diagnosis codes
- Ventilator settings (P/F ratio, PEEP)
- Clinical criteria

In [None]:
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration: seaborn grid theme plus pandas table limits.
sns.set_style('whitegrid')
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load paths configuration
with open('../configs/paths.yaml', 'r') as f:
    paths = yaml.safe_load(f)


def _resolve_placeholders(node, placeholder, value):
    """Return a copy of ``node`` with ``placeholder`` replaced by ``value`` in every string.

    Dicts and lists are walked recursively at any nesting depth. Leaves that
    are not strings (numbers, booleans, ``None``) are returned unchanged
    instead of raising ``AttributeError`` on ``.replace``.
    """
    if isinstance(node, dict):
        return {key: _resolve_placeholders(child, placeholder, value) for key, child in node.items()}
    if isinstance(node, list):
        return [_resolve_placeholders(child, placeholder, value) for child in node]
    if isinstance(node, str):
        return node.replace(placeholder, value)
    return node


# Resolve variables in paths
# Every '${mimic_root}' occurrence is substituted, whatever its depth in the config.
mimic_root = paths['mimic_root']
paths = _resolve_placeholders(paths, '${mimic_root}', mimic_root)

## 1. ARDS ICD Codes

Common ICD codes for ARDS:
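- ICD-9: 518.82 (Other pulmonary insufficiency, not elsewhere classified; the category that covers ARDS), written without the decimal point (`51882`) in the code cells below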
- ICD-10: J80 (Acute respiratory distress syndrome)

In [None]:
# Load diagnosis data
print("Loading diagnosis data...")
# ICD codes are identifiers, not numbers. Reading them as strings prevents
# pandas from inferring integers for some chunks of the file (which would drop
# leading zeros and make the string-based .isin / .str lookups below miss rows).
icd_dtypes = {'icd_code': str}
diagnoses = pd.read_csv(
    paths['mimic_iv']['hosp']['diagnoses_icd'],
    compression='gzip',
    dtype=icd_dtypes,
)
d_icd_diagnoses = pd.read_csv(
    paths['mimic_iv']['hosp']['d_icd_diagnoses'],
    compression='gzip',
    dtype=icd_dtypes,
)

print(f"Total diagnoses: {len(diagnoses):,}")
print(f"Unique patients: {diagnoses['subject_id'].nunique():,}")
print(f"Unique admissions: {diagnoses['hadm_id'].nunique():,}")

In [None]:
# Define ARDS-related ICD codes based on clinical criteria
# Inclusion codes for ARDS and related respiratory conditions
# NOTE(review): 9670/9671/9672 look like ICD-9 *procedure* codes (mechanical
# ventilation); if so they can never match rows of the diagnosis tables. Confirm.
ards_inclusion_icd9 = ['51881', '51882', '51884', '51851', '51852', '51853', '5184', '5187', '78552', '99592', '9670', '9671', '9672']
# Exclusion codes (conditions that can mimic ARDS).
# NOTE(review): 49391/49392/49322 appear to be asthma codes and 4280 congestive
# heart failure; verify against the dictionary rows printed below.
ards_exclusion_icd9 = ['49391', '49392', '49322', '4280']

# Search for ARDS-related ICD codes by keywords
ards_keywords = ['respiratory distress', 'ARDS', 'acute lung injury', 'ALI']

# Match whole words only. A plain case-insensitive substring search would let
# the abbreviations hit unrelated titles ('ALI' inside "malignant", 'ARDS'
# inside "towards"). The keywords are plain text, so no regex escaping is
# needed; the non-capturing group avoids pandas' match-group warning.
ards_keyword_pattern = r'\b(?:' + '|'.join(ards_keywords) + r')\b'

ards_codes_keywords = d_icd_diagnoses[
    d_icd_diagnoses['long_title'].str.contains(ards_keyword_pattern, case=False, na=False, regex=True)
]

# Get all inclusion codes details
ards_codes_inclusion = d_icd_diagnoses[
    d_icd_diagnoses['icd_code'].isin(ards_inclusion_icd9)
]

# Get all exclusion codes details
ards_codes_exclusion = d_icd_diagnoses[
    d_icd_diagnoses['icd_code'].isin(ards_exclusion_icd9)
]

# Columns shown for every dictionary listing below.
code_display_columns = ['icd_code', 'icd_version', 'long_title']

print("ARDS Inclusion ICD-9 codes:")
print(ards_codes_inclusion[code_display_columns].to_string())
print("\n" + "="*80 + "\n")

print("ARDS Exclusion ICD-9 codes (to rule out):")
print(ards_codes_exclusion[code_display_columns].to_string())
print("\n" + "="*80 + "\n")

# Keyword hits that are not already part of the curated inclusion list.
print("Additional ARDS-related codes found by keywords:")
additional_codes = ards_codes_keywords[~ards_codes_keywords['icd_code'].isin(ards_inclusion_icd9)]
print(additional_codes[code_display_columns].to_string())

In [None]:
# Find patients with ARDS diagnosis using inclusion/exclusion criteria
# Include both the specific ICD-9 codes and ICD-10 codes
ards_icd10 = ['J80']   # J80 for ARDS

# Get patients with inclusion codes
# A row qualifies when its code is in the curated lists or starts with 'J80'.
# na=False keeps the mask strictly boolean even if a code is missing.
is_inclusion_code = (
    diagnoses['icd_code'].isin(ards_inclusion_icd9 + ards_icd10)
    | diagnoses['icd_code'].str.startswith('J80', na=False)
)
ards_patients_inclusion = diagnoses[is_inclusion_code]

# Get patients with exclusion codes to filter out
exclusion_patients = diagnoses[
    diagnoses['icd_code'].isin(ards_exclusion_icd9)
]

# Remove patients who have exclusion codes
# Exclusion is applied per admission: any admission (hadm_id) that carries an
# exclusion code is dropped, together with all of its inclusion rows.
ards_patients = ards_patients_inclusion[
    ~ards_patients_inclusion['hadm_id'].isin(exclusion_patients['hadm_id'])
]

print(f"Patients with ARDS inclusion codes: {ards_patients_inclusion['subject_id'].nunique():,}")
print(f"Patients with exclusion codes: {exclusion_patients['subject_id'].nunique():,}")
print(f"Final ARDS patients (after exclusions): {ards_patients['subject_id'].nunique():,}")
print(f"Final ARDS admissions (after exclusions): {ards_patients['hadm_id'].nunique():,}")

# Show breakdown by ICD code
print("\nBreakdown by ICD code:")
code_counts = ards_patients['icd_code'].value_counts()

# Build the code -> title lookup once instead of scanning the dictionary table
# twice per code inside the loop. The first title listed for a code wins.
first_title_rows = d_icd_diagnoses.drop_duplicates(subset='icd_code', keep='first')
code_titles = dict(zip(first_title_rows['icd_code'], first_title_rows['long_title']))

for code, count in code_counts.head(10).items():
    code_desc = code_titles.get(code, "Unknown")
    print(f"{code}: {count} - {code_desc}")

## 2. Load ICU Stay Data

We need ICU data to identify mechanically ventilated patients and extract ventilator parameters.

In [None]:
# Read the ICU stay table, then keep the stays whose admission is in the ARDS cohort.
print("Loading ICU stays...")
icustays_path = paths['mimic_iv']['icu']['icustays']
icustays = pd.read_csv(icustays_path, compression='gzip')

# A stay is retained when its hadm_id appears among the ARDS admissions.
in_ards_cohort = icustays['hadm_id'].isin(ards_patients['hadm_id'])
ards_icustays = icustays.loc[in_ards_cohort]

print(f"Total ICU stays: {len(icustays):,}")
print(f"ARDS patient ICU stays: {len(ards_icustays):,}")
print("\nFirst unit distribution for ARDS patients:")
first_unit_counts = ards_icustays['first_careunit'].value_counts()
print(first_unit_counts)

## 3. Summary Statistics

In [None]:
# Create summary of ARDS patients identified by ICD codes
# The two "codes" entries count matching rows of the full diagnoses table
# (before exclusions); the other entries describe the post-exclusion cohort.
# The ICD-9 list is `ards_inclusion_icd9`; no variable named `ards_icd9`
# exists in this notebook, so referencing it raised NameError on a fresh run.
is_icd9_inclusion = diagnoses['icd_code'].isin(ards_inclusion_icd9)
is_icd10_ards = (
    diagnoses['icd_code'].isin(ards_icd10)
    | diagnoses['icd_code'].str.startswith('J80', na=False)
)

summary = {
    'Total patients with ARDS ICD': ards_patients['subject_id'].nunique(),
    'Total admissions with ARDS ICD': ards_patients['hadm_id'].nunique(),
    'Total ICU stays with ARDS': len(ards_icustays),
    'ICD-9 ARDS codes': len(diagnoses[is_icd9_inclusion]),
    'ICD-10 ARDS codes': len(diagnoses[is_icd10_ards])
}

for key, value in summary.items():
    print(f"{key}: {value:,}")

## Next Steps

1. **Extract ventilator parameters** (P/F ratio, PEEP, plateau pressure) from chartevents
2. **Apply Berlin criteria** for ARDS identification
3. **Compare with radiology report findings**
4. **Extract BMI/obesity data** for patient stratification