# ARDS Detection using NLP on Radiology Reports

## Objective
Implement bilateral opacity detection on MIMIC-IV radiology reports using the ARDSFlag methodology.

## Research Question
How does obesity modify the relationship between early plateau pressures and clinical outcomes in ARDS patients, when ARDS onset is accurately detected using unstructured radiology reports?

## Methodology
- **Berlin Definition ARDS criteria implementation**
- **ARDSFlag NLP patterns for bilateral opacity detection**
- **Rule-based approach with confidence scoring**

In [None]:
import os
import re
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# Data paths
# The root directory can be overridden with the MIMIC_DATA_BASE environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original hard-coded location is used unchanged.
DATA_BASE = os.environ.get(
    'MIMIC_DATA_BASE',
    '/Users/kavenchhikara/Desktop/CLIF/MIMIC-IV-3.1/physionet.org/files',
)
# Radiology notes (MIMIC-IV-Note 2.2) and the structured MIMIC-IV 3.1 tables
# both live under the same root.
RAD_PATH = f'{DATA_BASE}/mimic-iv-note/2.2/note/radiology.csv.gz'
MIMIC_BASE = f'{DATA_BASE}/mimiciv/3.1'

print("Environment setup complete!")
print(f"Radiology data path: {RAD_PATH}")
print(f"MIMIC base path: {MIMIC_BASE}")

## 1. Data Loading and Initial Exploration

In [None]:
# Peek at the first five rows to learn the file's schema before reading more
print("=== RADIOLOGY DATA STRUCTURE ===")
rad_sample = pd.read_csv(RAD_PATH, nrows=5)

column_names = rad_sample.columns.tolist()
sample_dims = rad_sample.shape
print("Columns:", column_names)
print("Sample shape:", sample_dims)
print("\nSample data:")

# Last expression of the cell, so the notebook renders it as a table
rad_sample.head()

In [None]:
# Load a larger (but still bounded) sample and identify chest imaging
print("=== CHEST IMAGING IDENTIFICATION ===")

# Without `nrows`, read_csv pulls the entire radiology file into memory, which
# contradicts the "sample" wording used below. The cap is an exploratory
# choice, kept smaller than the 20,000-row load used for the large-scale
# analysis later in the notebook.
# NOTE(review): 5,000 is an assumed sample size; adjust if a different size
# was intended.
CHEST_SAMPLE_ROWS = 5000
rad_chunk = pd.read_csv(RAD_PATH, nrows=CHEST_SAMPLE_ROWS)

# Filter for chest imaging: keyword regex over the report text,
# case-insensitive; rows with missing text are treated as non-matching.
chest_mask = rad_chunk['text'].str.contains(
    'chest.*x.*ray|chest.*pa.*lat|chest.*film|portable.*chest|thorax',
    case=False, na=False
)
chest_reports = rad_chunk[chest_mask].copy()

print(f"Total radiology reports (sample): {len(rad_chunk):,}")
print(f"Chest imaging reports: {len(chest_reports):,}")
# max(..., 1) avoids a ZeroDivisionError if the file yields no rows
print(f"Chest imaging percentage: {len(chest_reports)/max(len(rad_chunk), 1):.1%}")

# Show distribution of report types
print("\nNote types in sample:")
print(rad_chunk['note_type'].value_counts())

In [None]:
# Print the first three chest reports, truncated to 600 characters each
print("=== SAMPLE CHEST X-RAY REPORTS ===")

preview_rows = chest_reports.head(3)
for number, (note_id, report) in enumerate(
    zip(preview_rows['note_id'], preview_rows['text']), start=1
):
    print(f"\n--- Report {number} (ID: {note_id}) ---")
    # Only append the ellipsis when something was actually cut off
    if len(report) > 600:
        text = report[:600] + '...'
    else:
        text = report
    print(text)
    print(f"Full text length: {len(report)} characters")

## 2. ARDSFlag NLP Implementation

In [None]:
class BerlinARDSDetector:
    """
    Berlin Definition ARDS Detection using ARDSFlag methodology.

    Rule-based detector that applies regular expressions to free-text
    radiology reports in order to:

    * flag bilateral opacities (``detect_bilateral_opacities``), and
    * flag mentions of heart failure / cardiogenic causes
      (``detect_chf_exclusion``).

    Attributes:
        bilateral_patterns: regex strings whose match marks a report as
            showing bilateral opacities.
        exclusion_patterns: regex strings for negated / resolving findings.
            A single match anywhere in the report marks the whole report as
            negated (the check is report-level, not sentence-level).
        chf_patterns: regex strings for heart-failure / cardiogenic mentions.

    All patterns are applied to text that has already been lower-cased and
    normalized by ``clean_radiology_text``.
    """

    # Confidence contributed by each distinct bilateral pattern that matched.
    _BASE_CONFIDENCE_PER_PATTERN = 0.3
    # Extra confidence for each high-confidence term found in a pattern's
    # matched text.
    _TERM_BONUS = 0.2
    # Terms are regexes evaluated against the matched report text (not the
    # pattern source), so "ground glass" is recognised however it was spelled.
    _HIGH_CONFIDENCE_TERMS = (r'bilateral', r'diffuse', r'ground\s?glass')

    def __init__(self):
        # Enhanced bilateral opacity patterns from ARDSFlag
        self.bilateral_patterns = [
            # Direct bilateral opacity mentions
            r'bilateral\s+(?:ground[‐\s]?glass\s+)?(?:opacit|infiltrat|consolidat|shadowing)',
            r'(?:opacit|infiltrat|consolidat|shadowing).{0,30}bilateral',

            # Bilateral anatomical mentions
            r'bilateral\s+(?:lung|pulmonary|alveolar)',
            r'both\s+(?:lung|lower\s+lobe|upper\s+lobe|base)',
            r'(?:right|left)\s+(?:and|&|\+)\s+(?:left|right)\s+(?:lung|lobe|base)',

            # Diffuse/extensive patterns
            r'diffuse\s+(?:bilateral\s+)?(?:opacit|infiltrat|consolidat|ground[‐\s]?glass)',
            r'extensive\s+(?:bilateral\s+)?(?:opacit|infiltrat|consolidat)',
            r'multifocal\s+(?:opacit|infiltrat|consolidat)',
            r'widespread\s+(?:opacit|infiltrat|consolidat)',

            # Ground glass specific (common in ARDS)
            r'bilateral\s+ground[‐\s]?glass',
            r'diffuse\s+ground[‐\s]?glass',
            r'ground[‐\s]?glass\s+(?:opacit|change).{0,20}bilateral',
        ]

        # Exclusion patterns for negation.
        # The leading \b keeps the trigger word from matching inside a longer
        # word (e.g. "unclear lung" must not count as "clear lung").
        self.exclusion_patterns = [
            r'\bno\s+(?:bilateral|diffuse|extensive|multifocal)',
            r'\bwithout\s+(?:bilateral|diffuse|extensive)',
            r'\babsence\s+of\s+(?:bilateral|diffuse)',
            r'\bclear\s+(?:lung|bilateral)',
            r'\bresolved\s+(?:bilateral|diffuse)',
            r'\bimproving\s+(?:bilateral|diffuse)',
        ]

        # CHF/cardiogenic patterns for exclusion.
        # Note: clean_radiology_text expands "chf" to "congestive heart
        # failure", so the bare r'\bchf\b' entry only matters for text that
        # bypasses cleaning.
        self.chf_patterns = [
            r'congestive\s+heart\s+failure',
            r'\bchf\b',
            r'cardiogenic\s+(?:edema|pulmonary)',
            r'heart\s+failure',
            r'cardiac\s+(?:failure|dysfunction)',
            r'left\s+(?:heart|ventricular)\s+failure',
            r'pulmonary\s+(?:edema|congestion).{0,30}cardiac',
        ]

    def clean_radiology_text(self, text):
        """Clean and normalize radiology report text.

        Lower-cases the text, blanks out de-identification placeholders
        (runs of underscores) and numeric dates, expands a few abbreviations
        and normalizes "ground-glass" spellings.

        Args:
            text: report text. Anything that is not a ``str`` (``None``,
                NaN, ``pd.NA``, numbers) is treated as an empty report.

        Returns:
            The normalized, lower-cased text, or ``""`` for non-string input.
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove patient identifiers and dates
        text = re.sub(r'___+', ' ', text)
        text = re.sub(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', ' ', text)

        # Normalize medical abbreviations. The text is already lower-case,
        # so every abbreviation must be written in lower-case here.
        text = re.sub(r'\bchf\b', 'congestive heart failure', text)
        text = re.sub(r'\bcopd\b', 'chronic obstructive pulmonary disease', text)
        text = re.sub(r'\bpe\b', 'pulmonary embolism', text)
        text = re.sub(r'\bggo\b', 'ground glass opacity', text)

        # Standardize spacing around hyphens (ASCII hyphen and U+2010)
        text = re.sub(r'ground-glass', 'ground glass', text)
        text = re.sub(r'ground‐glass', 'ground glass', text)

        return text

    def detect_bilateral_opacities(self, text):
        """Detect bilateral opacities using enhanced pattern matching.

        Args:
            text: raw report text (cleaned internally).

        Returns:
            dict with keys:
                has_bilateral_opacities (bool): at least one bilateral
                    pattern matched and no exclusion pattern matched.
                confidence (float): 0.0-1.0. Each matched pattern adds 0.3,
                    plus 0.2 for every high-confidence term ("bilateral",
                    "diffuse", "ground glass") present in that pattern's
                    matched text; the sum is capped at 1.0.
                matched_patterns (list[dict]): ``{'pattern', 'matches'}``
                    for each bilateral pattern that matched.
                excluded_by (list[str]): exclusion patterns that matched.
                reason (str): 'negated', 'detected' or 'not_found'.
        """
        clean_text = self.clean_radiology_text(text)

        # Exclusions are checked first: any negation phrase anywhere in the
        # report short-circuits detection for the whole report.
        exclusions = [
            pattern for pattern in self.exclusion_patterns
            if re.search(pattern, clean_text)
        ]
        if exclusions:
            return {
                'has_bilateral_opacities': False,
                'confidence': 0.0,
                'matched_patterns': [],
                'excluded_by': exclusions,
                'reason': 'negated'
            }

        # Check for bilateral opacity patterns. group(0) is used so that
        # 'matches' always holds the full matched strings, even if a pattern
        # with capturing groups is added later.
        matched_patterns = []
        for pattern in self.bilateral_patterns:
            matches = [m.group(0) for m in re.finditer(pattern, clean_text)]
            if matches:
                matched_patterns.append({
                    'pattern': pattern,
                    'matches': matches
                })

        has_bilateral = bool(matched_patterns)

        confidence = 0.0
        if has_bilateral:
            # Start from the per-pattern base score, then add term bonuses.
            confidence = self._BASE_CONFIDENCE_PER_PATTERN * len(matched_patterns)
            for pattern_info in matched_patterns:
                matched_text = ' '.join(pattern_info['matches'])
                for term in self._HIGH_CONFIDENCE_TERMS:
                    if re.search(term, matched_text):
                        confidence += self._TERM_BONUS
            confidence = min(confidence, 1.0)

        return {
            'has_bilateral_opacities': has_bilateral,
            'confidence': confidence,
            'matched_patterns': matched_patterns,
            'excluded_by': [],
            'reason': 'detected' if has_bilateral else 'not_found'
        }

    def detect_chf_exclusion(self, text):
        """Detect CHF/cardiogenic causes for exclusion.

        Args:
            text: raw report text (cleaned internally).

        Returns:
            dict with keys:
                has_chf (bool): at least one CHF pattern matched.
                matched_patterns (list[dict]): ``{'pattern', 'matches'}``
                    for each CHF pattern that matched. Overlapping patterns
                    (e.g. "congestive heart failure" and "heart failure")
                    are each listed.
                reason (str): 'chf_detected' or 'no_chf'.
        """
        clean_text = self.clean_radiology_text(text)

        matched_chf = []
        for pattern in self.chf_patterns:
            matches = [m.group(0) for m in re.finditer(pattern, clean_text)]
            if matches:
                matched_chf.append({
                    'pattern': pattern,
                    'matches': matches
                })

        has_chf = bool(matched_chf)

        return {
            'has_chf': has_chf,
            'matched_patterns': matched_chf,
            'reason': 'chf_detected' if has_chf else 'no_chf'
        }

# Build the shared detector instance and report how many rules it carries
detector = BerlinARDSDetector()
print("ARDSFlag detector initialized!")

n_bilateral_rules = len(detector.bilateral_patterns)
n_exclusion_rules = len(detector.exclusion_patterns)
n_chf_rules = len(detector.chf_patterns)
print(f"Bilateral patterns: {n_bilateral_rules}")
print(f"Exclusion patterns: {n_exclusion_rules}")
print(f"CHF patterns: {n_chf_rules}")

## 3. Test Pattern Detection on Sample Reports

In [None]:
# Run both detectors over the first five chest reports and print the verdicts
print("=== TESTING BILATERAL OPACITY DETECTION ===")

sample_reports = chest_reports.head(5)
for note_id, report_text in zip(sample_reports['note_id'], sample_reports['text']):
    bilateral_result = detector.detect_bilateral_opacities(report_text)
    chf_result = detector.detect_chf_exclusion(report_text)
    is_bilateral = bilateral_result['has_bilateral_opacities']

    print(f"\n--- Report ID: {note_id} ---")
    print(f"Bilateral opacities: {is_bilateral} (confidence: {bilateral_result['confidence']:.2f})")
    print(f"CHF detected: {chf_result['has_chf']}")
    # The pattern count is only meaningful for positive reports
    if is_bilateral:
        print(f"Matched patterns: {len(bilateral_result['matched_patterns'])}")
    print("Text preview:", report_text[:200] + "...")

## 4. Large-Scale Analysis on Extended Dataset

In [None]:
# Read the first 20,000 radiology notes and keep only the chest-imaging ones
print("=== LARGE-SCALE BILATERAL OPACITY ANALYSIS ===")

rad_large = pd.read_csv(RAD_PATH, nrows=20000)

# Keyword regex over the report text; case-insensitive, missing text -> False
chest_keywords = 'chest.*x.*ray|chest.*pa.*lat|chest.*film|portable.*chest|thorax'
chest_mask = rad_large['text'].str.contains(chest_keywords, case=False, na=False)
chest_reports_large = rad_large.loc[chest_mask].copy()

n_loaded = len(rad_large)
n_chest = len(chest_reports_large)
print(f"Large sample size: {n_loaded:,} radiology reports")
print(f"Chest imaging reports: {n_chest:,}")
print("Processing for bilateral opacity detection...")

In [None]:
# Score every chest report with both detectors and collect one row per report
def _summarize_report(row):
    """Return the flat result record for a single chest report row."""
    report_text = row['text']
    bilateral = detector.detect_bilateral_opacities(report_text)
    chf = detector.detect_chf_exclusion(report_text)
    return {
        'note_id': row['note_id'],
        'subject_id': row['subject_id'],
        'hadm_id': row['hadm_id'],
        'charttime': row['charttime'],
        'has_bilateral_opacities': bilateral['has_bilateral_opacities'],
        'bilateral_confidence': bilateral['confidence'],
        'bilateral_patterns': len(bilateral['matched_patterns']),
        'has_chf': chf['has_chf'],
        'chf_patterns': len(chf['matched_patterns']),
        'text_length': len(report_text)
    }


results = [_summarize_report(row) for _, row in chest_reports_large.iterrows()]

results_df = pd.DataFrame(results)
print(f"Processing complete! Results shape: {results_df.shape}")

In [None]:
# Headline counts and rates for the detection run
print("=== BILATERAL OPACITIES DETECTION RESULTS ===")

bilateral_flags = results_df['has_bilateral_opacities']
chf_flags = results_df['has_chf']
print(f"Total reports analyzed: {len(results_df):,}")
print(f"Reports with bilateral opacities: {bilateral_flags.sum():,} ({bilateral_flags.mean():.1%})")
print(f"Reports with CHF mentions: {chf_flags.sum():,} ({chf_flags.mean():.1%})")

# Confidence statistics are only reported when there is at least one positive
positive_cases = results_df.loc[bilateral_flags]
if len(positive_cases):
    positive_conf = positive_cases['bilateral_confidence']
    print("\nConfidence distribution for positive cases:")
    print(f"Mean confidence: {positive_conf.mean():.2f}")
    print(f"Median confidence: {positive_conf.median():.2f}")
    print(f"High confidence cases (≥0.5): {(positive_conf >= 0.5).sum():,}")

# Distinct patients / admissions among the positive reports
n_subjects = positive_cases['subject_id'].nunique()
n_admissions = positive_cases['hadm_id'].nunique()
print(f"\nUnique subjects with bilateral opacities: {n_subjects:,}")
print(f"Unique admissions with bilateral opacities: {n_admissions:,}")

## 5. Examples of Detected Bilateral Opacities

In [None]:
# Print a few positive reports, preferring those with confidence >= 0.4 and
# falling back to any positive report when none reach that threshold
print("=== EXAMPLES OF DETECTED BILATERAL OPACITIES ===")

is_positive = results_df['has_bilateral_opacities']
meets_threshold = results_df['bilateral_confidence'] >= 0.4
high_conf_examples = results_df[is_positive & meets_threshold].head(3)

if len(high_conf_examples) == 0:
    print("No high-confidence examples found in this sample.")
    # Fall back to the first two positives regardless of confidence
    any_positive = results_df[is_positive].head(2)
    if len(any_positive) > 0:
        print("\nShowing lower-confidence examples:")
        examples_with_text = chest_reports_large.merge(
            any_positive[['note_id', 'bilateral_confidence']],
            on='note_id'
        )

        for number, (_, row) in enumerate(examples_with_text.iterrows(), start=1):
            print(f"\n--- Example {number} (Confidence: {row['bilateral_confidence']:.2f}) ---")
            print(f"Report ID: {row['note_id']}")
            print("Text preview:", row['text'][:300] + "...")
else:
    # Join back to the report table to recover the full text
    examples_with_text = chest_reports_large.merge(
        high_conf_examples[['note_id', 'bilateral_confidence']],
        on='note_id'
    )

    for number, (_, row) in enumerate(examples_with_text.iterrows(), start=1):
        print(f"\n--- Example {number} (Confidence: {row['bilateral_confidence']:.2f}) ---")
        print(f"Report ID: {row['note_id']}")
        print(f"Subject ID: {row['subject_id']}")

        # Start the excerpt at the FINDINGS section when the report has one
        text = row['text']
        findings_start = text.upper().find('FINDINGS')
        if findings_start == -1:
            relevant_text = text[:500]
        else:
            relevant_text = text[findings_start:findings_start + 500]

        print("Relevant text:", relevant_text + "...")

        # Re-run detection to list the phrases that triggered the match
        bilateral_result = detector.detect_bilateral_opacities(text)
        print(f"Detected patterns: {len(bilateral_result['matched_patterns'])}")
        for pattern_info in bilateral_result['matched_patterns']:
            print(f"  - Found: {pattern_info['matches']}")

## 6. Save Results for Next Steps

In [None]:
# Save detection results
output_path = '../data/bilateral_opacity_detection_results.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")
print(f"Shape: {results_df.shape}")

# Summary for next notebook
# NOTE(review): the "potential ARDS cases" figure below counts *reports*
# flagged for bilateral opacities, not distinct patients or confirmed ARDS.
print("\n=== SUMMARY FOR NEXT STEPS ===")
print(f"✅ Bilateral opacity detection implemented and tested")
print(f"✅ Found {results_df['has_bilateral_opacities'].sum():,} potential ARDS cases")
print(f"✅ {results_df['subject_id'].nunique():,} unique subjects in dataset")
print(f"")
print(f"🎯 Next steps:")
print(f"   1. Extract ventilator parameters (P/F ratio, PEEP)")
print(f"   2. Add BMI/obesity classification")
print(f"   3. Define clinical outcomes")
print(f"   4. Statistical analysis of obesity-plateau pressure interaction")