In [None]:
from datasets import load_dataset
import re

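# ---- 1. Log in to Hugging Face (only needs to be done once per session) ----
# To enable: add `from huggingface_hub import login`, replace the placeholder
# below with your HF token (with write access), and uncomment the call.
# login("hf_your_token_here")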

# ---- 2. Load dataset ----
# No `split` argument is passed, so all splits published in the Hub repo are
# loaded together.
dataset = load_dataset("jomoll/mimic-cxr-reports")

# ---- 3. Filter for non-empty findings ----
def has_findings(example):
    """Return True when ``example["findings_section"]`` holds non-whitespace text.

    A value of ``None`` or a string that is empty after stripping whitespace
    yields False.
    """
    findings = example["findings_section"]
    if findings is None:
        return False
    return findings.strip() != ""

# Keep only the examples for which has_findings returns True.
filtered_dataset = dataset.filter(has_findings)

# ---- 4. Remove Pleura, Lungs and Airways, Cardiovascular, and Tubes sections (cumulatively) ----
# A subsection header is one or more capitalized words on a single line,
# optionally joined by commas and "and", and terminated by a colon, e.g.
# "Pleura:", "Lungs and Airways:", "Tubes, Catheters, and Support Devices:".
# Matching the whole multi-word header (rather than only its last word) keeps
# a removal from running into the title of the section that follows it.
_SECTION_HEADER = r"[A-Z][a-z]+(?:,? (?:and )?[A-Z][a-z]+)*:"


def _section_pattern(title):
    """Compile a pattern spanning section *title* up to the next header or end of text."""
    return re.compile(
        re.escape(title) + r":.*?(?=" + _SECTION_HEADER + r"|$)",
        flags=re.DOTALL,
    )


# (pattern of the section to strip, output column written after that strip).
# Order matters: every column is cumulative over all removals listed before it.
_SECTION_REMOVALS = (
    (_section_pattern("Pleura"), "findings_no_pleura"),
    (_section_pattern("Lungs and Airways"), "findings_no_pleura_no_lungs"),
    (_section_pattern("Cardiovascular"), "findings_no_pleura_no_lungs_no_cardio"),
    (
        _section_pattern("Tubes, Catheters, and Support Devices"),
        "findings_no_pleura_no_lungs_no_cardio_no_tubes",
    ),
)

# Runs of blank (or whitespace-only) lines left behind by a removal.
_BLANK_LINES = re.compile(r"\n\s*\n")


def remove_pleura(example):
    """Add progressively stripped copies of the findings text to *example*.

    Starting from ``example["findings_section"]``, the "Pleura", "Lungs and
    Airways", "Cardiovascular" and "Tubes, Catheters, and Support Devices"
    subsections are removed one after another. After each removal the
    intermediate text is stored (with blank lines collapsed and outer
    whitespace stripped) under:

    * ``findings_no_pleura``
    * ``findings_no_pleura_no_lungs``
    * ``findings_no_pleura_no_lungs_no_cardio``
    * ``findings_no_pleura_no_lungs_no_cardio_no_tubes``

    A subsection extends from its header to the start of the next header
    (single- or multi-word) or to the end of the text. ``findings_section``
    itself is left unchanged.

    Args:
        example: Mapping with a string ``"findings_section"`` entry.

    Returns:
        The same *example* object with the four columns above added.
    """
    text = example["findings_section"]

    for pattern, column in _SECTION_REMOVALS:
        # Carry the un-normalized text forward so each removal sees the same
        # spacing the original report had; normalize only the stored copy.
        text = pattern.sub("", text)
        example[column] = _BLANK_LINES.sub("\n", text).strip()

    return example

# Apply remove_pleura to every example, which adds the four findings_no_*
# columns alongside the original findings_section.
processed_dataset = filtered_dataset.map(remove_pleura)

# ---- 5. Push to Hub ----
# Uploads the processed dataset to the Hub repo
# "jomoll/mimic-cxr-reports-no-pleura". The repo id is in the "jomoll"
# namespace, so the active token needs write access there; change the prefix
# to your own username or organization to publish under a different account.
processed_dataset.push_to_hub("jomoll/mimic-cxr-reports-no-pleura")
