In [1]:
import os
import tarfile
import requests
import xml.etree.ElementTree as ET
from Chest_Xray_Report import logger
import pandas as pd



In [2]:
os.chdir("../")
%pwd

'd:\\FInal_project\\Chest_xRay_report'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    image_URL: str
    report_URL: str
    image_data_file: Path
    report_data_file: Path
    unzip_dir_image: Path
    unzip_dir_report: Path

In [3]:
from Chest_Xray_Report.constants import *
from Chest_Xray_Report.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(
            self,
            config_file_path = CONFIG_FILE_PATH,
            parms_file_path = PARAMS_FILE_PATH     
            ):
        self.config = read_yaml(config_file_path)
        self.parms =  read_yaml(parms_file_path)

        create_directories([self.config.artifacts_root])

    
    def get_data_ingestion_config(self) ->DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            image_URL=config.image_URL,
            report_URL=config.report_URL,
            image_data_file=config.image_data_file,
            report_data_file=config.report_data_file,
            unzip_dir_image = config.unzip_dir_image,
            unzip_dir_report = config.unzip_dir_report
        )

        return data_ingestion_config
        
        

In [4]:
def download_file(url, dest_path):
    """Download file from URL to dest_path if not already present."""
    if os.path.exists(dest_path):
        logger.info(f"File already exists: {dest_path}")
        print(f"[INFO] File already exists: {dest_path}")
        return dest_path  # ✅ return even if exists

    logger.info(f"[INFO] Downloading from {url}...")
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        raise Exception(f"Failed to download {url} (status {response.status_code})")

    with open(dest_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    logger.info(f"[INFO] Saved: {dest_path}")
    return dest_path  # ✅ return file path


In [5]:
def extract_tgz(tgz_path, extract_to):
    if not os.path.exists(tgz_path):
        raise FileNotFoundError(f"File not found: {tgz_path}")
    if os.path.getsize(tgz_path) == 0:
        raise ValueError(f"File is empty: {tgz_path}")

    print(f"[INFO] Extracting {tgz_path}...")
    with tarfile.open(tgz_path, "r:gz") as tar:
        tar.extractall(path=extract_to)
    print(f"[INFO] Extracted to {extract_to}")


In [None]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):
        self.config = config

    def start_download_extract(self):
        try:
            data_image_url = self.config.image_URL
            data_report_url = self.config.report_URL
            image_dir_down = self.config.image_data_file
            report_dir_down = self.config.report_data_file
            unzip_dir_image = self.config.unzip_dir_image
            unzip_dir_reports = self.config.unzip_dir_report

            os.makedirs("artifacts/data_ingestion", exist_ok=True)
            os.makedirs("artifacts/data_ingestion/image", exist_ok=True)
            os.makedirs("artifacts/data_ingestion/report", exist_ok=True)

            logger.info("downloading the data")

            image_dir_down = download_file(data_image_url,image_dir_down)
            report_dir_down = download_file(data_report_url,report_dir_down)

            extract_tgz(image_dir_down, unzip_dir_image)
            extract_tgz(report_dir_down, unzip_dir_reports)

            
        except Exception as e:
            raise e

    def remove_folder(self):
        ecgen_path = os.path.join(self.config.unzip_dir_report, "ecgen-radiology")
        if os.path.exists(ecgen_path):
            for file in os.listdir(ecgen_path):
                if file.endswith(".xml"):
                    os.rename(os.path.join(ecgen_path, file), os.path.join(self.config.unzip_dir_report, file))
            os.rmdir(ecgen_path)

    import os
import xml.etree.ElementTree as ET

def convert_to_csv(self):
    # Ensure report folder exists
    if not os.path.exists(self.config.unzip_dir_report):
        raise FileNotFoundError(f"Report folder not found: {self.config.unzip_dir_report}")

    reports = os.listdir(self.config.unzip_dir_report)
    reports.sort()

    reports_with_no_image = []
    reports_with_empty_sections = []
    reports_with_no_impression = []
    reports_with_no_findings = []

    images_captions = {}
    reports_with_images = {}
    text_of_reports = {}

    for report in reports:
        report_path = os.path.join(self.config.unzip_dir_report, report)
        if not os.path.isfile(report_path):
            continue  # skip folders

        try:
            tree = ET.parse(report_path)
            root = tree.getroot()
        except ET.ParseError:
            print(f"Skipping invalid XML: {report}")
            continue

        img_ids = []
        impression = None
        findings = None

        # Find the images of the report
        images = root.findall("parentImage")
        if len(images) == 0:
            reports_with_no_image.append(report)
            continue

        # Extract sections
        sections = root.find("MedlineCitation").find("Article").find("Abstract").findall("AbstractText")
        for section in sections:
            label = section.get("Label")
            if label == "FINDINGS":
                findings = section.text
            elif label == "IMPRESSION":
                impression = section.text

        if impression is None and findings is None:
            reports_with_empty_sections.append(report)
            continue

        if impression is None:
            reports_with_no_impression.append(report)
            caption = findings
        elif findings is None:
            reports_with_no_findings.append(report)
            caption = impression
        else:
            caption = f"{impression} {findings}"

        for image in images:
            image_name = f"{image.get('id')}.png"
            images_captions[image_name] = caption
            img_ids.append(image_name)

        reports_with_images[report] = img_ids
        text_of_reports[report] = caption

    print("Found", len(reports_with_no_image), "reports with no associated image")
    print("Found", len(reports_with_empty_sections), "reports with empty Impression and Findings sections")
    print("Found", len(reports_with_no_impression), "reports with no Impression section")
    print("Found", len(reports_with_no_findings), "reports with no Findings section")
    print("Collected", len(images_captions), "image-caption pairs")

        

In [13]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.start_download_extract()
    data_ingestion.remove_folder()
    data_ingestion.convert_to_csv()
except Exception as e:
    raise e

FileNotFoundError: [Errno 2] No such file or directory: 'config\\config.yaml'

In [7]:


try:
    # Works when running as a .py file
    project_root = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Fallback for Jupyter or interactive mode
    project_root = os.getcwd()

folder_path = os.path.join(project_root, "artifacts", "data_ingestion", "report")
os.makedirs(folder_path, exist_ok=True)


reports = os.listdir(folder_path)
reports.sort()

reports_with_no_image = []
reports_with_empty_sections = []
reports_with_no_impression = []
reports_with_no_findings = []

images_captions = {}
reports_with_images = {}
text_of_reports = {}

for report in reports:

    tree = ET.parse(os.path.join(folder_path, report))
    root = tree.getroot()
    img_ids = []
    # find the images of the report
    images = root.findall("parentImage")
    # if there aren't any ignore the report
    if len(images) == 0:
        reports_with_no_image.append(report)
    else:
        sections = root.find("MedlineCitation").find("Article").find("Abstract").findall("AbstractText")
        # find impression and findings sections
        for section in sections:
            if section.get("Label") == "FINDINGS":
                findings = section.text
            if section.get("Label") == "IMPRESSION":
                impression = section.text

        if impression is None and findings is None:
            reports_with_empty_sections.append(report)
        else:
            if impression is None:
                reports_with_no_impression.append(report)
                caption = findings
            elif findings is None:
                reports_with_no_findings.append(report)
                caption = impression
            else:
                caption = impression + " " + findings

            for image in images:
                images_captions[image.get("id") + ".png"] = caption
                img_ids.append(image.get("id") + ".png")

            reports_with_images[report] = img_ids
            text_of_reports[report] = caption

print("Found", len(reports_with_no_image), "reports with no associated image")
print("Found", len(reports_with_empty_sections), "reports with empty Impression and Findings sections")
print("Found", len(reports_with_no_impression), "reports with no Impression section")
print("Found", len(reports_with_no_findings), "reports with no Findings section")

print("Collected", len(images_captions), "image-caption pairs")

Found 0 reports with no associated image
Found 0 reports with empty Impression and Findings sections
Found 0 reports with no Impression section
Found 0 reports with no Findings section
Collected 0 image-caption pairs


In [48]:
manifest_data = []
reports = [f for f in os.listdir(reports_path) if f.endswith(".xml")]
reports.sort()

reports_with_no_image = []
reports_with_empty_sections = []
reports_with_no_impression = []
reports_with_no_findings = []

images_captions = {}
reports_with_images = {}
text_of_reports = {}



for report in reports:
    tree = ET.parse(os.path.join(reports_path, report))
    root = tree.getroot()

    images = root.findall("parentImage")
    if not images:
        reports_with_no_image.append(report)
        continue

    findings = None
    impression = None
    sections = root.find("MedlineCitation").find("Article").find("Abstract").findall("AbstractText")
    for section in sections:
        label = section.get("Label")
        if label == "FINDINGS":
            findings = section.text
        elif label == "IMPRESSION":
            impression = section.text

    if impression is None and findings is None:
        reports_with_empty_sections.append(report)
        continue

    if impression is None:
        reports_with_no_impression.append(report)
        caption = findings
    elif findings is None:
        reports_with_no_findings.append(report)
        caption = impression
    else:
        caption = (impression or "") + " " + (findings or "")

    img_ids = []
    for image in images:
        img_file = image.get("id") + ".png"
        img_path = os.path.join(images_path, img_file)
        if os.path.exists(img_path):
            manifest_data.append([img_path, caption])

    reports_with_images[report] = img_ids
    text_of_reports[report] = caption
manifest_df = pd.DataFrame(manifest_data, columns=["image_path", "caption"])
manifest_df.to_csv("iu_xray_manifest.csv", index=False)
print(f"[INFO] Saved manifest with {len(manifest_df)} image-caption pairs")




[INFO] Saved manifest with 7430 image-caption pairs


In [49]:
print(f"Found {len(reports_with_no_image)} reports with no associated image")
print(f"Found {len(reports_with_empty_sections)} reports with empty Impression and Findings sections")
print(f"Found {len(reports_with_no_impression)} reports with no Impression section")
print(f"Found {len(reports_with_no_findings)} reports with no Findings section")
print(f"Collected {len(images_captions)} image-caption pairs")

Found 104 reports with no associated image
Found 25 reports with empty Impression and Findings sections
Found 6 reports with no Impression section
Found 489 reports with no Findings section
Collected 0 image-caption pairs
