# **Segmentation Data Pipeline**

In [None]:
import pandas as pd
import numpy as np
import torch
import torchvision
import albumentations as A
from albumentations import ToTensorV2
import matplotlib
import matplotlib.pyplot as plt
import cv2
import os

  from .autonotebook import tqdm as notebook_tqdm


## 1. **Create DataFrames**

In [5]:
def build_dataframe(ddr_root: str, idrid_root: str):
    rows = []
    for root, dirs, files in os.walk(ddr_root):
        for f in files:
            if f.lower().endswith('.jpg') and os.path.basename(root) == "image":
                img_path = os.path.join(root, f)
                base = os.path.splitext(f)[0]
                label_root = root.replace("image", "label")

                ex = os.path.join(label_root, "EX", base + ".tif")
                he = os.path.join(label_root, "HE", base + ".tif")
                ma = os.path.join(label_root, "MA", base + ".tif")
                se = os.path.join(label_root, "SE", base + ".tif")

                rows.append({
                    "dataset": "DDR",
                    "image_path": img_path,
                    "ex_path": ex if os.path.exists(ex) else None,
                    "he_path": he if os.path.exists(he) else None,
                    "ma_path": ma if os.path.exists(ma) else None,
                    "se_path": se if os.path.exists(se) else None,
                    "od_path": None
                })
    for split in ["train", "test"]:
        img_dir = os.path.join(idrid_root, "Original_Images", split)
        if not os.path.exists(img_dir):
            continue

        for f in os.listdir(img_dir):
            if f.lower().endswith(('.jpg', '.png', '.jpeg', '.tif')):
                img_path = os.path.join(img_dir, f)
                base = os.path.splitext(f)[0]

                mask_base = os.path.join(idrid_root, "Segmentation_Groundtruths", split)

                ma = os.path.join(mask_base, "1. Microaneurysms", base + "_MA.tif")
                he = os.path.join(mask_base, "2. Haemorrhages", base + "_HE.tif")
                ex = os.path.join(mask_base, "3. Hard Exudates", base + "_EX.tif")
                se = os.path.join(mask_base, "4. Soft Exudates", base + "_SE.tif")
                od = os.path.join(mask_base, "5. Optic Disc", base + "_OD.tif")

                rows.append({
                    "dataset": "IDRID",
                    "image_path": img_path,
                    "ma_path": ma if os.path.exists(ma) else None,
                    "he_path": he if os.path.exists(he) else None,
                    "ex_path": ex if os.path.exists(ex) else None,
                    "se_path": se if os.path.exists(se) else None,
                    "od_path": od if os.path.exists(od) else None
                })

    df = pd.DataFrame(rows)
    return df

df = build_dataframe('DDR-SEGMENTATION', 'IDRID')


In [8]:
df.isna().sum()

dataset         0
image_path      0
ex_path       149
he_path       150
ma_path       149
se_path       190
od_path       757
dtype: int64

In [10]:
df.shape

(838, 7)

In [11]:
df.sample(10)

Unnamed: 0,dataset,image_path,ex_path,he_path,ma_path,se_path,od_path
713,DDR,DDR-SEGMENTATION/train/image/007-3673-200.jpg,DDR-SEGMENTATION/train/label/EX/007-3673-200.tif,DDR-SEGMENTATION/train/label/HE/007-3673-200.tif,DDR-SEGMENTATION/train/label/MA/007-3673-200.tif,DDR-SEGMENTATION/train/label/SE/007-3673-200.tif,
775,IDRID,IDRID/Original_Images/train/IDRiD_37.jpg,IDRID/Segmentation_Groundtruths/train/3. Hard ...,IDRID/Segmentation_Groundtruths/train/2. Haemo...,IDRID/Segmentation_Groundtruths/train/1. Micro...,,IDRID/Segmentation_Groundtruths/train/5. Optic...
136,DDR,DDR-SEGMENTATION/valid/image/007-5944-300.jpg,,,,,
133,DDR,DDR-SEGMENTATION/valid/image/007-6258-300.jpg,,,,,
805,IDRID,IDRID/Original_Images/train/IDRiD_01.jpg,IDRID/Segmentation_Groundtruths/train/3. Hard ...,IDRID/Segmentation_Groundtruths/train/2. Haemo...,IDRID/Segmentation_Groundtruths/train/1. Micro...,,IDRID/Segmentation_Groundtruths/train/5. Optic...
198,DDR,DDR-SEGMENTATION/test/image/007-4159-200.jpg,DDR-SEGMENTATION/test/label/EX/007-4159-200.tif,DDR-SEGMENTATION/test/label/HE/007-4159-200.tif,DDR-SEGMENTATION/test/label/MA/007-4159-200.tif,DDR-SEGMENTATION/test/label/SE/007-4159-200.tif,
714,DDR,DDR-SEGMENTATION/train/image/007-6608-400.jpg,DDR-SEGMENTATION/train/label/EX/007-6608-400.tif,DDR-SEGMENTATION/train/label/HE/007-6608-400.tif,DDR-SEGMENTATION/train/label/MA/007-6608-400.tif,DDR-SEGMENTATION/train/label/SE/007-6608-400.tif,
632,DDR,DDR-SEGMENTATION/train/image/007-2378-100.jpg,DDR-SEGMENTATION/train/label/EX/007-2378-100.tif,DDR-SEGMENTATION/train/label/HE/007-2378-100.tif,DDR-SEGMENTATION/train/label/MA/007-2378-100.tif,DDR-SEGMENTATION/train/label/SE/007-2378-100.tif,
320,DDR,DDR-SEGMENTATION/test/image/007-6734-400.jpg,DDR-SEGMENTATION/test/label/EX/007-6734-400.tif,DDR-SEGMENTATION/test/label/HE/007-6734-400.tif,DDR-SEGMENTATION/test/label/MA/007-6734-400.tif,DDR-SEGMENTATION/test/label/SE/007-6734-400.tif,
70,DDR,DDR-SEGMENTATION/valid/image/007-5882-300.jpg,,,,,
