# **Project SEE-DR: Diabetic Retinopathy Segmentation Data Pipeline**
Data is loaded, augmented, and saved into .pt files for training.

In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision
import albumentations as A
from albumentations import ToTensorV2
import matplotlib
import matplotlib.pyplot as plt
import cv2
import os
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## **1. Create DataFrames**
Extracts the image and mask paths from the folder. All data is extracted at once into one dataframe. Train-test 75/25 split is applied after we get the full dataframe.

**Important!: Files must have this exact structure:**  
```
cmac-segmentation (root)/
├── DDR-SEGMENTATION/
│   ├── train/
│   │   ├── image/
│   │   └── label/
│   │       ├── EX/
│   │       ├── HE/
│   │       ├── MA/
│   │       └── SE/
│   ├── test/
│   │   ├── image/
│   │   └── label/
│   │       ├── EX/
│   │       ├── HE/
│   │       ├── MA/
│   │       └── SE/
│   └── valid/
│       ├── image/
│       └── label/
│           ├── EX/
│           ├── HE/
│           ├── MA/
│           └── SE/
└── IDRID/
    ├── Original_Images/
    │   ├── test/
    │   └── train/
    └── Segmentation_Groundtruths/
        ├── test/
        │   ├── 1. Microaneurysms/
        │   ├── 2. Haemorrhages/
        │   ├── 3. Hard Exudates/
        │   ├── 4. Soft Exudates/
        │   └── 5. Optic Disk/
        └── train/
            ├── 1. Microaneurysms/
            ├── 2. Haemorrhages/
            ├── 3. Hard Exudates/
            ├── 4. Soft Exudates/
            └── 5. Optic Disk/
```

In [10]:
def build_dataframe(ddr_root: str, idrid_root: str):
    rows = []
    for root, dirs, files in os.walk(ddr_root):
        for f in files:
            if f.lower().endswith('.jpg') and os.path.basename(root) == "image":
                img_path = os.path.join(root, f)
                base = os.path.splitext(f)[0]
                label_root = root.replace("image", "label")

                ex = os.path.join(label_root, "EX", base + ".tif")
                he = os.path.join(label_root, "HE", base + ".tif")
                ma = os.path.join(label_root, "MA", base + ".tif")
                se = os.path.join(label_root, "SE", base + ".tif")

                rows.append({
                    "dataset": "DDR",
                    "image_path": img_path,
                    "ex_path": ex if os.path.exists(ex) else None,
                    "he_path": he if os.path.exists(he) else None,
                    "ma_path": ma if os.path.exists(ma) else None,
                    "se_path": se if os.path.exists(se) else None,
                    "od_path": None
                })
    for split in ["train", "test"]:
        img_dir = os.path.join(idrid_root, "Original_Images", split)
        if not os.path.exists(img_dir):
            continue

        for f in os.listdir(img_dir):
            if f.lower().endswith(('.jpg', '.png', '.jpeg', '.tif')):
                img_path = os.path.join(img_dir, f)
                base = os.path.splitext(f)[0]

                mask_base = os.path.join(idrid_root, "Segmentation_Groundtruths", split)

                ma = os.path.join(mask_base, "1. Microaneurysms", base + "_MA.tif")
                he = os.path.join(mask_base, "2. Haemorrhages", base + "_HE.tif")
                ex = os.path.join(mask_base, "3. Hard Exudates", base + "_EX.tif")
                se = os.path.join(mask_base, "4. Soft Exudates", base + "_SE.tif")
                od = os.path.join(mask_base, "5. Optic Disc", base + "_OD.tif")

                rows.append({
                    "dataset": "IDRID",
                    "image_path": img_path,
                    "ma_path": ma if os.path.exists(ma) else None,
                    "he_path": he if os.path.exists(he) else None,
                    "ex_path": ex if os.path.exists(ex) else None,
                    "se_path": se if os.path.exists(se) else None,
                    "od_path": od if os.path.exists(od) else None
                })

    df = pd.DataFrame(rows)
    return df

df = build_dataframe('DDR-SEGMENTATION', 'IDRID')


In [11]:
df.isna().sum()

dataset         0
image_path      0
ex_path         0
he_path         1
ma_path         0
se_path        41
od_path       757
dtype: int64

In [12]:
df.shape

(838, 7)

In [13]:
df.sample(10)

Unnamed: 0,dataset,image_path,ex_path,he_path,ma_path,se_path,od_path
637,DDR,DDR-SEGMENTATION/train/image/007-5510-300.jpg,DDR-SEGMENTATION/train/label/EX/007-5510-300.tif,DDR-SEGMENTATION/train/label/HE/007-5510-300.tif,DDR-SEGMENTATION/train/label/MA/007-5510-300.tif,DDR-SEGMENTATION/train/label/SE/007-5510-300.tif,
616,DDR,DDR-SEGMENTATION/train/image/007-3669-200.jpg,DDR-SEGMENTATION/train/label/EX/007-3669-200.tif,DDR-SEGMENTATION/train/label/HE/007-3669-200.tif,DDR-SEGMENTATION/train/label/MA/007-3669-200.tif,DDR-SEGMENTATION/train/label/SE/007-3669-200.tif,
537,DDR,DDR-SEGMENTATION/train/image/007-2580-100.jpg,DDR-SEGMENTATION/train/label/EX/007-2580-100.tif,DDR-SEGMENTATION/train/label/HE/007-2580-100.tif,DDR-SEGMENTATION/train/label/MA/007-2580-100.tif,DDR-SEGMENTATION/train/label/SE/007-2580-100.tif,
590,DDR,DDR-SEGMENTATION/train/image/007-1829-100.jpg,DDR-SEGMENTATION/train/label/EX/007-1829-100.tif,DDR-SEGMENTATION/train/label/HE/007-1829-100.tif,DDR-SEGMENTATION/train/label/MA/007-1829-100.tif,DDR-SEGMENTATION/train/label/SE/007-1829-100.tif,
346,DDR,DDR-SEGMENTATION/test/image/007-4467-200.jpg,DDR-SEGMENTATION/test/label/EX/007-4467-200.tif,DDR-SEGMENTATION/test/label/HE/007-4467-200.tif,DDR-SEGMENTATION/test/label/MA/007-4467-200.tif,DDR-SEGMENTATION/test/label/SE/007-4467-200.tif,
442,DDR,DDR-SEGMENTATION/train/image/007-3457-200.jpg,DDR-SEGMENTATION/train/label/EX/007-3457-200.tif,DDR-SEGMENTATION/train/label/HE/007-3457-200.tif,DDR-SEGMENTATION/train/label/MA/007-3457-200.tif,DDR-SEGMENTATION/train/label/SE/007-3457-200.tif,
108,DDR,DDR-SEGMENTATION/valid/image/007-7235-400.jpg,DDR-SEGMENTATION/valid/label/EX/007-7235-400.tif,DDR-SEGMENTATION/valid/label/HE/007-7235-400.tif,DDR-SEGMENTATION/valid/label/MA/007-7235-400.tif,DDR-SEGMENTATION/valid/label/SE/007-7235-400.tif,
400,DDR,DDR-SEGMENTATION/train/image/007-3372-200.jpg,DDR-SEGMENTATION/train/label/EX/007-3372-200.tif,DDR-SEGMENTATION/train/label/HE/007-3372-200.tif,DDR-SEGMENTATION/train/label/MA/007-3372-200.tif,DDR-SEGMENTATION/train/label/SE/007-3372-200.tif,
589,DDR,DDR-SEGMENTATION/train/image/007-5518-300.jpg,DDR-SEGMENTATION/train/label/EX/007-5518-300.tif,DDR-SEGMENTATION/train/label/HE/007-5518-300.tif,DDR-SEGMENTATION/train/label/MA/007-5518-300.tif,DDR-SEGMENTATION/train/label/SE/007-5518-300.tif,
617,DDR,DDR-SEGMENTATION/train/image/007-4747-200.jpg,DDR-SEGMENTATION/train/label/EX/007-4747-200.tif,DDR-SEGMENTATION/train/label/HE/007-4747-200.tif,DDR-SEGMENTATION/train/label/MA/007-4747-200.tif,DDR-SEGMENTATION/train/label/SE/007-4747-200.tif,


In [14]:
# Split the dataframe for testing and training
train_df, test_df = train_test_split(
    df,
    test_size = 0.25,
    random_state = 42,
    shuffle = True
)

train_df = train_df.reset_index(drop = True)
test_df  = test_df.reset_index(drop = True)

## **2. Augmentation Definitions**