In [1]:
# === Cell 1: Imports ===
from pathlib import Path
import shutil
from tqdm import tqdm

# === Cell 2: Define paths ===
# Source tree (holds the Train and Val subsets) and the flattened YOLO target tree.
raw_data = Path("../data/raw/Data_Brain")
output_data = Path("../data/data_yolo")

# Create <output_data>/{images,labels}/{train,val}. `exist_ok=True` makes
# re-running this cell harmless when the folders are already there.
for split_name in ("train", "val"):
    for kind in ("images", "labels"):
        (output_data / kind / split_name).mkdir(parents=True, exist_ok=True)


In [2]:
# === Cell 3: Helper function to process one subset ===
# Lower-cased file suffixes treated as images; anything else is ignored.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}


def prepare_subset(subset, src_root=None, dst_root=None):
    """Flatten one raw subset into the ``images/<split>`` + ``labels/<split>`` layout.

    The source is expected to look like
    ``<src_root>/<subset>/<class>/images/*`` and ``.../<class>/labels/<stem>.txt``.
    Every image that has a same-stem ``.txt`` label is copied (with metadata,
    via ``shutil.copy2``) into ``<dst_root>/images/<subset.lower()>`` and its
    label into ``<dst_root>/labels/<subset.lower()>``.

    Because all class folders are merged into a single directory, two classes
    that contain the same file stem would overwrite each other. When that
    happens the later pair is written as ``<class>_<stem>`` (spaces in the
    class name replaced by ``_``) instead, so no sample is lost and no image
    ends up next to another class's label. Files without collisions keep their
    original names.

    Args:
        subset: Name of the subset folder under the source root, e.g. "Train".
        src_root: Root holding the subset folders. Defaults to the notebook
            global ``raw_data``.
        dst_root: Root of the output tree. Defaults to the notebook global
            ``output_data``.

    Returns:
        ``(copied, skipped)``: number of image/label pairs copied and number
        of images skipped because their label file was missing.

    Raises:
        FileNotFoundError: if ``<src_root>/<subset>`` is not a directory.
    """
    src_root = raw_data if src_root is None else Path(src_root)
    dst_root = output_data if dst_root is None else Path(dst_root)

    src_folder = src_root / subset
    if not src_folder.is_dir():
        raise FileNotFoundError(f"Subset folder not found: {src_folder}")

    # Create the destination here so the function does not depend on another
    # cell having been run first.
    split = subset.lower()
    img_out = dst_root / "images" / split
    lbl_out = dst_root / "labels" / split
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)

    print(f"\nProcessing subset: {subset}")
    copied, skipped, renamed = 0, 0, 0
    unlabeled = []   # images that had no matching label file
    claimed = {}     # output stem -> source label file that owns that stem

    for class_dir in sorted(src_folder.iterdir()):
        if not class_dir.is_dir():
            continue
        img_dir = class_dir / "images"
        lbl_dir = class_dir / "labels"
        if not img_dir.exists() or not lbl_dir.exists():
            print(f" Missing folder in {class_dir}")
            continue

        prefix = class_dir.name.replace(" ", "_")
        # Sorted so that collision renames are the same on every run.
        for img_path in tqdm(sorted(img_dir.iterdir()), desc=f"{class_dir.name}"):
            if img_path.suffix.lower() not in IMG_EXTS:
                continue
            label_path = lbl_dir / f"{img_path.stem}.txt"
            if not label_path.exists():
                skipped += 1
                unlabeled.append(img_path)
                continue

            # A stem may be reused only by images sharing the very same label
            # file (e.g. a.jpg and a.png in one class). Otherwise keep
            # prefixing with the class name until the stem is free.
            out_stem = img_path.stem
            while claimed.setdefault(out_stem, label_path) != label_path:
                out_stem = f"{prefix}_{out_stem}"
            if out_stem != img_path.stem:
                renamed += 1

            shutil.copy2(img_path, img_out / f"{out_stem}{img_path.suffix}")
            shutil.copy2(label_path, lbl_out / f"{out_stem}.txt")
            copied += 1

    print(f" Copied {copied} valid pairs,  Skipped {skipped} missing labels.")
    if renamed:
        print(f" Renamed {renamed} pairs to avoid cross-class filename collisions.")
    for img_path in unlabeled:
        print(f"  no label for: {img_path}")
    return copied, skipped


In [3]:
# === Cell 4: Run preparation ===
# Build the YOLO layout for each raw subset in turn; the folder names are
# case-sensitive and must match the directories under `raw_data`.
subsets_to_prepare = ("Train", "Val")
for subset in subsets_to_prepare:
    prepare_subset(subset)



Processing subset: Train


Glioma: 100%|██████████| 1153/1153 [00:08<00:00, 133.21it/s]
Meningioma: 100%|██████████| 1449/1449 [00:11<00:00, 120.81it/s]
No Tumor: 100%|██████████| 711/711 [00:05<00:00, 127.90it/s]
Pituitary: 100%|██████████| 1424/1424 [00:11<00:00, 120.38it/s]


 Copied 4737 valid pairs,  Skipped 0 missing labels.

Processing subset: Val


Glioma: 100%|██████████| 136/136 [00:01<00:00, 120.67it/s]
Meningioma: 100%|██████████| 140/140 [00:01<00:00, 123.25it/s]
No Tumor: 100%|██████████| 100/100 [00:00<00:00, 118.89it/s]
Pituitary: 100%|██████████| 136/136 [00:01<00:00, 130.88it/s]

 Copied 510 valid pairs,  Skipped 2 missing labels.



