In [1]:
from pathlib import Path
import shutil
import random

# === Base configuration ===

# This notebook lives in .../final_project_computer_vision/notebooks,
# so the project root (BASE_DIR) is the parent of the working directory.
BASE_DIR = Path.cwd().parent

# Root of the original dataset exported from Roboflow
# (matches the folder layout shown in your screenshot).
DATASET_ROOT = BASE_DIR.joinpath("dataset")

# Root of the new dataset produced by the 80/10/10 re-split.
OUT_ROOT = BASE_DIR.joinpath("dataset_resplit_80_10_10")

# Echo the resolved locations so a wrong working directory is spotted early.
print("BASE_DIR:", BASE_DIR)
print("DATASET_ROOT:", DATASET_ROOT)
print("OUT_ROOT:", OUT_ROOT)


BASE_DIR: c:\dev\final_project_computer_vision
DATASET_ROOT: c:\dev\final_project_computer_vision\dataset
OUT_ROOT: c:\dev\final_project_computer_vision\dataset_resplit_80_10_10


In [2]:
# Image file extensions (lowercase) that are treated as dataset images.
IMG_EXTS = {".jpg", ".jpeg", ".png"}

def collect_all_pairs(dataset_root=None):
    """Collect every (image, label) path pair from the source dataset.

    The ``train``, ``valid`` and ``test`` sub-folders are scanned, each
    expected to contain sibling ``images`` and ``labels`` directories.
    An image is kept only when a label file with the same stem and a
    ``.txt`` extension exists next to it; images without a label are
    reported and skipped, as are missing image directories.

    Directory entries are visited in sorted order. ``Path.iterdir()`` gives
    no ordering guarantee, and the seeded shuffle applied to this list later
    is only reproducible when the input order itself is stable.

    Args:
        dataset_root: Root folder of the dataset to scan. Defaults to the
            notebook-level ``DATASET_ROOT``.

    Returns:
        list[tuple[Path, Path]]: ``(image_path, label_path)`` pairs, ordered
        by split (train, valid, test) and then by image path.
    """
    root = DATASET_ROOT if dataset_root is None else Path(dataset_root)

    found = []
    for split in ("train", "valid", "test"):
        img_dir = root / split / "images"
        print("Scanning:", img_dir)
        if not img_dir.is_dir():
            print(" ⚠️  Directory not found:", img_dir)
            continue

        lbl_dir = img_dir.parent / "labels"
        # Sorted so the result does not depend on filesystem listing order.
        for img_path in sorted(img_dir.iterdir()):
            if img_path.suffix.lower() not in IMG_EXTS:
                continue

            label_path = lbl_dir / (img_path.stem + ".txt")
            if not label_path.exists():
                print(" ⚠️  No label for image:", img_path.name)
                continue

            found.append((img_path, label_path))

    return found

# Gather all image/label pairs from the source dataset and report the total.
pairs = collect_all_pairs()
print("Total image+label pairs found:", len(pairs))
pairs[:3]  # peek at a few examples


Scanning: c:\dev\final_project_computer_vision\dataset\train\images
Scanning: c:\dev\final_project_computer_vision\dataset\valid\images
Scanning: c:\dev\final_project_computer_vision\dataset\test\images
Total image+label pairs found: 3735


[(WindowsPath('c:/dev/final_project_computer_vision/dataset/train/images/BikesHelmets100_png_jpg.rf.14401dde2d6c7e132e0e14c2da052a08.jpg'),
  WindowsPath('c:/dev/final_project_computer_vision/dataset/train/labels/BikesHelmets100_png_jpg.rf.14401dde2d6c7e132e0e14c2da052a08.txt')),
 (WindowsPath('c:/dev/final_project_computer_vision/dataset/train/images/BikesHelmets100_png_jpg.rf.1ac1fe36fbf3a4987e73c1d990a03225.jpg'),
  WindowsPath('c:/dev/final_project_computer_vision/dataset/train/labels/BikesHelmets100_png_jpg.rf.1ac1fe36fbf3a4987e73c1d990a03225.txt')),
 (WindowsPath('c:/dev/final_project_computer_vision/dataset/train/images/BikesHelmets100_png_jpg.rf.221243c923da3253e4f818d357626d91.jpg'),
  WindowsPath('c:/dev/final_project_computer_vision/dataset/train/labels/BikesHelmets100_png_jpg.rf.221243c923da3253e4f818d357626d91.txt'))]

In [3]:
# Fixed seed so the shuffle below is reproducible.
random.seed(42)

n = len(pairs)
print("Total pairs:", n)

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
# The test split is computed as "whatever is left", so the three ratios must
# describe a full partition for test_ratio to be meaningful.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

n_train = int(train_ratio * n)
n_val = int(val_ratio * n)
# The remainder goes to test, so rounding never drops a sample.
n_test = n - n_train - n_val

print("Planned split:")
print("  Train:", n_train)
print("  Val  :", n_val)
print("  Test :", n_test)

# Shuffle a copy: shuffling `pairs` in place would make this cell
# non-idempotent (re-running it would reshuffle an already shuffled list and
# yield a different split despite the fixed seed).
shuffled_pairs = list(pairs)
random.shuffle(shuffled_pairs)

train_pairs = shuffled_pairs[:n_train]
val_pairs = shuffled_pairs[n_train:n_train + n_val]
test_pairs = shuffled_pairs[n_train + n_val:]

# Every pair must land in exactly one split.
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == n

len(train_pairs), len(val_pairs), len(test_pairs)


Total pairs: 3735
Planned split:
  Train: 2988
  Val  : 373
  Test : 374


(2988, 373, 374)

In [4]:
def copy_split(split_name, split_pairs, out_root=None):
    """Copy the image/label pairs of one split into the output dataset.

    Files are copied (with metadata, via ``shutil.copy2``) into
    ``<out_root>/<split_name>/images`` and ``<out_root>/<split_name>/labels``,
    keeping their original file names. Both directories are created if
    needed.

    Because the pairs can originate from different source folders, two
    different source files may map to the same destination name. That would
    silently overwrite one sample, so the whole plan is validated before
    anything is written.

    Args:
        split_name: Name of the split sub-folder, e.g. ``"train"``.
        split_pairs: Iterable of ``(image_path, label_path)`` tuples.
        out_root: Root folder of the output dataset. Defaults to the
            notebook-level ``OUT_ROOT``.

    Raises:
        ValueError: If two different source files would be written to the
            same destination path.
    """
    root = OUT_ROOT if out_root is None else Path(out_root)
    img_out = root / split_name / "images"
    lbl_out = root / split_name / "labels"

    # Map destination -> source first so collisions are caught before any
    # directory is created or file is copied.
    planned = {}
    for img_path, lbl_path in split_pairs:
        for src, dst in ((img_path, img_out / img_path.name),
                         (lbl_path, lbl_out / lbl_path.name)):
            previous = planned.setdefault(dst, src)
            if previous != src:
                raise ValueError(
                    f"Name collision in split '{split_name}': "
                    f"{previous} and {src} would both be copied to {dst}"
                )

    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)

    for dst, src in planned.items():
        shutil.copy2(src, dst)

    print(f"{split_name}: {len(split_pairs)} images copied to {img_out}")

# Remove the old output folder if it exists (careful: this deletes it).
# Without this, files from a previous run with a different split stay behind
# and can end up in more than one of train/val/test.
if OUT_ROOT.exists():
    # Never delete the source dataset by accident.
    if OUT_ROOT.resolve() == DATASET_ROOT.resolve():
        raise ValueError("OUT_ROOT must differ from DATASET_ROOT; refusing to delete the source dataset.")
    print("⚠️ OUT_ROOT already exists, we will overwrite content.")
    shutil.rmtree(OUT_ROOT)
else:
    print("OUT_ROOT does not exist, will be created:", OUT_ROOT)

copy_split("train", train_pairs)
copy_split("val", val_pairs)
copy_split("test", test_pairs)


⚠️ OUT_ROOT already exists, we will overwrite content.
train: 2988 images copied to c:\dev\final_project_computer_vision\dataset_resplit_80_10_10\train\images
val: 373 images copied to c:\dev\final_project_computer_vision\dataset_resplit_80_10_10\val\images
test: 374 images copied to c:\dev\final_project_computer_vision\dataset_resplit_80_10_10\test\images


In [5]:
def count_images_labels(root, img_exts=None):
    """Print the number of images and label files in each split under ``root``.

    For each of ``train``, ``val`` and ``test`` the function looks at
    ``<root>/<split>/images`` and ``<root>/<split>/labels``. Images are
    matched by lowercased file suffix, the same rule used when the pairs are
    collected, so files such as ``photo.JPG`` are counted on case-sensitive
    filesystems too. A missing directory counts as zero.

    Args:
        root: Root folder (a ``Path``) of the dataset to inspect.
        img_exts: Iterable of image suffixes including the dot. Defaults to
            the notebook-level ``IMG_EXTS``.
    """
    exts = IMG_EXTS if img_exts is None else {ext.lower() for ext in img_exts}

    for split in ["train", "val", "test"]:
        img_dir = root / split / "images"
        lbl_dir = root / split / "labels"

        if img_dir.is_dir():
            n_img = sum(1 for p in img_dir.iterdir() if p.suffix.lower() in exts)
        else:
            n_img = 0
        n_lbl = sum(1 for _ in lbl_dir.glob("*.txt")) if lbl_dir.is_dir() else 0

        print(f"{split:5s} - images: {n_img}, labels: {n_lbl}")

# Print per-split image and label counts for the re-split dataset.
count_images_labels(OUT_ROOT)


train - images: 2988, labels: 2988
val   - images: 373, labels: 373
test  - images: 374, labels: 374


In [6]:
from pathlib import Path

# 1. Project root: the notebook runs from /notebooks, so go one level up.
BASE_DIR = Path.cwd().parent
print("BASE_DIR:", BASE_DIR)

# 2. Location of the YAML file to create in the project root.
yaml_path = BASE_DIR.joinpath("helmet.yaml")
print("YAML will be written to:", yaml_path)

# 3. YAML content, matching the structure of the new re-split dataset.
yaml_content = (
    "path: dataset_resplit_80_10_10\n"
    "\n"
    "train: train/images\n"
    "val: val/images\n"
    "test: test/images\n"
    "\n"
    "nc: 2\n"
    'names: ["Helmet", "No Helmet"]\n'
)

# 4. Write the file to disk.
yaml_path.write_text(yaml_content, encoding="utf-8")
print("helmet.yaml created!")


BASE_DIR: c:\dev\final_project_computer_vision
YAML will be written to: c:\dev\final_project_computer_vision\helmet.yaml
helmet.yaml created!


In [7]:
# Read the file back with the same encoding it was written with, so the
# check does not depend on the platform's default locale encoding.
print(yaml_path.read_text(encoding="utf-8"))


path: dataset_resplit_80_10_10

train: train/images
val: val/images
test: test/images

nc: 2
names: ["Helmet", "No Helmet"]

