In [1]:
import pathlib
import shutil
import random
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# ------------------------------------------
# PATH CONFIGURATION
# ------------------------------------------
# Every location is expressed relative to the current working directory.
base_dir = pathlib.Path(".")
raw_data_dir = base_dir.joinpath('data valid merah')  # folder holding the original data
output_dir = base_dir.joinpath('dataset_final')       # destination folder (train/val/test)

# Seed shared by every random number generator used in this notebook.
SEED = 42
# Target number of samples per class (matches the screenshot).
TARGET_SAMPLES = 200

# Seed both NumPy's legacy global generator and the stdlib generator.
np.random.seed(SEED)
random.seed(SEED)

# ==========================================
# 1. FUNGSI AUGMENTASI AMAN (Rotasi Kecil/Flip)
# ==========================================
def augment_image_safe(image_path):
    """Load an image and apply one randomly chosen mild augmentation.

    One draw from ``random.random()`` picks the transform: a horizontal flip
    when the draw is above 0.5, otherwise a rotation about the image centre by
    an angle drawn uniformly from [-10, 10] degrees. The rotated image keeps
    the original width and height; uncovered corners are filled with black.

    Args:
        image_path: Location of the image; converted with ``str()`` before
            being handed to ``cv2.imread``.

    Returns:
        The augmented image as returned by OpenCV, or ``None`` when
        ``cv2.imread`` could not load the file.
    """
    picture = cv2.imread(str(image_path))
    if picture is None:
        return None

    # Coin toss between the two transforms.
    use_flip = random.random() > 0.5
    if use_flip:
        # Flip code 1 mirrors around the vertical axis (horizontal flip).
        return cv2.flip(picture, 1)

    # Small rotation between -10 and 10 degrees, scale left at 1.0.
    degrees = random.uniform(-10, 10)
    height, width = picture.shape[:2]
    centre = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(centre, degrees, 1.0)
    return cv2.warpAffine(
        picture,
        rotation,
        (width, height),
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(0, 0, 0),
    )

# ==========================================
# 2. PROSES UTAMA (SPLIT + OVERSAMPLE)
# ==========================================
def _copy_into(files, destination):
    """Copy every path in ``files`` into ``destination``, creating the folder first."""
    destination.mkdir(parents=True, exist_ok=True)
    for src in files:
        shutil.copy(str(src), str(destination / src.name))


def main():
    """Split the raw dataset into train/val/test and oversample small training classes.

    Reads the module-level configuration (``raw_data_dir``, ``output_dir``,
    ``TARGET_SAMPLES``, ``SEED``). Each sub-folder of ``raw_data_dir`` is
    treated as one class. Its PNG files are split 70% train and the remainder
    evenly into val and test (about 15% each), then copied into
    ``output_dir/<split>/<class>``. Any existing ``output_dir`` is deleted
    first. Training classes with fewer than ``TARGET_SAMPLES`` images are then
    topped up with augmented copies produced by ``augment_image_safe``.
    Classes already at or above the target are left untouched, and val/test
    never receive augmented images.

    Raises:
        FileNotFoundError: if ``raw_data_dir`` is not an existing directory.
        ValueError: if a class has too few PNG files for the two-stage split.
        OSError: if an augmented image cannot be written.
    """
    # Header banner
    print("="*60)
    print("OVERSAMPLING KELAS MINORITAS (AUGMENTASI AMAN)")
    print("="*60)
    print("Starting oversampling process...\n")

    # --- STAGE A: SPLITTING THE DATA ---
    # Validate the input before wiping anything, so that a wrong path cannot
    # destroy the output of a previous run and then crash.
    if not raw_data_dir.is_dir():
        raise FileNotFoundError(f"Raw data folder not found: {raw_data_dir}")

    # Remove the old output folder, if any, to start from a clean state.
    if output_dir.exists():
        shutil.rmtree(output_dir)

    # Create the folder structure.
    for split in ['train', 'val', 'test']:
        (output_dir / split).mkdir(parents=True, exist_ok=True)

    class_files = {}  # training file paths per class, used later for oversampling

    # Split the original data class by class.
    for class_folder in sorted(raw_data_dir.iterdir()):
        if not class_folder.is_dir():
            continue

        class_name = class_folder.name
        # Directory listing order is filesystem dependent; sorting makes the
        # seeded split reproducible across machines.
        # Adjust the pattern here (and in the final count below) for JPEG data.
        files = sorted(class_folder.glob('*.png'))

        if not files:
            # e.g. a stray ".ipynb_checkpoints" folder: nothing to split.
            print(f"  ⚠ Skipping folder {class_name}: no PNG files found")
            continue

        # Split 70% train, then the remainder half/half into val and test.
        try:
            train_f, temp_f = train_test_split(files, train_size=0.7, random_state=SEED)
            val_f, test_f = train_test_split(temp_f, train_size=0.5, random_state=SEED)
        except ValueError as exc:
            raise ValueError(
                f"Class {class_name} has only {len(files)} PNG file(s); "
                "not enough to build train/val/test splits"
            ) from exc

        # Remember the training files for the oversampling stage.
        class_files[class_name] = train_f

        # Copy the files into the destination folders; val and test stay untouched originals.
        _copy_into(train_f, output_dir / 'train' / class_name)
        _copy_into(val_f, output_dir / 'val' / class_name)
        _copy_into(test_f, output_dir / 'test' / class_name)

    # --- STAGE B: REPORT THE ORIGINAL TRAINING DISTRIBUTION ---
    print("Distribusi Data Train Original (Training Set):")
    for cls, files in class_files.items():
        print(f"  Kelas {cls}: {len(files)} samples")

    print(f"\nTarget samples per class: {TARGET_SAMPLES}\n")

    # --- STAGE C: OVERSAMPLING ---
    total_augmented = 0

    for class_name, files in class_files.items():
        print(f"Processing class {class_name}:")
        current_count = len(files)
        needed = TARGET_SAMPLES - current_count

        # Training folder of this class.
        target_class_dir = output_dir / 'train' / class_name

        if needed <= 0:
            print(f"  ✓ Class already meets target. Total: {current_count}")
            continue

        print(f"  Need to add: {needed} samples")

        # Unreadable files are removed from this pool, so the loop always
        # terminates even if every source image fails to load.
        candidates = list(files)
        created = 0
        while created < needed and candidates:
            # Pick a random original image.
            src_file = random.choice(candidates)

            aug_img = augment_image_safe(src_file)
            if aug_img is None:
                candidates.remove(src_file)
                print(f"  ⚠ Could not read {src_file.name}; excluded from augmentation")
                continue

            # The running counter keeps names unique even when a source repeats.
            save_path = target_class_dir / f"aug_{created}_{src_file.name}"
            if not cv2.imwrite(str(save_path), aug_img):
                # imwrite reports failure through its return value.
                raise OSError(f"Failed to write augmented image: {save_path}")
            created += 1
            total_augmented += 1

            # Progress message every 50 images.
            if created % 50 == 0:
                print(f"    Created {created}/{needed} samples...")

        if created < needed:
            print(f"  ⚠ Only {created}/{needed} samples created: no readable source images left")

        print(f"  ✓ Created {created} augmented samples")
        print(f"  Total in class: {current_count + created}\n")

    # --- FOOTER & FINAL SUMMARY ---
    print("="*60)
    print("OVERSAMPLING COMPLETED!")
    print("="*60 + "\n")

    print("Distribusi Data Train Setelah Oversampling (Training Set):")
    for class_name in class_files.keys():
        # Count the files actually present in the final folder.
        final_count = len(list((output_dir / 'train' / class_name).glob('*.png')))
        print(f"  Kelas {class_name}: {final_count} samples")

    print(f"\nTotal augmented samples created: {total_augmented}")
    print(f"\n✅ Oversampling berhasil! Data balanced tersimpan di: {output_dir}")

# Entry point: run the whole split + oversampling pipeline when this code is
# executed directly (the recorded output below comes from this call).
if __name__ == "__main__":
    main()

OVERSAMPLING KELAS MINORITAS (AUGMENTASI AMAN)
Starting oversampling process...

Distribusi Data Train Original (Training Set):
  Kelas 1: 136 samples
  Kelas 2: 265 samples
  Kelas 3: 167 samples
  Kelas 4: 53 samples

Target samples per class: 200

Processing class 1:
  Need to add: 64 samples
    Created 50/64 samples...
  ✓ Created 64 augmented samples
  Total in class: 200

Processing class 2:
  ✓ Class already meets target. Total: 265
Processing class 3:
  Need to add: 33 samples
  ✓ Created 33 augmented samples
  Total in class: 200

Processing class 4:
  Need to add: 147 samples
    Created 50/147 samples...
    Created 100/147 samples...
  ✓ Created 147 augmented samples
  Total in class: 200

OVERSAMPLING COMPLETED!

Distribusi Data Train Setelah Oversampling (Training Set):
  Kelas 1: 200 samples
  Kelas 2: 265 samples
  Kelas 3: 200 samples
  Kelas 4: 200 samples

Total augmented samples created: 244

✅ Oversampling berhasil! Data balanced tersimpan di: dataset_final
