In [1]:
import pathlib
import shutil
from collections import Counter
import random
import numpy as np
from PIL import Image
import cv2

In [5]:
# Configuration: input dataset folder and destination for the balanced copy
source_dir = pathlib.Path(r"C:\Users\PC\Documents\innar\data valid merah")
output_dir = pathlib.Path(r"C:\Users\PC\Documents\innar\data valid merah balanced")
# Alternative locations (kept for reference):
# source_dir = pathlib.Path(r"D:\SKRIPSI\skripsi\data valid merah")
# output_dir = pathlib.Path(r"D:\SKRIPSI\skripsi\data valid merah balanced")
seed = 42

# Seed both random number generators so repeated runs behave the same
np.random.seed(seed)
random.seed(seed)

# Banner: rule line, title, rule line
print("=" * 60, "OVERSAMPLING KELAS MINORITAS", "=" * 60, sep="\n")

def count_files_per_class(directory):
    """Return a dict mapping each class-folder name to its PNG count.

    Every immediate subdirectory of ``directory`` is treated as one class.
    Only entries matching ``*.png`` directly inside that subdirectory are
    counted; anything that is not a directory at the top level is ignored.
    """
    root = pathlib.Path(directory)
    return {
        subfolder.name: len(list(subfolder.glob('*.png')))
        for subfolder in root.iterdir()
        if subfolder.is_dir()
    }

def augment_image(image_path, augmentation_type='flip'):
    """
    Load an image and apply a single augmentation for oversampling.

    Args:
        image_path: Path of the image to read (passed to OpenCV as ``str``).
        augmentation_type: Name of the augmentation to apply:
            - 'flip': horizontal flip
            - 'rotate_90': rotate 90 degrees clockwise
            - 'rotate_270': rotate 90 degrees counter-clockwise
            - 'brightness': add 20 to the V channel in HSV space
            - 'contrast': apply CLAHE to the L channel in LAB space

    Returns:
        The augmented image array in OpenCV's BGR channel order.

    Raises:
        ValueError: If ``augmentation_type`` is not one of the names above.
        OSError: If OpenCV could not read ``image_path``.
    """
    supported_types = ('flip', 'rotate_90', 'rotate_270', 'brightness', 'contrast')
    # Reject unknown names up front; otherwise the untouched image would be
    # returned and silently saved as an "augmented" duplicate.
    if augmentation_type not in supported_types:
        raise ValueError(
            f"Unknown augmentation_type {augmentation_type!r}; "
            f"expected one of {supported_types}"
        )

    img = cv2.imread(str(image_path))
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    if augmentation_type == 'flip':
        img = cv2.flip(img, 1)  # flip code 1 = horizontal flip
    elif augmentation_type == 'rotate_90':
        img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
    elif augmentation_type == 'rotate_270':
        img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif augmentation_type == 'brightness':
        # Increase brightness by raising the V (value) channel
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        hsv[:, :, 2] = cv2.add(hsv[:, :, 2], 20)
        img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    else:  # 'contrast'
        # Increase contrast by equalizing only the lightness channel
        lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        l = clahe.apply(l)
        img = cv2.merge([l, a, b])
        img = cv2.cvtColor(img, cv2.COLOR_LAB2BGR)

    return img

def oversample_minority_classes(source_dir, output_dir, target_count=None):
    """
    Oversample minority classes by adding augmented copies of their images.

    Every class folder in ``source_dir`` is copied to ``output_dir``; classes
    with fewer than ``target_count`` PNG files receive additional augmented
    images (augmentation types are cycled, source images are picked at random
    with the global ``random`` module) until they reach the target.

    Args:
        source_dir: Source directory containing one subfolder per class.
        output_dir: Output directory. It is DELETED and recreated if it exists.
        target_count: Target number of samples per class
            (None = use the size of the largest class).

    Returns:
        ``output_dir`` as a ``pathlib.Path``.

    Raises:
        ValueError: If ``output_dir`` is the same as, contains, or lies inside
            ``source_dir``, or if ``target_count`` is None and no class
            folders exist.
        OSError: If an augmented image could not be written.
    """
    source_dir = pathlib.Path(source_dir)
    output_dir = pathlib.Path(output_dir)

    # output_dir is wiped with rmtree below, so refuse any layout where that
    # would delete the source data or where the output would be scanned as a
    # class folder of the source.
    resolved_source = source_dir.resolve()
    resolved_output = output_dir.resolve()
    if (
        resolved_source == resolved_output
        or resolved_output in resolved_source.parents
        or resolved_source in resolved_output.parents
    ):
        raise ValueError(
            f"output_dir ({output_dir}) must not be the same as, contain, "
            f"or be inside source_dir ({source_dir})"
        )

    # Count files per class
    class_counts = count_files_per_class(source_dir)

    print("\nDistribusi Data Original:")
    for class_name, count in sorted(class_counts.items()):
        print(f"  Kelas {class_name}: {count} samples")

    # Determine the target count (size of the majority class)
    if target_count is None:
        if not class_counts:
            raise ValueError(f"No class folders found in {source_dir}")
        target_count = max(class_counts.values())

    print(f"\nTarget samples per class: {target_count}")

    # Remove a previous output directory so stale files cannot linger
    if output_dir.exists():
        print("\nMenghapus folder output lama...")
        shutil.rmtree(output_dir)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Augmentation types, applied round-robin
    augmentation_types = ['flip', 'rotate_90', 'rotate_270', 'brightness', 'contrast']

    total_created = 0

    # Sorted traversal keeps the seeded random choices reproducible regardless
    # of the order in which the filesystem lists entries.
    for class_folder in sorted(source_dir.iterdir()):
        if not class_folder.is_dir():
            continue

        class_name = class_folder.name
        original_files = sorted(class_folder.glob('*.png'))
        current_count = len(original_files)

        print(f"\nProcessing class {class_name}:")
        print(f"  Current: {current_count} samples")

        # Create the output folder for this class
        output_class_dir = output_dir / class_name
        output_class_dir.mkdir(parents=True, exist_ok=True)

        # Copy all original files
        for f in original_files:
            shutil.copy2(str(f), str(output_class_dir / f.name))

        # Number of files that still have to be added
        needed_samples = target_count - current_count

        if needed_samples <= 0:
            print("  ✓ Sudah seimbang, tidak perlu oversampling")
            continue

        if not original_files:
            # Nothing to augment from; random.choice would raise IndexError.
            print("  ! No PNG files in this class, skipping oversampling")
            continue

        print(f"  Need to add: {needed_samples} samples")

        # Oversample with augmentation
        samples_created = 0

        while samples_created < needed_samples:
            # Pick a random original file
            source_file = random.choice(original_files)

            # Cycle through the augmentation types
            aug_type = augmentation_types[samples_created % len(augmentation_types)]

            augmented_img = augment_image(source_file, aug_type)

            # The running index keeps generated names unique within the class
            new_name = f"{source_file.stem}_aug{samples_created}_{aug_type}.png"
            output_path = output_class_dir / new_name

            # cv2.imwrite reports failure through its return value
            if not cv2.imwrite(str(output_path), augmented_img):
                raise OSError(f"Failed to write augmented image: {output_path}")

            samples_created += 1

            if samples_created % 20 == 0:
                print(f"    Created {samples_created}/{needed_samples} samples...")

        total_created += samples_created
        print(f"  ✓ Created {samples_created} augmented samples")
        print(f"  Total in class: {current_count + samples_created}")

    print("\n" + "="*60)
    print("OVERSAMPLING COMPLETED!")
    print("="*60)

    # Verify the result by recounting the output directory
    print("\nDistribusi Data Setelah Oversampling:")
    balanced_counts = count_files_per_class(output_dir)
    for class_name, count in sorted(balanced_counts.items()):
        original = class_counts.get(class_name, 0)
        added = count - original
        print(f"  Kelas {class_name}: {count} samples (original: {original}, added: {added})")

    print(f"\nTotal augmented samples created: {total_created}")
    print(f"Output directory: {output_dir}")

    return output_dir

# Run the augmentation-based oversampling and report the outcome
try:
    print("Starting oversampling process...")
    # target_count=214 matches the majority class (class 2)
    balanced_dir = oversample_minority_classes(source_dir, output_dir, target_count=214)

    print("\n✅ Oversampling berhasil!")
    print(f"Data balanced tersimpan di: {balanced_dir}")
    print("\nSekarang Anda bisa menggunakan folder ini untuk split data!")

except Exception as err:
    # Top-level boundary: show the message first, then the full traceback
    import traceback
    print(f"\n❌ Error: {err}")
    traceback.print_exc()


# ========== FUNGSI TAMBAHAN: OVERSAMPLING DENGAN COPY DUPLIKAT ==========
# Alternatif lebih sederhana jika tidak ingin augmentasi

def oversample_with_duplication(source_dir, output_dir, target_count=None):
    """
    Oversample by plain duplication (no augmentation).

    Faster than augmentation but adds no variety: every class folder of
    ``source_dir`` is copied to ``output_dir`` and randomly chosen originals
    are copied again under ``<stem>_dup<i>.png`` until the class reaches
    ``target_count``.

    Args:
        source_dir: Source directory containing one subfolder per class.
        output_dir: Output directory. It is DELETED and recreated if it exists.
        target_count: Target number of samples per class
            (None = use the size of the largest class).

    Returns:
        ``output_dir`` as a ``pathlib.Path``.

    Raises:
        ValueError: If ``output_dir`` is the same as, contains, or lies inside
            ``source_dir``, or if ``target_count`` is None and no class
            folders exist.
    """
    source_dir = pathlib.Path(source_dir)
    output_dir = pathlib.Path(output_dir)

    # output_dir is wiped with rmtree below, so refuse any layout where that
    # would delete the source data or where the output would be scanned as a
    # class folder of the source.
    resolved_source = source_dir.resolve()
    resolved_output = output_dir.resolve()
    if (
        resolved_source == resolved_output
        or resolved_output in resolved_source.parents
        or resolved_source in resolved_output.parents
    ):
        raise ValueError(
            f"output_dir ({output_dir}) must not be the same as, contain, "
            f"or be inside source_dir ({source_dir})"
        )

    class_counts = count_files_per_class(source_dir)

    if target_count is None:
        if not class_counts:
            raise ValueError(f"No class folders found in {source_dir}")
        target_count = max(class_counts.values())

    print(f"\nOversampling with duplication to {target_count} samples per class")

    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted traversal keeps the seeded random choices reproducible regardless
    # of the order in which the filesystem lists entries.
    for class_folder in sorted(source_dir.iterdir()):
        if not class_folder.is_dir():
            continue

        class_name = class_folder.name
        original_files = sorted(class_folder.glob('*.png'))
        current_count = len(original_files)

        output_class_dir = output_dir / class_name
        output_class_dir.mkdir(parents=True, exist_ok=True)

        # Copy original files
        for f in original_files:
            shutil.copy2(str(f), str(output_class_dir / f.name))

        # Duplicate files only when the class is below the target
        needed_samples = target_count - current_count
        if needed_samples <= 0:
            continue

        if not original_files:
            # Nothing to duplicate; random.choice would raise IndexError.
            print(f"Class {class_name}: no PNG files, skipping duplication")
            continue

        print(f"Class {class_name}: duplicating {needed_samples} samples")

        for i in range(needed_samples):
            source_file = random.choice(original_files)
            # The loop index keeps duplicate names unique within the class
            new_name = f"{source_file.stem}_dup{i}.png"
            shutil.copy2(str(source_file), str(output_class_dir / new_name))

    print("Duplication completed!")
    return output_dir

# Uncomment baris di bawah jika ingin menggunakan metode duplikasi sederhana
# balanced_dir = oversample_with_duplication(source_dir, output_dir)

OVERSAMPLING KELAS MINORITAS
Starting oversampling process...

Distribusi Data Original:
  Kelas 1: 133 samples
  Kelas 2: 214 samples
  Kelas 3: 112 samples
  Kelas 4: 34 samples

Target samples per class: 214

Processing class 1:
  Current: 133 samples
  Need to add: 81 samples
    Created 20/81 samples...
    Created 40/81 samples...
    Created 60/81 samples...
    Created 80/81 samples...
  ✓ Created 81 augmented samples
  Total in class: 214

Processing class 2:
  Current: 214 samples
  ✓ Sudah seimbang, tidak perlu oversampling

Processing class 3:
  Current: 112 samples
  Need to add: 102 samples
    Created 20/102 samples...
    Created 40/102 samples...
    Created 60/102 samples...
    Created 80/102 samples...
    Created 100/102 samples...
  ✓ Created 102 augmented samples
  Total in class: 214

Processing class 4:
  Current: 34 samples
  Need to add: 180 samples
    Created 20/180 samples...
    Created 40/180 samples...
    Created 60/180 samples...
    Created 80/180 sa