In [9]:
import tensorflow as tf
import pathlib
from tensorflow.keras.applications.resnet50 import preprocess_input
import shutil
from sklearn.model_selection import train_test_split

In [10]:
#path dataset dan parameter
base_dir = pathlib.Path(".") 
data_dir = base_dir / 'data valid merah'
data_split_dir = base_dir / 'data_split_merah'
img_height = 224
img_width = 224
batch_size = 32
seed = 42
autotune = tf.data.AUTOTUNE

# Script pembagian data: 70% train, 20% validasi, 10% test
print("Step 1: Splitting data...")
def split_data_folder(source_dir, dest_dir, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1):
    source_dir = pathlib.Path(source_dir)
    dest_dir = pathlib.Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for class_folder in source_dir.iterdir():
        if not class_folder.is_dir():
            continue
        files = list(class_folder.glob('*.png'))
        if len(files) == 0:
            continue

        train_files, temp_files = train_test_split(files, train_size=train_ratio, random_state=42)
        val_size = val_ratio / (val_ratio + test_ratio)
        val_files, test_files = train_test_split(temp_files, train_size=val_size, random_state=42)

        for split_name, split_files in zip(['train', 'val', 'test'], [train_files, val_files, test_files]):
            split_class_dir = dest_dir / split_name / class_folder.name
            split_class_dir.mkdir(parents=True, exist_ok=True)
            for f in split_files:
                shutil.copy(str(f), str(split_class_dir / f.name))

# Contoh penggunaan:
split_data_folder(
    # source_dir=r'C:\Users\PC\Documents\innar\skripsi\data valid merah',
    source_dir=r'D:\SKRIPSI\skripsi\data valid merah',
    dest_dir=data_split_dir,
    train_ratio=0.8, val_ratio=0.2, test_ratio=0.1
)

Step 1: Splitting data...


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\SKRIPSI\\skripsi\\data valid merah'

In [5]:
print("Loading datasets...")
# Load train dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_split_dir / 'train',
    labels="inferred",
    label_mode="int",
    color_mode="rgb",  # RGB untuk ResNet-50
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=True,
    seed=seed
)

# Load validation dataset
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_split_dir / 'val',
    labels="inferred",
    label_mode="int", 
    color_mode="rgb",
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=False,
    seed=seed
)

# Load test dataset
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_split_dir / 'test',
    labels="inferred",
    label_mode="int",
    color_mode="rgb", 
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=False,
    seed=seed
)

class_names = train_ds.class_names
print(f"\nClasses found: {class_names}")

# Convert class names to PPB values for regression
class_to_ppb = {i: float(name) for i, name in enumerate(class_names)}
print(f"Class to PPB mapping: {class_to_ppb}")

Loading datasets...
Found 393 files belonging to 4 classes.
Found 116 files belonging to 4 classes.
Found 69 files belonging to 4 classes.

Classes found: ['1', '2', '3', '4']
Class to PPB mapping: {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0}


Step2: melabelkan masing" file menjadi nilai PPB menjadi groundtruth yang akan menjadi perbandingan utama untuk hasil prediksi model

In [6]:
print("Step 2: Converting labels to PPB values...")

# Function to convert integer labels to float PPB values
def convert_labels_to_ppb(images, labels):
    ppb_labels = tf.cast(labels, tf.float32)
    for class_idx, ppb_val in class_to_ppb.items():
        ppb_labels = tf.where(labels == class_idx, ppb_val, ppb_labels)
    # Ubah dimensi label menjadi (batch_size, 1)
    # Memastikan label memiliki dimensi yang benar untuk regresi
    ppb_labels = tf.expand_dims(ppb_labels, axis = -1) 
    return images, ppb_labels

# Apply label conversion
train_ds = train_ds.map(convert_labels_to_ppb, num_parallel_calls=autotune)
val_ds = val_ds.map(convert_labels_to_ppb, num_parallel_calls=autotune)
test_ds = test_ds.map(convert_labels_to_ppb, num_parallel_calls=autotune)

Step 2: Converting labels to PPB values...


In [7]:
print("Step 3: Setting up data augmentation...")

# Data augmentation using Keras layers (TensorFlow 2.x compatible)
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.5),  # 0.1 radians â‰ˆ 5.7 degrees
    # tf.keras.layers.RandomBrightness(0.1),
    # tf.keras.layers.RandomContrast(0.1)
], name="data_augmentation")

# Preprocessing function with Keras augmentation
def preprocess_with_augmentation(images, labels, training=False):
    # Normalize images to [0, 1] range
    images = tf.cast(images, tf.float32) / 255.0
    
    if training:
        # Apply augmentation (hanya untuk training)
        images = data_augmentation(images, training=True)
    
    # Normalisasi imageNet wajib, fungsi ini mengambil input RGB dan normalisasi ke range yg digunakan ResNet
    images = preprocess_input(images)
    
    return images, labels

# Apply preprocessing
train_ds = train_ds.map(lambda x, y: preprocess_with_augmentation(x, y, training=True), 
                       num_parallel_calls=autotune)
val_ds = val_ds.map(lambda x, y: preprocess_with_augmentation(x, y, training=False), 
                   num_parallel_calls=autotune)
test_ds = test_ds.map(lambda x, y: preprocess_with_augmentation(x, y, training=False), 
                     num_parallel_calls=autotune)


Step 3: Setting up data augmentation...


Step4: mempercepat proses pelatihan model dan mencegah  bottleneck; agar model tidak perlu menunggu krn batch data berikutnya sudah siap dan emnuggu di memori (RAM atau cache disk)

In [8]:
print("Step 4: Optimizing data pipeline...")

# Optimize performance with caching and prefetching
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=autotune)
val_ds = val_ds.cache().prefetch(buffer_size=autotune)
test_ds = test_ds.cache().prefetch(buffer_size=autotune)

print("Data preprocessing completed!")

Step 4: Optimizing data pipeline...
Data preprocessing completed!
