# EfficientNet-B0 for AI vs Real Image Detection (TensorFlow)

This notebook implements the training and evaluation pipeline using EfficientNet-B0 and TensorFlow/Keras.
- **Data Loading**: Uses the Hugging Face `datasets` library to load data from Parquet files.
- **Monitoring**: Includes system (CPU/RAM/disk) and GPU monitoring.

## 1. Imports and Setup

In [None]:
# Install the data-loading, monitoring and plotting dependencies.
# NOTE(review): tensorflow, scikit-learn and Pillow are imported in the next
# cell but are not installed here -- presumably preinstalled; confirm.
!pip install datasets pandas pyarrow psutil matplotlib

In [None]:
import os
import time
import psutil
import threading
import subprocess
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from datasets import load_dataset
from PIL import Image
import io

# Report the TensorFlow build in use and how many GPUs it can see.
print("TensorFlow version:", tf.__version__)
_visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_visible_gpus))

## 2. System Performance Monitoring
We use a background thread to log CPU, RAM, Disk, and GPU usage during training.

In [None]:
class SystemMonitor:
    """Sample CPU, RAM, GPU and disk activity on a background thread.

    Call :meth:`start` before the workload and :meth:`stop` afterwards. The
    samples are stored in ``self.history`` (one list per metric, all appended
    in lockstep) and can be drawn with :meth:`plot`.

    Args:
        interval: Seconds to wait between two consecutive samples.
    """

    def __init__(self, interval=1.0):
        self.interval = interval
        self.stop_event = threading.Event()
        self.history = {
            'timestamp': [],
            'cpu_percent': [],
            'ram_percent': [],
            'gpu_percent': [],
            'gpu_mem': [],
            'disk_read': [],
            'disk_write': []
        }
        # Daemon thread: a forgotten stop() must not keep the interpreter alive.
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)

    @staticmethod
    def _parse_gpu_output(output):
        """Aggregate ``nvidia-smi`` CSV rows into ``(utilization, memory_mb)``.

        The query prints one ``"util, mem"`` row per GPU. Utilization is
        averaged and memory is summed over all rows that parse as two
        integers; rows that do not parse are skipped.

        Args:
            output: Raw text printed by the ``nvidia-smi`` query.

        Returns:
            ``(mean_utilization_percent, total_memory_used_mb)``, or
            ``(0, 0)`` when no row could be parsed.
        """
        utils, mems = [], []
        for row in output.strip().splitlines():
            try:
                util, mem = (int(field) for field in row.split(','))
            except ValueError:
                # Non-numeric field or wrong number of columns.
                continue
            utils.append(util)
            mems.append(mem)
        if not utils:
            return 0, 0
        return sum(utils) / len(utils), sum(mems)

    def _get_gpu_metrics(self):
        """Return ``(gpu_utilization_percent, gpu_memory_used_mb)``.

        Falls back to ``(0, 0)`` when ``nvidia-smi`` is missing, fails, or
        does not answer within the timeout.
        """
        try:
            # Uses nvidia-smi to get GPU utilization and memory usage
            result = subprocess.check_output(
                ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used', '--format=csv,noheader,nounits'],
                encoding='utf-8',
                timeout=5,  # a hung nvidia-smi must not stall the sampler
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # OSError: binary missing; SubprocessError: non-zero exit or
            # timeout; ValueError: undecodable output.
            return 0, 0
        # Output example: "45, 1024" -> 45% util, 1024MB mem (one row per GPU)
        return self._parse_gpu_output(result)

    def _monitor_loop(self):
        """Append one sample per interval until ``stop_event`` is set."""
        # Initial disk counters; psutil may return None when there are no disks.
        last_disk = psutil.disk_io_counters()
        # Monotonic clock so elapsed time is immune to wall-clock adjustments.
        start_time = time.monotonic()

        while not self.stop_event.is_set():
            current_time = time.monotonic() - start_time
            cpu = psutil.cpu_percent(interval=None)
            ram = psutil.virtual_memory().percent
            gpu_util, gpu_mem = self._get_gpu_metrics()

            current_disk = psutil.disk_io_counters()
            if last_disk is not None and current_disk is not None:
                # Bytes moved since the previous sample, expressed in MB.
                disk_read = (current_disk.read_bytes - last_disk.read_bytes) / 1024 / 1024
                disk_write = (current_disk.write_bytes - last_disk.write_bytes) / 1024 / 1024
            else:
                disk_read, disk_write = 0, 0
            last_disk = current_disk

            self.history['timestamp'].append(current_time)
            self.history['cpu_percent'].append(cpu)
            self.history['ram_percent'].append(ram)
            self.history['gpu_percent'].append(gpu_util)
            self.history['gpu_mem'].append(gpu_mem)
            self.history['disk_read'].append(disk_read)
            self.history['disk_write'].append(disk_write)

            # Unlike time.sleep, this returns as soon as stop() sets the event.
            self.stop_event.wait(self.interval)

    def start(self):
        """Start sampling in the background; a no-op if already running."""
        if self.thread.is_alive():
            # A second sampler would write duplicate rows into history.
            print("System monitoring already running.")
            return
        self.stop_event.clear()
        # Threads cannot be restarted, so build a fresh one on every start.
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()
        print("System monitoring started...")

    def stop(self):
        """Stop sampling and wait for the background thread to finish.

        Safe to call when monitoring was never started.
        """
        self.stop_event.set()
        # Joining a thread that was never started raises RuntimeError.
        if self.thread.is_alive():
            self.thread.join()
        print("System monitoring stopped.")

    def plot(self):
        """Draw the recorded history as three side-by-side panels."""
        timestamps = self.history['timestamp']
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))

        # CPU & RAM
        ax1.plot(timestamps, self.history['cpu_percent'], label='CPU %')
        ax1.plot(timestamps, self.history['ram_percent'], label='RAM %')
        ax1.set_title('CPU & RAM Usage')
        ax1.set_xlabel('Time (s)')
        ax1.set_ylabel('Percentage')
        ax1.legend()
        ax1.grid(True)

        # GPU: utilization on the left axis, memory on a twin right axis
        util_lines = ax2.plot(timestamps, self.history['gpu_percent'], label='GPU %', color='red')
        ax2.set_ylabel('Utilization %', color='red')
        ax2_mem = ax2.twinx()
        mem_lines = ax2_mem.plot(timestamps, self.history['gpu_mem'], label='GPU Mem (MB)', color='orange', linestyle='--')
        ax2_mem.set_ylabel('Memory (MB)', color='orange')
        ax2.set_title('GPU Usage')
        ax2.set_xlabel('Time (s)')
        ax2.grid(True)
        # The two curves live on different axes; merge them into one legend.
        gpu_lines = util_lines + mem_lines
        ax2.legend(gpu_lines, [line.get_label() for line in gpu_lines])

        # Disk I/O
        ax3.plot(timestamps, self.history['disk_read'], label='Disk Read (MB)')
        ax3.plot(timestamps, self.history['disk_write'], label='Disk Write (MB)')
        ax3.set_title('Disk I/O (MB per interval)')
        ax3.set_xlabel('Time (s)')
        ax3.set_ylabel('MB')
        ax3.legend()
        ax3.grid(True)

        # Keep the twin-axis label from overlapping the neighbouring panel.
        fig.tight_layout()
        plt.show()

# Create the monitor instance; interval=1.0 means one sample per second
# between start() and stop().
monitor = SystemMonitor(interval=1.0)

## 3. Configuration & Hyperparameters

In [None]:
# Paths (Parquet files): one glob pattern per split, handed to load_dataset
# as `data_files` in the data-loading cell.
DATA_FILES = {
    "train": "/storage/AIGeneratedImages_Midjourney/data/train-*.parquet",
    "validation": "/storage/AIGeneratedImages_Midjourney/data/validation-*.parquet",
    "test": "/storage/AIGeneratedImages_Midjourney/data/test-*.parquet",
}

# Hyperparameters
# IMG_SIZE is used both as the PIL resize target and as the first two
# dimensions of the (IMG_SIZE[0], IMG_SIZE[1], 3) tensor/input shapes.
IMG_SIZE = (224, 224)
BATCH_SIZE = 128 # Optimized for P5000
# Learning rate passed to the Adam optimizer in build_model.
LEARNING_RATE = 0.001
# Number of epochs passed to model.fit.
NUM_EPOCHS = 5 # Reduced
# NOTE(review): SEED is not referenced anywhere in the visible code -- confirm
# whether it should seed the shuffle / global random state.
SEED = 42

## 4. Data Loading & Preprocessing
Load the Parquet files with `datasets` and wrap each split in a `tf.data.Dataset`.

In [None]:
# Read every split listed in DATA_FILES with the Hugging Face parquet loader.
print("Loading dataset from parquet...")
dataset = load_dataset("parquet", data_files=DATA_FILES)

# Show what was loaded.
print(dataset)

# Assumes each row has an 'image' column and a 'label' column.
# The image may be raw bytes, a dict with a 'bytes' entry, a PIL image, or a file path;
# process_example decodes whichever form it finds.

def process_example(example):
    """Decode one dataset row into an ``(image_array, label)`` pair.

    Args:
        example: Mapping with an ``'image'`` entry (raw bytes, a dict carrying
            ``'bytes'`` and/or ``'path'``, a PIL image, or a file-path string)
            and a ``'label'`` entry.

    Returns:
        Tuple ``(array, label)`` where ``array`` is the RGB image resized to
        ``IMG_SIZE`` as a NumPy array of shape
        ``(IMG_SIZE[0], IMG_SIZE[1], 3)`` and ``label`` is
        ``example['label']`` unchanged. If the image cannot be decoded, a
        black image is returned instead and a ``RuntimeWarning`` is emitted.
    """
    # Imported locally, as in the original cell, so the function stays
    # self-contained when it is called from the tf.data generator.
    import io
    import warnings

    from PIL import Image

    # PIL sizes are (width, height), while IMG_SIZE is used as
    # (height, width) in the TensorSpec / model input shape.
    target_size = (IMG_SIZE[1], IMG_SIZE[0])

    raw = example['image']
    try:
        if isinstance(raw, dict):
            # Presumably an HF Image-feature row; fall back to 'path' when
            # the 'bytes' entry is missing or empty.
            raw = raw.get('bytes') or raw.get('path')

        if isinstance(raw, Image.Image):
            image = raw
        elif isinstance(raw, (bytes, bytearray)):
            image = Image.open(io.BytesIO(raw))
        elif isinstance(raw, str):
            image = Image.open(raw)
        else:
            raise TypeError(f"unsupported image payload: {type(raw).__name__}")

        # Image.open is lazy: truncated or corrupt data only fails once the
        # pixels are loaded, so the conversion must stay inside the try.
        image = image.convert("RGB").resize(target_size)
    except Exception as exc:
        # Best-effort fallback kept from the original: a black image.
        # Nothing visible here filters these rows out later, so warn rather
        # than fail silently.
        warnings.warn(
            f"Could not decode image ({exc!r}); substituting a black image.",
            RuntimeWarning,
        )
        image = Image.new('RGB', target_size)

    return np.array(image), example['label']

def tf_data_generator(split_name):
    """Return a zero-argument generator function for one dataset split.

    Each call of the returned function walks ``dataset[split_name]`` from the
    start and yields ``process_example(row)`` for every row.
    """
    def generator():
        # The split is looked up on each call, not when the factory runs.
        yield from map(process_example, dataset[split_name])

    return generator

def create_tf_dataset(split_name):
    """Wrap one split in an un-batched ``tf.data.Dataset`` of (image, label).

    The declared element signature is a ``uint8`` image tensor of shape
    ``(IMG_SIZE[0], IMG_SIZE[1], 3)`` and a scalar ``int64`` label.
    """
    row_source = tf_data_generator(split_name)
    image_spec = tf.TensorSpec(shape=(IMG_SIZE[0], IMG_SIZE[1], 3), dtype=tf.uint8)
    label_spec = tf.TensorSpec(shape=(), dtype=tf.int64)
    return tf.data.Dataset.from_generator(
        row_source,
        output_signature=(image_spec, label_spec),
    )

# One un-batched (image, label) pipeline per split.
train_ds = create_tf_dataset('train')
val_ds = create_tf_dataset('validation')
test_ds = create_tf_dataset('test')

# Optimization & Batching
AUTOTUNE = tf.data.AUTOTUNE

# Only the training split is shuffled (buffer of 1000 examples). The shuffle
# is seeded with the configured SEED so the sample order is reproducible
# across runs; validation and test keep their stored order.
train_ds = train_ds.shuffle(1000, seed=SEED).batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

## 5. Model Setup: EfficientNet-B0
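A frozen, ImageNet-pretrained EfficientNet-B0 backbone is combined with a light augmentation pipeline (random horizontal flip, rotation, and zoom) and a single-unit sigmoid classification head.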

In [None]:
# Light augmentation: horizontal flip plus small random rotation and zoom.
# The input shape is declared with an explicit keras.Input instead of the
# legacy `input_shape=` layer argument.
data_augmentation = keras.Sequential(
  [
    keras.Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3)),
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.05),
    layers.RandomZoom(0.05),
  ]
)

def build_model():
    """Build and compile the binary classifier.

    The graph is: input -> ``data_augmentation`` -> EfficientNetB0 with
    ImageNet weights and no top (frozen, called with ``training=False``) ->
    global average pooling -> dropout(0.2) -> one sigmoid unit.

    Returns:
        A compiled ``tf.keras.Model`` using Adam at ``LEARNING_RATE``, binary
        cross-entropy loss, and accuracy / precision / recall metrics.
    """
    input_shape = (IMG_SIZE[0], IMG_SIZE[1], 3)

    # Explicit input tensor, fed through the shared augmentation pipeline.
    image_in = tf.keras.Input(shape=input_shape)
    augmented = data_augmentation(image_in)

    # Pretrained backbone without its classification head; weights frozen.
    backbone = tf.keras.applications.EfficientNetB0(
        include_top=False,
        weights="imagenet",
        input_shape=input_shape
    )
    backbone.trainable = False

    # Classification head on top of the backbone features.
    features = backbone(augmented, training=False)
    pooled = layers.GlobalAveragePooling2D()(features)
    dropped = layers.Dropout(0.2)(pooled)
    probability = layers.Dense(1, activation='sigmoid')(dropped)

    classifier = tf.keras.Model(image_in, probability)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
        ],
    )
    return classifier

# Build the compiled classifier and print its layer summary.
model = build_model()
model.summary()

## 6. Training

In [None]:
# Save the model to disk whenever validation accuracy reaches a new best.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="efficientnet_b0_best.keras",
    monitor='val_accuracy',
    save_best_only=True,
)

# NOTE(review): patience (5) equals NUM_EPOCHS (5 in the config cell), so
# early stopping looks unable to trigger within this run -- confirm intent.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True,
)

# Record system metrics for the whole training run; the finally clause stops
# the monitor even if fit() raises or is interrupted.
monitor.start()
try:
    history = model.fit(
        train_ds,
        epochs=NUM_EPOCHS,
        validation_data=val_ds,
        callbacks=[checkpoint_cb, early_stopping_cb],
    )
finally:
    monitor.stop()

In [None]:
# Visualize system performance: CPU/RAM, GPU and disk I/O panels built from
# the samples recorded while training ran.
monitor.plot()

## 7. Evaluation

In [None]:
# Per-epoch curves recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(8, 8))

acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Training and Validation Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Score the held-out test split. The values are read by position: index 0 is
# skipped, indices 1-3 are reported as accuracy, precision and recall (the
# order in which the metrics were passed to compile()).
print("Evaluating on Test Set...")
results = model.evaluate(test_ds)
test_accuracy, test_precision, test_recall = results[1], results[2], results[3]
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")