In [None]:
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.preprocessing import image_dataset_from_directory
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Set random seeds for reproducibility (Python, NumPy and TensorFlow RNGs)
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Configure any GPUs that TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth to avoid allocating all GPU memory at once
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Optionally restrict to specific GPU: tf.config.set_visible_devices(gpus[0], 'GPU')
        print(f"CUDA enabled: {len(gpus)} GPU(s) available")
    except RuntimeError as e:
        # TensorFlow raises RuntimeError when memory growth is changed after
        # the runtime has already been initialized.
        print(f"GPU configuration error: {e}")
else:
    print("No GPU found. Training will use CPU.")

# Check available devices
print("\nTensorFlow version:", tf.__version__)
print("Available devices:")
for device in tf.config.list_physical_devices():
    print(f"  - {device}")

# GPU-only optimizations. They are skipped on CPU-only machines: float16
# compute is intended for GPUs with Tensor Cores and is not expected to help
# (and can hurt) on CPU, so the default float32 policy is kept there.
enabled_optimizations = []
if gpus:
    # Mixed precision (float16 compute, float32 variables)
    try:
        policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(policy)
        enabled_optimizations.append('mixed precision')
        print(f'\nMixed precision policy: {policy.name}')
    except Exception as e:
        print(f'\nMixed precision not available: {e}')

    # XLA (Accelerated Linear Algebra) JIT compilation
    try:
        tf.config.optimizer.set_jit(True)
        enabled_optimizations.append('XLA')
        print('XLA compilation enabled')
    except Exception as e:
        print(f'XLA not available: {e}')

    # Report only what was actually switched on
    optimization_summary = ' + '.join(['CUDA'] + enabled_optimizations)
    print(f'\nGPU optimizations configured ({optimization_summary})')
else:
    print('\nNo GPU detected: mixed precision and XLA left disabled')

In [None]:
# Root folder of the dataset; the train and test splits are its sub-folders
dataset_dir = "./archive/"
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ("train", "test")
)

# Report where the notebook is going to look for data
print(f"Dataset train path: {train_dir}")
print(f"Dataset test path: {test_dir}")

# Fail early, with a readable message, if either split folder is missing
required_dirs = (("Train", train_dir), ("Test", test_dir))
for name, path in required_dirs:
    assert os.path.exists(path), f"{name} directory not found: {path}"

In [None]:
# Display sample images from each class
# Class sub-folder names; show_examples() iterates over them in this order
classes = ['REAL', 'FAKE']

def show_examples(base_dir, title):
    """
    Display two randomly chosen sample images for each class in the dataset.

    The class names come from the module-level ``classes`` list; each class is
    expected to be a sub-directory of ``base_dir``. Classes whose directory is
    missing, or that hold fewer than two image files, are reported with a
    warning and their panels are left blank.

    Args:
        base_dir: Base directory containing class subdirectories
        title: Title for the plot
    """
    samples_per_class = 2
    # Extensions treated as images; anything else (e.g. .DS_Store) is ignored
    image_exts = ('.bmp', '.gif', '.jpeg', '.jpg', '.png')

    # One panel per sample; 2.5 inches per panel gives the 10x3 figure for 2 classes
    n_panels = len(classes) * samples_per_class
    fig, axes = plt.subplots(1, n_panels, figsize=(2.5 * n_panels, 3), squeeze=False)
    axes = axes[0]
    fig.suptitle(title, fontsize=12)

    # Hide every axis up front so panels that get skipped stay blank
    for ax in axes:
        ax.axis('off')

    for i, cls in enumerate(classes):
        cls_dir = os.path.join(base_dir, cls)
        if not os.path.isdir(cls_dir):
            print(f"Warning: Class directory not found: {cls_dir}")
            continue

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # seeded random.sample() below pick the same files on every run.
        image_files = sorted(
            entry for entry in os.listdir(cls_dir)
            if entry.lower().endswith(image_exts)
        )
        if len(image_files) < samples_per_class:
            print(f"Warning: Not enough files in {cls_dir}")
            continue

        sample_files = random.sample(image_files, samples_per_class)
        for j, img_name in enumerate(sample_files):
            img_path = os.path.join(cls_dir, img_name)
            try:
                img = mpimg.imread(img_path)
                ax = axes[i * samples_per_class + j]
                ax.imshow(img)
                ax.set_title(f"{cls}")
            except Exception as e:
                # Best effort: one unreadable file should not abort the preview
                print(f"Error loading image {img_path}: {e}")

    plt.tight_layout()
    plt.show()

# Preview a few images from both the train and the test split
for split_dir, split_title in (
    (train_dir, "Train Dataset Samples"),
    (test_dir, "Test Dataset Samples"),
):
    show_examples(split_dir, split_title)

In [None]:
# Image preprocessing configuration
# These are the default arguments of the dataset loader functions below.
IMG_HEIGHT = 32  # target image height passed as image_size[0]
IMG_WIDTH = 32   # target image width passed as image_size[1]
BATCH_SIZE = 64  # number of images per batch
SEED = 42        # seed used for shuffling the training data


def load_training_dataset(
    directory: str,
    img_height: int = IMG_HEIGHT,
    img_width: int = IMG_WIDTH,
    batch_size: int = BATCH_SIZE,
    seed: int = SEED,
):
    """Build the shuffled, batched training dataset from ``directory``.

    Args:
        directory: Folder passed to ``image_dataset_from_directory``.
        img_height: Target image height.
        img_width: Target image width.
        batch_size: Number of images per batch.
        seed: Seed forwarded to the loader's shuffling.

    Returns:
        The dataset returned by ``image_dataset_from_directory`` with
        ``shuffle=True`` and ``label_mode="binary"``.
    """
    loader_options = {
        "image_size": (img_height, img_width),
        "batch_size": batch_size,
        "shuffle": True,
        "seed": seed,
        "label_mode": "binary",  # two classes -> a single 0/1 label
    }
    return image_dataset_from_directory(directory, **loader_options)


# Create training dataset (shuffled and batched with the defaults above)
full_train_ds = load_training_dataset(train_dir)

# Dataset split and pipeline configuration
VAL_SPLIT = 0.1  # fraction of the training batches held out for validation
AUTOTUNE = tf.data.AUTOTUNE  # lets tf.data choose prefetch buffer sizes / parallelism at runtime


def split_train_val(dataset, val_fraction: float = VAL_SPLIT):
    """Split a batched dataset into train/validation subsets by whole batches.

    The first ``train_batches`` batches become the training subset (``take``)
    and the remaining ones the validation subset (``skip``). The validation
    batch count is ``int(total_batches * val_fraction)``, i.e. rounded down.

    NOTE(review): ``take``/``skip`` partition by position. If ``dataset``
    yields its elements in a different order on each iteration, the two
    subsets may not stay disjoint across iterations -- confirm against how
    the dataset is built and cached.

    Args:
        dataset: A ``tf.data.Dataset`` with a known, finite cardinality.
        val_fraction: Fraction of batches for validation, between 0 and 1.

    Returns:
        train_ds, val_ds, total_batches, train_batches, val_batches
        (the three counts are plain Python ints).

    Raises:
        ValueError: If ``val_fraction`` is outside [0, 1], or the dataset's
            cardinality is unknown or infinite.
    """
    if not 0.0 <= val_fraction <= 1.0:
        raise ValueError(f"val_fraction must be between 0 and 1, got {val_fraction}")

    total_batches = int(tf.data.experimental.cardinality(dataset).numpy())
    # tf.data reports unknown / infinite cardinality as negative sentinels
    if total_batches < 0:
        raise ValueError(
            "Cannot split a dataset whose cardinality is unknown or infinite"
        )

    val_batches = int(total_batches * val_fraction)
    train_batches = total_batches - val_batches

    train_ds = dataset.take(train_batches)
    val_ds = dataset.skip(train_batches)

    return train_ds, val_ds, total_batches, train_batches, val_batches


def load_test_dataset(
    directory: str,
    img_height: int = IMG_HEIGHT,
    img_width: int = IMG_WIDTH,
    batch_size: int = BATCH_SIZE,
):
    """Build the batched test dataset from ``directory`` without shuffling.

    Args:
        directory: Folder passed to ``image_dataset_from_directory``.
        img_height: Target image height.
        img_width: Target image width.
        batch_size: Number of images per batch.

    Returns:
        The dataset returned by ``image_dataset_from_directory`` with
        ``shuffle=False`` and ``label_mode="binary"``.
    """
    target_size = (img_height, img_width)
    test_dataset = image_dataset_from_directory(
        directory,
        label_mode="binary",
        shuffle=False,  # keep a fixed order for evaluation
        batch_size=batch_size,
        image_size=target_size,
    )
    return test_dataset


def optimize_dataset(dataset, shuffle: bool = False, seed: int | None = None, buffer_size: int = 1000):
    """Cache ``dataset``, optionally shuffle it, then prefetch.

    Args:
        dataset: Dataset to wrap.
        shuffle: When true, a shuffle step is inserted after the cache.
        seed: Seed forwarded to the shuffle step.
        buffer_size: Shuffle buffer size, counted in dataset elements.

    Returns:
        The cached (and possibly shuffled) dataset with prefetching applied.
    """
    cached = dataset.cache()
    pipeline = cached.shuffle(buffer_size=buffer_size, seed=seed) if shuffle else cached
    return pipeline.prefetch(buffer_size=AUTOTUNE)


def count_images(dataset):
    """Return the number of images seen in one full pass over ``dataset``.

    Every element is treated as a batch whose first item exposes ``shape``;
    the size of its leading dimension is added to the running total.
    """
    total_images = 0
    for batch in dataset:
        batch_images = batch[0]
        total_images += batch_images.shape[0]
    return total_images


# Split training data into train/validation subsets (validation share = VAL_SPLIT)
train_ds, val_ds, total_batches, train_batches, val_batches = split_train_val(full_train_ds)

# Load test data from test folder (not shuffled or modified)
test_ds = load_test_dataset(test_dir)

# Add caching and prefetching; only the training subset is shuffled
train_ds = optimize_dataset(train_ds, shuffle=True, seed=SEED)
val_ds = optimize_dataset(val_ds)
test_ds = optimize_dataset(test_ds)

# Count images in each dataset split.
# Each count is one full pass over the dataset, which also fills its cache.
train_count = count_images(train_ds)
val_count = count_images(val_ds)
test_count = count_images(test_ds)

# Print dataset summary; the percentages are derived from VAL_SPLIT so the
# labels cannot drift away from the configured split.
print("Dataset Summary:")
print(f"Total batches in train folder: {total_batches}")
print(f"Train batches ({1 - VAL_SPLIT:.0%}): {train_batches}")
print(f"Validation batches ({VAL_SPLIT:.0%}): {val_batches}")

print("\nNumber of images:")
print(f"Train images: {train_count}")
print(f"Validation images: {val_count}")
print(f"Test images: {test_count}")


In [None]:
# Data augmentation (training data only)
# Random transforms give the model a different variant of each image every epoch.
augmentation_layers = [
    layers.RandomFlip("horizontal"),  # random left-right flip
    layers.RandomRotation(0.1),       # random rotation; 0.1 is a fraction of a full turn
    layers.RandomZoom(0.1),           # random zoom in/out by up to 10%
]
data_augmentation = tf.keras.Sequential(augmentation_layers, name='data_augmentation')


def _augment_batch(images, labels):
    """Augment the images of one batch; labels pass through unchanged."""
    return data_augmentation(images, training=True), labels


# The map is added to the existing train_ds pipeline (after its cache), so the
# augmentation is recomputed on every pass instead of being stored.
# NOTE: train_ds is rebound here; re-running this cell on its own maps the
# augmentation a second time.
train_ds = train_ds.map(
    _augment_batch,
    num_parallel_calls=AUTOTUNE,  # augment batches in parallel
    deterministic=False,          # allow out-of-order results for throughput
).prefetch(buffer_size=AUTOTUNE)  # keep prefetching as the last pipeline step


In [None]:
# Visualize original vs augmented images to verify data augmentation is working correctly.
# Originals are read from full_train_ds: train_ds already has the augmentation
# mapped over it, so a batch taken from train_ds is not an un-augmented image.
for images, _ in full_train_ds.take(1):
    # Compute augmented batch once
    augmented = data_augmentation(images, training=True)

    # The 3 x 6 grid holds at most 9 (original, augmented) pairs
    num_pairs = min(9, int(images.shape[0]))

    plt.figure(figsize=(12, 6))
    for i in range(num_pairs):
        # Original image
        plt.subplot(3, 6, 2 * i + 1)
        plt.imshow(tf.cast(images[i], tf.uint8))
        plt.title("Original", fontsize=8)
        plt.axis("off")

        # Augmented image
        plt.subplot(3, 6, 2 * i + 2)
        plt.imshow(tf.cast(augmented[i], tf.uint8))
        plt.title("Augmented", fontsize=8)
        plt.axis("off")

    plt.suptitle("Original (left) vs Augmented (right) Images", fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
# Work around SSL certificate verification failures on macOS
# (needed there for the Keras weight download).
import ssl
import sys

if sys.platform == "darwin":
    # SECURITY: this disables certificate verification for HTTPS connections
    # opened through Python's standard library for the rest of the kernel
    # session, not only for the weight download. It is therefore applied only
    # on macOS, the platform this workaround was written for.
    # NOTE(review): installing the system/Python CA certificates is the safer
    # fix -- consider removing this override once that is done.
    ssl._create_default_https_context = ssl._create_unverified_context
    print("WARNING: HTTPS certificate verification is disabled for this kernel (macOS workaround)")

In [None]:
# Load ResNet50 base model with ImageNet pre-trained weights
# Weights are downloaded automatically by Keras
# The input size uses IMG_HEIGHT / IMG_WIDTH, the same constants the datasets
# are loaded with (lower-case img_height / img_width are not defined at
# module level and raised NameError here).
base_model = tf.keras.applications.ResNet50(
    include_top=False,   # drop the ImageNet classification head
    weights='imagenet',
    input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
    pooling='avg'        # global average pooling over the last feature map
)

print("ResNet50 base model loaded with weights: imagenet")

In [None]:
# Build the complete model architecture
# Make base model trainable for fine-tuning
base_model.trainable = True

# Define input layer. IMG_HEIGHT / IMG_WIDTH are the constants the datasets
# are loaded with (lower-case img_height / img_width are not defined at
# module level and raised NameError here).
inputs = tf.keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))

# NOTE(review): the images reach the base model as loaded (no call to
# tf.keras.applications.resnet50.preprocess_input). The Keras docs recommend
# that preprocessing for ImageNet weights -- confirm whether it should be added.

# Pass input through base ResNet50 model.
# training=False keeps the base model in inference mode for this call even
# though its weights are trainable.
x = base_model(inputs, training=False)

# Add batch normalization for stable training
x = layers.BatchNormalization()(x)

# Add dense layers with regularization to prevent overfitting
x = layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
x = layers.Dropout(0.4)(x)  # Dropout for regularization
x = layers.Dense(64, activation='relu')(x)

# Output layer: single neuron with sigmoid activation for binary classification
# dtype='float32' keeps the output in full precision when a mixed precision
# policy is active
outputs = layers.Dense(1, activation='sigmoid', dtype='float32')(x)

# Create the complete model
model = tf.keras.Model(inputs, outputs)

# Compile model with optimizer, loss function, and metrics
# Adam starts at a learning rate of 0.001; no schedule is attached to the
# optimizer itself.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',  # Binary cross-entropy for binary classification
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),  # Precision metric
        tf.keras.metrics.Recall()      # Recall metric
    ]
)

# Display model architecture summary
model.summary()


In [None]:
# Training with callbacks; all three callbacks watch the validation loss
monitored_metric = 'val_loss'

# Stop once the validation loss has not improved for 20 epochs and roll the
# model back to the weights of its best epoch
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor=monitored_metric,
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 5 epochs without improvement, down to 1e-7
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor=monitored_metric,
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)

# Keep the best model seen so far on disk. Despite the file name, the whole
# model is written, not only its weights (save_weights_only=False).
checkpoint_path = './models/best_model_weights.h5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor=monitored_metric,
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# Fit for at most 100 epochs; early stopping may end the run sooner
training_callbacks = [early_stop, reduce_lr, model_checkpoint]
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    callbacks=training_callbacks,
    verbose=1,
)

In [None]:
# Plot training history to visualize model performance

def plot_training_history(history):
    """Draw train/validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` dict holds the per-epoch lists
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``
            (e.g. the value returned by ``model.fit``).
    """
    curves = history.history
    epochs = range(1, len(curves["accuracy"]) + 1)

    fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # (axis, history key, axis label, legend position) for each panel
    panels = (
        (accuracy_ax, "accuracy", "Accuracy", "lower right"),
        (loss_ax, "loss", "Loss", "upper right"),
    )
    for ax, key, label, legend_loc in panels:
        ax.plot(epochs, curves[key], label="Train", marker="o")
        ax.plot(epochs, curves[f"val_{key}"], label="Validation", marker="s")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(label)
        ax.set_title(f"{label} over Epochs")
        ax.grid(True, alpha=0.3)
        ax.legend(loc=legend_loc)

    fig.tight_layout()
    plt.show()


# Draw the curves for the History object returned by model.fit()
plot_training_history(history)

In [None]:
# Evaluate model performance on test data
# evaluate() returns the loss followed by the compiled metrics, in order
test_loss, test_acc, test_prec, test_rec = model.evaluate(test_ds, verbose=1)
print("\nTest Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_rec:.4f}")

# Display names for label 0 and label 1.
# image_dataset_from_directory numbers classes in alphanumeric order of the
# folder names, so this assumes the test folder holds exactly FAKE and REAL.
class_labels = ['FAKE', 'REAL']

# Get true labels from test dataset as a flat integer vector, matching the
# shape and dtype of the predicted classes below
y_true = np.concatenate([y for _, y in test_ds], axis=0).ravel().astype("int32")

# Get predictions from model
y_pred = model.predict(test_ds, verbose=1)
# Convert probabilities to binary predictions using 0.5 threshold
y_pred_classes = (y_pred.ravel() > 0.5).astype("int32")

# Create and display confusion matrix
cm = confusion_matrix(y_true, y_pred_classes)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            cbar_kws={'label': 'Count'},
            ax=ax)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred_classes, target_names=class_labels))


In [None]:
# Persist the trained model, creating the target folder first if needed
model_save_path = "./models/resnet50_binary.h5"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
model.save(model_save_path)
print(f"Model saved successfully to: {model_save_path}")
