# Data Augmentation and Saving Records

In [2]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import keras
import tensorflow as tf
from tensorflow import data as tf_data
from tensorboard.plugins.hparams import api as hp

from sklearn.model_selection import train_test_split

from preprocess_common import *

from record_save_load import *

# Define the HyperParameters

We fix some hyperparameters, such as the input image size, on a per-model basis, since each model performs best at particular image sizes [1](https://link.springer.com/chapter/10.1007/978-3-030-86340-1_11).

We also fix the batch size and other parameters because of memory and compute constraints, and we fix the seed for reproducibility.

In [3]:
# --- Paths ---------------------------------------------------------------
PATH = "archive/"  # dataset root; prepended to every image file name

# --- tf.data tuning ------------------------------------------------------
AUTO = tf_data.AUTOTUNE  # used for num_parallel_calls and prefetch sizes
BUFFER_SIZE = 1024  # shuffle buffer size, in elements

# --- Hyperparameters -----------------------------------------------------
BATCH_SIZE = 32
VALIDATION_SIZE = 0.2  # fraction of the labelled data held out for validation
RESIZE_SIZE = (512, 512)  # first argument given to Preprocess for train/val

# --- Reproducibility -----------------------------------------------------
SEED = 44  # seeds the train/validation split and the shuffle buffers

## Load the CSVs
We load the CSV files from the original dataset, available on [the Kaggle site](https://www.kaggle.com/datasets/alessandrasala79/ai-vs-human-generated-dataset/data), for further processing.

In [5]:
# Display names for the two classes.
# NOTE(review): presumably index 0 = Human and 1 = AI, matching the integer
# labels in train.csv — confirm against the dataset description.
class_names = ['Human', 'AI']

# Read the CSVs from the dataset root configured in PATH, so the CSV location
# and the image location (PATH + file name) cannot drift apart.
train_df = pd.read_csv(os.path.join(PATH, "train.csv"), index_col=0)
test_df = pd.read_csv(os.path.join(PATH, "test.csv"))

# Image file names and their integer labels, row-aligned.
train_paths = train_df["file_name"].array
train_labels = train_df["label"].array

## Splitting Training and Validation Data

We then split the data into training and validation sets, stratified by class label, so that both sets keep the same class balance.

Once the data is split, we use TensorFlow's `tf.data` pipeline to apply our data augmentation (i.e. flipping, rotating, color jitter) and resizing in parallel. We set the seed wherever we can to ensure some level of reproducibility. However, because of the way CutMix works, seeding it always produced the same cut, so we leave CutMix unseeded; as a result it gives different cuts for the same images on each run. Instead, we provide the augmented dataset so that our results can be reproduced and so that each model's performance can be compared individually.

### Explaining the code

Inside `create_datasets()` we duplicate our dataset in order to apply CutMix. We then apply `resize_augment_image`, defined in `preprocess_common.py`, which resizes and crops the images for each model, since each model excels at a certain input size [1](https://link.springer.com/chapter/10.1007/978-3-030-86340-1_11). We apply our data augmentation only once, which reduces computation while still enhancing model invariance and equivariance. We then apply color jitter to improve model robustness to different types of AI images with different color preferences.

We found that the models we are using, such as EfficientNet and ResNet, have their own built-in preprocessing functions for scaling (i.e. to [0,1] or [-1,1] instead of [0,255]) and normalizing the data, so we refrain from applying these steps ourselves.

In [None]:
# Always start from the untouched columns of `train_df` instead of reading the
# `train_paths` / `train_labels` globals back in: this cell rebinds those
# names, so on a re-run they would already hold the 80% split and one-hot
# labels, and the data would silently be split a second time.
all_paths = train_df["file_name"].array
all_labels = train_df["label"].array

# Split into training and validation sets, stratified by label so both sets
# keep the same class balance; SEED makes the split repeatable.
(train_paths, val_paths,
 train_labels, val_labels) = train_test_split(all_paths,
                 all_labels,
                 test_size=VALIDATION_SIZE,
                 stratify=all_labels,
                 random_state=SEED)

# Every row must land in exactly one of the two sets.
assert len(train_paths) + len(val_paths) == len(all_paths)

# One-hot encode the integer labels over the 2 classes.
train_labels = keras.ops.one_hot(train_labels, 2)
val_labels = keras.ops.one_hot(val_labels, 2)

def create_datasets(train_paths, train_labels, val_paths, val_labels, image_size):
    """Build the paired training dataset and the batched validation dataset.

    Args:
        train_paths (list): file names of the training images (PATH is
            prepended to each before loading).
        train_labels (list): labels aligned with ``train_paths``.
        val_paths (list): file names of the validation images (PATH is
            prepended to each before loading).
        val_labels (list): labels aligned with ``val_paths``.
        image_size (tuple): size to crop the images to; forwarded to
            ``Preprocess`` together with ``RESIZE_SIZE``.

    Returns:
        tuple: ``(train_ds, val_ds)``.

        ``train_ds`` is unbatched. Each element is a pair
        ``((image_a, label_a), (image_b, label_b))`` obtained by zipping two
        differently-seeded shuffled, augmented passes over the training data,
        which is the input layout expected by the CutMix step.

        ``val_ds`` yields ``(image, label)`` batches of ``BATCH_SIZE`` in the
        original order; images go through ``resize_augment_image`` with its
        default arguments.
    """
    preprocess = Preprocess(RESIZE_SIZE, image_size)

    def _augmented_pass(shuffle_seed):
        """Return one shuffled, augmented, unbatched pass over the training data."""
        return (
            tf_data.Dataset.from_tensor_slices((train_paths, train_labels))
            .shuffle(BUFFER_SIZE, seed=shuffle_seed)
            .map(
                lambda filename, label: (
                    preprocess.resize_augment_image(PATH+filename, augment=True, c_jitter=True),
                    label,
                ),
                num_parallel_calls=AUTO,
                deterministic=True,
            )
        )

    # The two passes use different shuffle seeds so that zipping them pairs
    # each image with a different partner for CutMix.
    train_ds = tf_data.Dataset.zip((_augmented_pass(SEED * 3), _augmented_pass(SEED * 2)))

    # Validation data is neither shuffled nor passed the augmentation flags.
    val_ds = (
        tf_data.Dataset.from_tensor_slices((val_paths, val_labels))
        .map(
            lambda filename, label: (preprocess.resize_augment_image(PATH+filename), label),
            num_parallel_calls=AUTO,
            deterministic=True,
        )
        .batch(BATCH_SIZE, num_parallel_calls=AUTO, deterministic=True)
        .prefetch(AUTO)
    )
    return train_ds, val_ds


2025-03-18 19:53:19.931759: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-18 19:53:21.438234: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-18 19:53:21.438282: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-18 19:53:21.439731: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-18 19:53:21.439807: I external/local_xla/xla/stream_executor

## Applying CutMix

After the initial augmentation and resizing, we apply CutMix to the two training sets to combine them into a single dataset. CutMix has been found to improve model robustness and out-of-distribution performance [2](https://arxiv.org/abs/1905.04899).

In [None]:
def create_model_dataset(img_size, ram_budget):
    """Build the CutMix training dataset and the validation dataset for one input size.

    Reads the module-level ``train_paths``, ``train_labels``, ``val_paths``
    and ``val_labels`` produced by the train/validation split.

    Args:
        img_size (tuple): size to crop the images to. Forwarded to
            ``create_datasets``; its first entry is also given to ``Mix``.
        ram_budget (int): value assigned to the tf.data autotune RAM budget
            of the training dataset.

    Returns:
        tuple: ``(train_ds_cm, val_ds)`` — the shuffled, CutMix-ed, batched
        and prefetched training dataset with the autotune options attached,
        and the validation dataset exactly as returned by ``create_datasets``.
    """
    # Autotune settings attached to the finished training pipeline below.
    autotune_options = tf_data.Options()
    autotune_options.autotune.enabled = True
    autotune_options.autotune.ram_budget = ram_budget

    cutmix_op = Mix(img_size=img_size[0])
    paired_train_ds, val_ds = create_datasets(
        train_paths, train_labels, val_paths, val_labels, img_size
    )

    # The shuffle is left unseeded here, unlike the seeded shuffles upstream.
    shuffled_pairs = paired_train_ds.shuffle(BUFFER_SIZE)
    mixed = shuffled_pairs.map(cutmix_op.cutmix, num_parallel_calls=AUTO)
    batched = mixed.batch(BATCH_SIZE, num_parallel_calls=AUTO)
    train_ds_cm = batched.prefetch(AUTO).with_options(autotune_options)

    return train_ds_cm, val_ds


## Saving to TFRecord

Here we save our processed data as TFRecord files so that we have a consistent source of training data. For convenience, we provide the augmented data [here](https://drive.google.com/file/d/16KvdZW_1Rn5zdopQtbNfej2vXxkhn1i0/view?usp=drive_link).

In [8]:
# Crop size used for each model's records.
models = {
    "resnet": (224,224),
    "efficientnet": (380,380),
    "swin_transformer": (256,256)
}
# exist_ok=True replaces the check-then-create pair: no race between the
# check and the creation, and re-running the cell is safe.
os.makedirs("./records", exist_ok=True)
for model, img_size in models.items():
    # NOTE(review): the budget is height * width * width * BATCH_SIZE. The
    # repeated width factor looks unintentional (channels or bytes per pixel
    # may have been meant) — value kept unchanged, confirm before altering.
    ram_budget = img_size[0] * img_size[1] * img_size[1] * BATCH_SIZE
    model_train_ds, val_ds = create_model_dataset(img_size, ram_budget)
    save_to_tfrecord(model_train_ds, f"records/{model}_train.tfrecord")
    save_to_tfrecord(val_ds, f"records/{model}_val.tfrecord")

2025-03-18 19:59:00.975532: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-03-18 19:59:44.873413: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-03-18 20:06:57.929927: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Saving Test Data to Record

Here we resize the test data so that we can evaluate our trained and optimized models later. We also provide the resulting records here: [ResNet](https://drive.google.com/file/d/1FD4bQNdrjlFbQ5hj0PW3gE34FTNYpDaQ/view?usp=sharing), [Swin](https://drive.google.com/file/d/1f-L6LkVVF34c8h0VdIEXsX8wV5H-UK7B/view?usp=sharing), [EffNet](https://drive.google.com/file/d/1rF2CbnYUJctaalbUcUgUVyTtO5yTutk_/view?usp=sharing).

In [6]:
# Crop size used for each model's records.
models = {
    "resnet": (224,224),
    "efficientnet": (380,380),
    "swin_transformer": (256,256)
}
test_paths = test_df["id"].array

# Ensure the output directory exists so this cell does not depend on the
# train/validation export cell having been run first.
os.makedirs("records", exist_ok=True)

for model, img_size in models.items():
    # NOTE(review): train/val use Preprocess(RESIZE_SIZE, image_size), whereas
    # the test data passes the model size for both arguments — confirm this
    # difference in preprocessing is intentional.
    preprocess = Preprocess(img_size, img_size)
    # No shuffling and deterministic map/batch: the records keep the row
    # order of test.csv. Augmentation and color jitter are switched off.
    test_ds = (
        tf.data.Dataset.from_tensor_slices(test_paths)
        .map(lambda filename: (preprocess.resize_augment_image(PATH+filename, augment=False, c_jitter=False)), num_parallel_calls=AUTO, deterministic=True)
        .batch(BATCH_SIZE, num_parallel_calls=AUTO, deterministic=True)
        .prefetch(AUTO)
    )
    save_test_tfrecord(test_ds, f"records/{model}_test.tfrecord")


2025-03-23 22:04:03.153745: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-23 22:04:04.709815: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-23 22:04:04.709934: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-23 22:04:04.713260: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-23 22:04:04.713335: I external/local_xla/xla/stream_executor