# Data Augmentation and Saving Records

In [2]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import keras
import tensorflow as tf
from tensorflow import data as tf_data
from tensorboard.plugins.hparams import api as hp

from sklearn.model_selection import train_test_split

from preprocess_common import *

from record_save_load import *

2025-04-03 20:55:30.137775: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Define the HyperParameters

We fix some hyperparameters, such as IMG_SIZE, at a per-model level, since each model excels at certain image sizes [1](https://link.springer.com/chapter/10.1007/978-3-030-86340-1_11).

We also fix the batch size and other parameters because of memory and compute constraints, and we fix the seed for reproducibility.

In [3]:
# Dataset root directory; file names read from the CSVs are appended to it.
PATH = "archive/"

# Fixed seed used (directly or as a multiple) by the splits and the shuffle.
SEED = 44

# Let tf.data choose the level of parallelism for data loading.
AUTO = tf_data.AUTOTUNE

# Hyperparameters
BATCH_SIZE = 32           # samples per batch written to the records
BUFFER_SIZE = 1024        # shuffle buffer size for the training pipeline
VALIDATION_SIZE = 0.2     # fraction of the retained data held out for validation
RESIZE_SIZE = (512, 512)  # first argument to Preprocess in the test pipeline

## Load the CSVs
We load the CSV files of the original dataset, available on [the Kaggle site](https://www.kaggle.com/datasets/alessandrasala79/ai-vs-human-generated-dataset/data), for further processing.

In [4]:
# Display names for the two classes.
# NOTE(review): presumably indexed by the integer label (0 -> Human, 1 -> AI) — confirm against the dataset.
class_names = ['Human', 'AI']

# Build the CSV locations from PATH so the dataset directory is defined in one place only.
train_df = pd.read_csv(os.path.join(PATH, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

# Use plain NumPy arrays (not pandas extension arrays) since these are handed to
# scikit-learn and tf.data further down.
train_paths = train_df["file_name"].to_numpy()
train_labels = train_df["label"].to_numpy()

## Splitting Training and Validation Data

We then split the data into training and validation sets, stratified by class label, to ensure balanced classes in both datasets.

Once the data is split, we use TensorFlow's `tf.data` pipeline to apply our data augmentation (i.e., flipping, rotating, color jitter) and resizing in a parallelized manner. We also set the seed to ensure reproducibility. We do not apply CutMix as in our draft and proposal: because the test set includes data generated by many models, CutMix could help learn more localizable features, but it did not help learn the many other features prominent in the test set.

### Explaining the code

We apply `resize_augment_image`, defined in `preprocess_common.py`, which applies resizing, crops, flips, Gaussian blur, and color jitter to enhance model robustness, as the test dataset is vastly different from the training set. We crop each image to a model-specific size, as each model excels at a certain input size [1](https://link.springer.com/chapter/10.1007/978-3-030-86340-1_11). Note that the cell below calls it with `augment=False` and `c_jitter=False` for the training set, which produces the un-augmented (`no_aug`) records.

We found that the models we use, such as EfficientNet and ResNet, have their own built-in preprocessing functions for scaling (i.e., to [0,1] or [-1,1] instead of [0,255]) and normalizing the data, so we refrain from applying these ourselves.

In [5]:
# Fraction of the labelled data that is set aside and never used; only the remaining
# part is split into training and validation sets.
# NOTE(review): presumably chosen to fit the memory/compute budget — confirm.
DISCARD_FRACTION = 0.6

# Always start from the untouched DataFrame columns. This keeps the cell idempotent:
# re-running it reproduces the same split instead of re-splitting the already reduced,
# one-hot encoded arrays left behind by a previous execution.
all_paths = train_df["file_name"].to_numpy()
all_labels = train_df["label"].to_numpy()

# Stratified subsample: keep (1 - DISCARD_FRACTION) of the data, balanced by label.
subset_paths, _, subset_labels, _ = train_test_split(
    all_paths,
    all_labels,
    test_size=DISCARD_FRACTION,
    stratify=all_labels,
    random_state=SEED * 2,
)

# Split the retained subset into training and validation sets, balanced by label.
train_paths, val_paths, train_label_ids, val_label_ids = train_test_split(
    subset_paths,
    subset_labels,
    test_size=VALIDATION_SIZE,
    stratify=subset_labels,
    random_state=SEED,
)

# One-hot encode the integer labels over the two classes.
train_labels = keras.ops.one_hot(train_label_ids, 2)
val_labels = keras.ops.one_hot(val_label_ids, 2)

def create_datasets(train_paths, train_labels, val_paths, val_labels, image_size):
    """Build the batched training and validation pipelines for one image size.

    Every file name is prefixed with ``PATH`` and passed through
    ``Preprocess.resize_augment_image`` with augmentation and color jitter switched
    off for both splits. Only the training pipeline is shuffled.

    Args:
        train_paths (list): list of paths to training images
        train_labels (list): list of labels for training images
        val_paths (list): list of paths to validation images
        val_labels (list): list of labels for validation images
        image_size (tuple): size handed to ``Preprocess`` as both of its arguments
    Returns:
        (tuple): ``(train_ds, val_ds)``, two ``tf.data.Dataset`` objects yielding
        ``(preprocessed image, label)`` pairs in batches of ``BATCH_SIZE``.
    """
    # NOTE(review): the test pipeline constructs Preprocess(RESIZE_SIZE, model_size),
    # whereas both arguments are image_size here — confirm the difference is intended.
    preprocess = Preprocess(image_size, image_size)

    def load_image(filename, label):
        # Augmentation is disabled explicitly so that training, validation and test
        # data are all produced with the same settings, independent of the defaults
        # of resize_augment_image.
        image = preprocess.resize_augment_image(PATH + filename, augment=False, c_jitter=False)
        return image, label

    # Shuffle the (path, label) pairs before the image is loaded, then batch.
    train_ds = (
        tf_data.Dataset.from_tensor_slices((train_paths, train_labels))
        .shuffle(BUFFER_SIZE, seed=SEED * 3)
        .map(load_image, num_parallel_calls=AUTO, deterministic=True)
        .batch(BATCH_SIZE, num_parallel_calls=AUTO, deterministic=True)
        .prefetch(AUTO)
    )
    # Validation data keeps its original order.
    val_ds = (
        tf_data.Dataset.from_tensor_slices((val_paths, val_labels))
        .map(load_image, num_parallel_calls=AUTO, deterministic=True)
        .batch(BATCH_SIZE, num_parallel_calls=AUTO, deterministic=True)
        .prefetch(AUTO)
    )
    return train_ds, val_ds


2025-04-03 20:55:38.841951: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-04-03 20:55:40.697231: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-04-03 20:55:40.697285: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-04-03 20:55:40.698470: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-04-03 20:55:40.698560: I external/local_xla/xla/stream_executor

In [6]:
def create_model_dataset(img_size, ram_budget):
    """Create the training and validation datasets for one model input size.

    Builds both pipelines from the notebook-level ``train_paths``, ``train_labels``,
    ``val_paths`` and ``val_labels`` via ``create_datasets`` and enables tf.data
    autotuning with the given RAM budget on the training pipeline only.

    Args:
        img_size (tuple): size to crop the images to
        ram_budget (int): RAM budget for autotuning (tf.data interprets it in bytes)
    Returns:
        (tuple): ``(train_ds, val_ds)`` as returned by ``create_datasets``, with the
        autotune options attached to ``train_ds``.
    """
    train_ds, val_ds = create_datasets(train_paths, train_labels, val_paths, val_labels, img_size)

    # Cap the memory tf.data autotuning may use for the training pipeline.
    options = tf_data.Options()
    options.autotune.enabled = True
    options.autotune.ram_budget = ram_budget
    train_ds = train_ds.with_options(options)

    return train_ds, val_ds


## Saving to TFRecord

Here we save our processed data into TensorFlow Records (TFRecord) so we have a consistent source of training data. For convenience, we provide the augmented data [here](https://drive.google.com/file/d/16KvdZW_1Rn5zdopQtbNfej2vXxkhn1i0/view?usp=drive_link).

In [7]:
# Model name -> image size the records are generated for.
models = {
    "resnet": (224,224),
    "efficientnet": (380,380),
    "swin_transformer": (256,256)
}

# Create the output directory; a no-op when it already exists.
os.makedirs("./records", exist_ok=True)

for model, img_size in models.items():
    # NOTE(review): the budget multiplies the second dimension twice
    # (size[0] * size[1] * size[1] * BATCH_SIZE). Kept as in the original run —
    # confirm whether a channel count / bytes-per-value factor was intended instead.
    ram_budget = img_size[0] * img_size[1] * img_size[1] * BATCH_SIZE
    model_train_ds, val_ds = create_model_dataset(img_size, ram_budget)
    save_to_tfrecord(model_train_ds, f"records/{model}_train_no_aug.tfrecord")
    save_to_tfrecord(val_ds, f"records/{model}_val_no_aug.tfrecord")

2025-04-03 20:57:05.606170: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-04-03 20:57:22.977548: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-04-03 20:59:11.437001: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Saving Test Data to Record

Here we resize the testing data so that we can evaluate our trained and optimized models later. We also provide these records here: [ResNet](https://drive.google.com/file/d/1FD4bQNdrjlFbQ5hj0PW3gE34FTNYpDaQ/view?usp=sharing), [Swin](https://drive.google.com/file/d/1f-L6LkVVF34c8h0VdIEXsX8wV5H-UK7B/view?usp=sharing), [EffNet](https://drive.google.com/file/d/1rF2CbnYUJctaalbUcUgUVyTtO5yTutk_/view?usp=sharing).

In [None]:
# Model name -> image size the records are generated for.
models = {
    "resnet": (224,224),
    "efficientnet": (380,380),
    "swin_transformer": (256,256)
}

# Make sure the output directory exists so this cell also works when it is run
# without the train/validation cell having created it first.
os.makedirs("records", exist_ok=True)

# Plain NumPy array of the values in the "id" column, which are appended to PATH below.
test_paths = test_df["id"].to_numpy()

for model, img_size in models.items():
    # NOTE(review): the train/validation pipeline constructs Preprocess(size, size),
    # whereas RESIZE_SIZE is the first argument here — confirm the difference is intended.
    preprocess = Preprocess(RESIZE_SIZE, img_size)
    # Batches the test images in their original order (no shuffling, no augmentation).
    test_ds = (
        tf.data.Dataset.from_tensor_slices(test_paths)
        .map(lambda filename: (preprocess.resize_augment_image(PATH+filename, augment=False, c_jitter=False)), num_parallel_calls=AUTO, deterministic=True)
        .batch(BATCH_SIZE, num_parallel_calls=AUTO, deterministic=True)
        .prefetch(AUTO)
    )
    save_test_tfrecord(test_ds, f"records/{model}_test.tfrecord")


2025-03-28 19:33:00.146539: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-28 19:33:02.226395: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-28 19:33:02.226453: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-28 19:33:02.228405: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:906] could not open file to read NUMA node: /sys/bus/pci/devices/0000:00:00.3/numa_node
Your kernel may have been built without NUMA support.
2025-03-28 19:33:02.228510: I external/local_xla/xla/stream_executor