# Evaluate NN performance over training sizes
Simple dev notebook to evaluate the performance of a neural network over different training sizes. The goal is to understand how the model's performance changes as we increase the amount of training data.

In [1]:
# Imports
import os
from collections import defaultdict
import numpy as np
import h5py
import matplotlib.pyplot as plt
import datetime
import argparse

import keras_core as keras

import tensorflow_io as tfio
from tensorflow.data import Dataset
from tensorflow.data.experimental import AUTOTUNE

# Seed for the one-time shuffle of the full dataset that fixes the train/test split.
SPLIT_SEED = 36

2025-05-06 13:51:33.226804: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-06 13:51:33.291359: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-06 13:51:33.294165: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


In [2]:
# Hyperparameters for a single run
# -- data selection --
polarity = 'neg'            # sub-directory of the data path the HDF5 file is read from
data_version = 'd1'         # key used to look up the data-sampling seed
train_size_fraction = 0.1   # fraction of the training pool used for this run
bootstrap = 'b1'            # 'b1' = True, 'b0' = False
# -- model --
model_version = 'init0'     # appears in the save/log name in the cells shown here

In [3]:
data_path = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/2023_07_01'
data_file = f'{data_path}/{polarity}/model_collection_1AU_90deg_0deg_fixed_training.h5'

# 8 input parameters for the NN: alpha, cmf, vspoles, cpa, pwr1par, pwr2par, pwr1perr, and pwr2perr.
# features = ['alpha', 'cmf', 'cpa', 'pwr1par', 'pwr1perr', 'pwr2par', 'pwr2perr', 'vspoles']
# Read only the array shapes here; the paired datasets are built with tfio below.
with h5py.File(data_file, 'r') as h5:
    num_samples, num_inputs = h5['X_minmax'].shape
    _, num_flux = h5['Y_log_scaled'].shape
x = tfio.IODataset.from_hdf5(data_file, dataset='/X_minmax')
y = tfio.IODataset.from_hdf5(data_file, dataset='/Y_log_scaled')

# Split
full = Dataset.zip((x, y)).shuffle(
    buffer_size=num_samples, seed=SPLIT_SEED, reshuffle_each_iteration=False
) # Shuffle the dataset. Important to use the same seed and buffer_size

# Kept as a plain int: this value is an element count (take/skip here, and the
# shuffle buffer size / sampling bound in the sampling cell), so it must not be
# a float. int() truncates, which equals floor for a non-negative size.
train_cardinality = int(num_samples * .9)
full_train = full.take(train_cardinality) # Keep train set we sample from consistent as 90% of the data
test = full.skip(train_cardinality) # Keep test set consistent as 10% of the data

# Get number of training samples (from the dataset)
train_size = int(train_cardinality * train_size_fraction)
print(f'Train size: {train_size} = {train_size_fraction} * {train_cardinality}')

# Choose seed based on data version.
# An unknown data_version yields data_seed = None.
data_seeds = {
    'd1': 42,
    'd2': 87,
    'd3': 5,
    'd4': 98,
}
data_seed = data_seeds.get(data_version, None)

2025-05-06 13:52:56.866691: W tensorflow_io/core/kernels/audio_video_mp3_kernels.cc:271] libmp3lame.so.0 or lame functions are not available
2025-05-06 13:52:56.867033: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX AVX2 AVX512F FMA
2025-05-06 13:52:57.056702: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Train size: 178889 = 0.1 * 1788892.0


In [None]:
# Integer sizes of the two splits. int() keeps this cell correct even if
# train_cardinality was stored as a float upstream.
train_pool_size = int(train_cardinality)
test_size = num_samples - train_pool_size

if train_size < 1:
    # A zero-sized subset would otherwise surface later as a zero batch size.
    raise ValueError(
        f'train_size must be at least 1, got {train_size} '
        f'(train_size_fraction={train_size_fraction})'
    )

if bootstrap == 'b1':
    print(f"Using bootstrap sampling (with replacement) for data version {data_version} and seed {data_seed}")

    # Load the training pool into memory once and stack it into two arrays so
    # that sampling is a single fancy-indexing operation.
    train_list = list(full_train.as_numpy_iterator())
    x_pool = np.stack([features for features, _ in train_list])
    y_pool = np.stack([targets for _, targets in train_list])

    # Reproducible bootstrap indices, bounded by the number of rows actually loaded.
    # NOTE: if data_seed is None (unknown data_version), default_rng seeds itself
    # from fresh entropy and the sample is not reproducible.
    rng = np.random.default_rng(data_seed)
    sampled_indices = rng.integers(low=0, high=len(train_list), size=train_size)

    # Sample with replacement
    x_bootstrap = x_pool[sampled_indices]
    y_bootstrap = y_pool[sampled_indices]

    # Convert back to tf.data.Dataset
    train = Dataset.from_tensor_slices((x_bootstrap, y_bootstrap))

else:
    print(f"Using traditional sampling (without replacement) for data version {data_version} and seed {data_seed}")

    # Shuffle deterministically
    if data_version in data_seeds:
        train_shuffled = full_train.shuffle(
            buffer_size=train_pool_size, seed=data_seed, reshuffle_each_iteration=False
        )
    else:
        train_shuffled = full_train

    # Take subset without replacement
    train = train_shuffled.take(train_size)

# Set batch_size to 128 unless the train size is smaller than 128, then set it to the train size.
batch_size = min(train_size, 128)
print(f'Setting batch size: {batch_size}')

# drop_remainder=True means each split yields exactly size // batch_size batches.
train = train.batch(batch_size, drop_remainder=True).prefetch(AUTOTUNE)
test = test.batch(batch_size, drop_remainder=True).prefetch(AUTOTUNE)

# Some calcs: whole batches per split, using the real test-split size.
steps_per_epoch = train_size // batch_size
validation_steps = test_size // batch_size
print(f'Steps per epoch: {steps_per_epoch}, validation steps: {validation_steps}')

Using bootstrap sampling (with replacement) for data version d1 and seed 42


2025-05-06 13:53:26.248651: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 1 of 1987658
2025-05-06 13:53:26.248717: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 2 of 1987658
2025-05-06 13:53:30.643172: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 1025 of 1987658
2025-05-06 13:53:40.902952: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 7169 of 1987658
2025-05-06 13:53:47.672107: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] Filling up shuffle buffer (this may take a while): 173324 of 1987658
2025-05-06 13:53:53.800744: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] Shuffle buffer filled.


In [None]:
# Define model.
# Layer widths at the two ends come from the dataset shapes read from the HDF5
# file (num_inputs columns in X_minmax, num_flux columns in Y_log_scaled), so
# the network always matches the data it is trained on.
# The same L2 regularizer object is attached to every Dense kernel.
l2 = keras.regularizers.L2(l2=1e-2)
model = keras.Sequential(layers=[
    keras.layers.Input(shape=(num_inputs,)),
    keras.layers.Dense(256, activation='selu', kernel_regularizer=l2),
    keras.layers.Dense(256, activation='selu', kernel_regularizer=l2),
    keras.layers.Dense(num_flux, activation='linear', kernel_regularizer=l2),
])

# Create save and log directories
# The run name encodes every hyperparameter; the timestamp keeps repeated runs apart.
save_ending_name = 'regularized'
save_name = f'data_{data_version}_bootstrap_{bootstrap}_model_{model_version}_train_size_{train_size_fraction}_{polarity}'
log_dir = f'../../tensorboard_logs/{save_ending_name}/{save_name}/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'
print("\nTensorboard log dir: ", log_dir)

# Callbacks
# - halve the learning rate after 10 epochs without val_loss improvement
# - write TensorBoard logs (with weight histograms every epoch) to log_dir
callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10),
    keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
]


Tensorboard log dir:  ../../tensorboard_logs/regularized/data_d1_bootstrap_b1_model_init0_train_size_0.5_neg/20250506-124827


In [None]:
# Compile the model with Adam + MAE loss, then train it on the batched subset.
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='mae')

# Training options gathered in one place so they are easy to scan and tweak.
# NOTE(review): `train` is a tf.data.Dataset here; Keras documents `shuffle`
# as ignored for dataset inputs — confirm this is the intended behaviour.
fit_options = dict(
    epochs=50,
    steps_per_epoch=steps_per_epoch,
    validation_data=test,
    shuffle=False,
    callbacks=callbacks,
    verbose=2,
)
history = model.fit(train, **fit_options)

Epoch 1/50


In [None]:
# Score the trained model on the training subset first, then on the held-out split.
# NOTE(review): the model is compiled with loss='mae' and no metrics, so these
# values are the compiled loss; with the L2 kernel regularizers this presumably
# includes the penalty term rather than pure MAE — confirm before reporting.
train_mae, test_mae = (model.evaluate(split) for split in (train, test))

print(f'Train MAE: {train_mae}, Test MAE: {test_mae}')