# Evaluate NN performance over training sizes
Simple dev notebook to evaluate the performance of a neural network over different training sizes. The goal is to understand how the model's performance changes as we increase the amount of training data.

In [6]:
# Imports
import h5py
import numpy as np
# import keras_core as keras
from tensorflow import keras
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.data import Dataset
from tensorflow.data.experimental import AUTOTUNE

import os
os.sys.path.append("/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/GalacticCosmicRays/scripts/nn_train_size_analysis")
from rtdl_num_embeddings_tf import (
    LinearEmbeddings,
    LinearReLUEmbeddings,
    PeriodicEmbeddings,
    PiecewiseLinearEmbeddings,
    compute_bins,
)

In [7]:
# Test rtdl_num_embeddings_tf
B, F = 32, 10
x = tf.random.normal((B, F))

# Linear embeddings
emb = LinearEmbeddings(n_features=F, d_embedding=32)
y = emb(x)  # (B, F, 32)

# Periodic embeddings
pemb = PeriodicEmbeddings(n_features=F, d_embedding=64, k=64, sigma=0.02, activation=True, version="B")
yp = pemb(x)  # (B, F, 64)

# Piecewise: compute quantile bins then encode
bins = compute_bins(x, n_bins=32)        # list of length F, per-feature edges
pe  = PiecewiseLinearEmbeddings(bins, d_embedding=32, activation=True, version="A")
ype = pe(x)

# Print shapes
print("Input shape:", x.shape)
print("Linear embeddings shape:", y.shape)
print("Periodic embeddings shape:", yp.shape)
print("Piecewise linear embeddings shape:", ype.shape)

Input shape: (32, 10)
Linear embeddings shape: (32, 10, 32)
Periodic embeddings shape: (32, 10, 64)
Piecewise linear embeddings shape: (32, 10, 32)


In [8]:
def load_dataset(polarity, data_version, train_size_fraction, bootstrap):
    # 8 input parameters for the NN: alpha, cmf, vspoles, cpa, pwr1par, pwr2par, pwr1perr, and pwr2perr.
    # features = ['alpha', 'cmf', 'cpa', 'pwr1par', 'pwr1perr', 'pwr2par', 'pwr2perr', 'vspoles']
    data_path = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/shuffled_may2025'
    train_file = f'{data_path}/{polarity}/train.h5'
    test_file = f'{data_path}/{polarity}/test.h5'

    # Load train data
    with h5py.File(train_file, 'r') as h5:
        num_train_samples, num_inputs,  = h5['X_minmax'].shape
        _, num_flux,  = h5['Y_log_scaled'].shape
    x_train = tfio.IODataset.from_hdf5(train_file, dataset='/X_minmax')
    y_train = tfio.IODataset.from_hdf5(train_file, dataset='/Y_log_scaled')
    full_train = Dataset.zip((x_train, y_train))

    # Load test data
    with h5py.File(test_file, 'r') as h5:
        num_test_samples, num_inputs,  = h5['X_minmax'].shape
        _, num_flux,  = h5['Y_log_scaled'].shape
    x_test = tfio.IODataset.from_hdf5(test_file, dataset='/X_minmax')
    y_test = tfio.IODataset.from_hdf5(test_file, dataset='/Y_log_scaled')
    test = Dataset.zip((x_test, y_test))

    # Get number of training samples (from the dataset)
    train_size = int(np.floor(num_train_samples * train_size_fraction))
    print(f"Number of training samples: {train_size} out of {num_train_samples} total")
    print(f"Number of test samples: {num_test_samples}")

    # Choose seed based on model version
    data_seeds = {
        'd1': 42,
        'd2': 87,
        'd3': 5,
        'd4': 98,
        'd5': 123,
    }
    data_seed = data_seeds.get(data_version, None)

    if bootstrap == 'b1':
        # Reproducible bootstrap indices
        rng = np.random.default_rng(data_seed)
        sampled_indices = rng.integers(low=0, high=num_train_samples, size=train_size)

        # Load dataset into memory
        train_list = list(full_train.as_numpy_iterator())

        # Sample with replacement
        bootstrapped_data = [train_list[i] for i in sampled_indices]

        # Separate into inputs and outputs
        x_bootstrap, y_bootstrap = zip(*bootstrapped_data)

        # Convert back to tf.data.Dataset
        train = Dataset.from_tensor_slices((list(x_bootstrap), list(y_bootstrap)))

    else:
        # Shuffle deterministically
        if data_version in data_seeds:
            train_shuffled = full_train.shuffle(
                buffer_size=num_train_samples, seed=data_seed, reshuffle_each_iteration=False
            )
        else:
            train_shuffled = full_train

        # Take subset without replacement
        train = train_shuffled.take(train_size)

    # Set batch_size to 128 unless the train size is smaller than 128, then set it to the train size.
    if train_size < 128:
        batch_size = train_size
    else:
        batch_size = 128

    train = train.batch(batch_size, drop_remainder=True).prefetch(AUTOTUNE)
    test = test.batch(batch_size, drop_remainder=True).prefetch(AUTOTUNE)

    return train, test, train_size, num_test_samples, batch_size, num_inputs

def build_model(input_dim, n_layers, units, embedding_method, embed_dim=12, n_bins=48):
    print(f"Building model with embedding {embedding_method}, {n_layers} layers, and {units} units per layer")

    model = keras.Sequential([keras.Input(shape=(input_dim,), dtype="float32")])

    # Tabular embedding layer
    if embedding_method == "linear":
        model.add(LinearEmbeddings(input_dim, embed_dim))
        model.add(keras.layers.Flatten())
    elif embedding_method == "linear_relu":
        model.add(LinearReLUEmbeddings(input_dim, embed_dim))
        model.add(keras.layers.Flatten())
    elif embedding_method == "periodic":
        # Defaults: k=64, sigma=0.02, activation=True (you can change)
        model.add(PeriodicEmbeddings(input_dim, embed_dim))
        model.add(keras.layers.Flatten())
    # TODO: fix this
    # elif embedding_method in {"piecewise_linear", "piecewise_linear_relu"}:
    #     # Compute bins **once** outside the training loop; pass numpy or a dense tensor
    #     bins = compute_bins(x_train, n_bins)
    #     model.add(PiecewiseLinearEmbeddings(
    #         bins, embed_dim,
    #         activation=(embedding_method == "piecewise_linear_relu"),
    #         version="B"  # residual linear, as in your code
    #     ))
    #     model.add(keras.layers.Flatten())
    else:
        raise ValueError(f"Unknown embedding_method: {embedding_method}")

    # If you’re using SELU, pair with lecun_normal + AlphaDropout (recommended for SELU)
    for _ in range(n_layers):
        model.add(keras.layers.Dense(units, activation="selu", kernel_initializer="lecun_normal"))
        # optional:
        # model.add(keras.layers.AlphaDropout(0.05))

    model.add(keras.layers.Dense(32, activation="linear"))
    return model

In [9]:
# Fixed args – customize if needed
args = {
    "polarity": "neg",
    "train_size_fraction": 0.1,
    "bootstrap": "b0",
    "data_version": "d1",
    "n_layers": 2,
    "units": 1024,
    "embedding_method": "periodic", # Options: none, linear_relu, periodic, piecewise_linear_relu
    "learning_rate": 1.918416336823577e-05,
    "weight_decay": 3.251785236175247e-06
}

train, test, train_size, num_test_samples, batch_size, num_inputs = load_dataset(
    args["polarity"], args["data_version"], args["train_size_fraction"], args["bootstrap"]
)

# # Get x_final_train from the zipped and shuffled and batched train
# # Collect all batches into memory
# x_batches = []
# for x_batch, _ in train:   # iterate through the dataset
#     x_batches.append(x_batch.numpy())  # convert to numpy

# # Concatenate into a single array
# x_final_train = np.concatenate(x_batches, axis=0)
# x_final_train_tensor = tf.convert_to_tensor(x_final_train, dtype=tf.float32)

# Define model
model = build_model(num_inputs, args["n_layers"], args["units"], args["embedding_method"])

print(model.summary())

Number of training samples: 178889 out of 1788892 total
Number of test samples: 198766
Building model with embedding periodic, 2 layers, and 1024 units per layer
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 periodic_embeddings_3 (Per  (None, 8, 12)             12896     
 iodicEmbeddings)                                                
                                                                 
 flatten_1 (Flatten)         (None, 96)                0         
                                                                 


 dense_3 (Dense)             (None, 1024)              99328     
                                                                 
 dense_4 (Dense)             (None, 1024)              1049600   
                                                                 
 dense_5 (Dense)             (None, 32)                32800     
                                                                 
Total params: 1194624 (4.56 MB)
Trainable params: 1194624 (4.56 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [10]:
# Compile model
optimizer = keras.optimizers.AdamW(learning_rate=args["learning_rate"], weight_decay=args["weight_decay"])
model.compile(optimizer=optimizer, loss='mae', metrics=['mse'])

# Train
history = model.fit(
    train,
    epochs=10,
    validation_data=test,
    shuffle=False,
    verbose=2,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10),
    ]
)

Epoch 1/10


2025-09-08 15:58:42.975046: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 12388095878121654777
2025-09-08 15:58:42.975159: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 802256284724381102


1397/1397 - 16s - loss: 0.0386 - mse: 0.0087 - val_loss: 0.0152 - val_mse: 6.5556e-04 - lr: 1.9184e-05 - 16s/epoch - 11ms/step
Epoch 2/10
1397/1397 - 11s - loss: 0.0138 - mse: 5.7485e-04 - val_loss: 0.0124 - val_mse: 5.0422e-04 - lr: 1.9184e-05 - 11s/epoch - 8ms/step
Epoch 3/10
1397/1397 - 10s - loss: 0.0116 - mse: 4.4578e-04 - val_loss: 0.0107 - val_mse: 3.9535e-04 - lr: 1.9184e-05 - 10s/epoch - 7ms/step
Epoch 4/10
1397/1397 - 9s - loss: 0.0104 - mse: 3.6566e-04 - val_loss: 0.0100 - val_mse: 3.3991e-04 - lr: 1.9184e-05 - 9s/epoch - 7ms/step
Epoch 5/10
1397/1397 - 12s - loss: 0.0095 - mse: 3.1794e-04 - val_loss: 0.0093 - val_mse: 3.0115e-04 - lr: 1.9184e-05 - 12s/epoch - 9ms/step
Epoch 6/10
1397/1397 - 9s - loss: 0.0091 - mse: 2.8649e-04 - val_loss: 0.0088 - val_mse: 2.7408e-04 - lr: 1.9184e-05 - 9s/epoch - 7ms/step
Epoch 7/10
1397/1397 - 10s - loss: 0.0087 - mse: 2.6183e-04 - val_loss: 0.0086 - val_mse: 2.5616e-04 - lr: 1.9184e-05 - 10s/epoch - 7ms/step
Epoch 8/10
1397/1397 - 9s - los

In [11]:
# Evaluate on train
train_results = model.evaluate(train, verbose=0)
train_mae = train_results[0]  # This is the loss (MAE)
train_mse = train_results[1]  # This is the metric (MSE)

# Evaluate on test
test_results = model.evaluate(test, verbose=0)
test_mae = test_results[0]
test_mse = test_results[1]

print(f"Train MAE: {train_mae}, Train MSE: {train_mse}")
print(f"Test MAE: {test_mae}, Test MSE: {test_mse}")

2025-09-08 16:00:51.721511: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11988745245769156963
2025-09-08 16:00:51.721696: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 15565479340471374195
2025-09-08 16:00:58.699620: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11988745245769156963
2025-09-08 16:00:58.699706: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 15565479340471374195


Train MAE: 0.00745767168700695, Train MSE: 0.00020026136189699173
Test MAE: 0.007487054448574781, Test MSE: 0.00020170820062048733
