In [None]:
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # on NERSC filelocking is not allowed
import h5py
import tensorflow.keras as keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense
import tensorflow.keras.backend as K
import pickle as pkl

import tensorflow as tf
# Pin the notebook to a single GPU so it can run alongside other jobs ~~~~~~~~~~
GPU_INDEX = 0  # change to 1, 2, 3, ... to use a different device
gpus = tf.config.list_physical_devices('GPU')
if GPU_INDEX < len(gpus):
    tf.config.set_visible_devices(gpus[GPU_INDEX], 'GPU')
else:
    # Indexing an empty/short device list would raise IndexError; leave
    # TensorFlow's default device placement in place instead.
    print(f"GPU {GPU_INDEX} not available ({len(gpus)} GPU(s) detected); using TensorFlow's default devices.")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import sys
# Path to the directory that models.py lives in -------
# NOTE: Change this to wherever your copy of the repo lives, i.e. /repo/path/VAE_FS/models/
# If the Jupyter kernel is already running from VAE_FS/models/, the line below is not needed.
sys.path.append('/global/homes/j/jananinf/projs/VAE_FS/models/')

# import the custom models and functions
from models import Qmake_encoder_set_weights, Qmake_decoder_set_weights, Qmake_discriminator, VAE_GAN_Model
from data_and_eval_utils import load_preprocessed_snl

# in gan1. We train the VAE_GAN model

2025-07-25 13:58:23.104412: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the preprocessed dataset and keep the 'X_train' entry used for training.
data = load_preprocessed_snl()
X_train = data["X_train"]

# Base directory under which the training cells below write their results (NERSC path).
home_path = "/global/cfs/cdirs/m2616/jananinf/projsIO/VAE_FS/"

Data loaded from preprocessed_SNL_data.h5


In [3]:
# To be copied from gan_training_history.txt ---

# Number of training iterations run by the simple training loop.
NUM_TRAIN = 10

# -- VAE architecture --------------------------------------------------------
INPUT_SZ = 57
H1_SZ, H2_SZ = 32, 16  # hidden layer 1 size, hidden layer 2 size
LATENT_SZ = 3

# -- Discriminator architecture (8, 2 is what ATLAS-VAE-GAN uses) ------------
DISC_H1_SZ, DISC_H2_SZ = 8, 2  # first / second hidden layer of the discriminator

# -- Training schedule and parameters ----------------------------------------
NUM_EPOCHS = 100
STEPS_EPOCH = 20          # steps per epoch
BATCH_SIZE = 1024
STOP_PATIENCE = 40
LR_PATIENCE = 20
LR = 0.001                # learning rate
REDUCE_LR_FACTOR = 0.5
VAL_SPLIT = 0.2           # validation split
CYCLE_LEN = 20
SHUFFLE_BOOL = True

# -- Hyperparameters ---------------------------------------------------------
MIN_BETA, MAX_BETA = 0, 1
MIN_GAMMA, MAX_GAMMA = 1, 50
# ---


### Simple training loop. No parameter sweeping

In [None]:
# Simple training loop: trains NUM_TRAIN independent VAE-GAN models with the
# settings from the configuration cell and (optionally) saves the weights and
# the Keras training history of each run under SAVE_PATH/n_<i>/.
train = True  # set to False to skip training when re-running the notebook
save = True   # write weights + history for every run
# home_path already ends with "/", so no extra leading slash is needed here.
SAVE_PATH = home_path + "GAN_trainings/attempt6/" #

# Next attempt should go to 2
# Attempt History. The original code for each folder should also be tied to the commits. 
# 0: First attempt. GAN as copied from other repo https://github.com/max-cohen54/AD_trigger_training/blob/main/L1AD/software/VAE_GAN/L1_VAE_Analyzer_FDL_GAN_ALT.ipynb
# 1: Added GAN loss to 
# 2: Various parametric sweeps
# 3: Better file naming convention and varied clipnorm
# Notes: Smaller clipnorm ~ 0.1 tended to bring down the losses rather than blowing up.
# Keeping clipnorm to 0.1 in future trainings
# 4: Varying gamma maxes
# 5: gamma = 0 sanity check
# 6: Changed Discriminator to 8, 2 for hidden layers as in https://github.com/max-cohen54/AD_trigger_training/blob/main/L1AD/software/VAE_GAN/L1_VAE_Analyzer_FDL_GAN_ALT.ipynb
## No longer sweeping for now.
# 7: Learning rate set to 0.00001, 
# 8: Learning rate set to 0.000001
# 9: Learning rate set to 0.0000001

early_stopping = EarlyStopping(patience=STOP_PATIENCE, restore_best_weights=True)
# `factor` is the multiplier applied to the learning rate on a plateau, so it
# must be REDUCE_LR_FACTOR; passing LR here would scale the rate by 0.001.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=REDUCE_LR_FACTOR, patience=LR_PATIENCE, verbose=1)

if train:
    for i in range(NUM_TRAIN):
        # Start every run from a clean Keras state so runs do not share graph state.
        tf.keras.backend.clear_session()

        print(f"TRAINING ITERATION {i} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
        enc = Qmake_encoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
        dec = Qmake_decoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
        disc = Qmake_discriminator(INPUT_SZ, DISC_H1_SZ, DISC_H2_SZ) # Modified this to the ATLAS-VAE-GAN

        # NOTE(review): computed but not passed to the model, which receives
        # STEPS_EPOCH instead -- confirm which value the schedule should use.
        steps_per_epoch = X_train.shape[0] // BATCH_SIZE

        # Modified these settings to match the ATLAS VAE-GAN repo
        vae = VAE_GAN_Model(
            enc,
            dec,
            disc,
            cycle_length=CYCLE_LEN,
            min_beta=MIN_BETA,
            max_beta=MAX_BETA,
            min_gamma=MIN_GAMMA,
            max_gamma=MAX_GAMMA,
            max_epochs=NUM_EPOCHS,
            steps_per_epoch=STEPS_EPOCH,
        )
        opt = keras.optimizers.Adam(learning_rate=LR)
        # --
        vae.compile(optimizer=opt)
        history = vae.fit(
            x=X_train,
            validation_split=VAL_SPLIT,
            epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=[early_stopping, reduce_lr],
            shuffle=SHUFFLE_BOOL,
        )

        # Iterative training: each run gets its own sub-folder.
        save_path = SAVE_PATH + f"n_{i}/"
        if save:
            print("SAVING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
            vae.save_weights(filepath=save_path, save_format='tf')

            # Now save the histories (plain dict of per-epoch metric lists)
            with open(save_path + "training_history.pkl", 'wb') as f:
                pkl.dump(history.history, f)

Moving on to a large parametric sweep to find a configuration that works.

In [None]:
from matplotlib import pyplot as plt

# Parametric sweep: for every value in `parameters`, train `n_train` VAE-GAN
# models with the keyword `parameters_key` of VAE_GAN_Model set to that value,
# then save weights, training history and a loss plot per run.
SAVE_PATH = home_path + "GAN_trainings/attempt6/" #
train = False  # flip to True to actually run the sweep
save = True
NUM_TRAIN = 3  # runs per parameter value; also read by the plotting cell below
# Next attempt should go to 2
# Attempt History. The original code for each folder should also be tied to the commits. 
# 0: First attempt. GAN as copied from other repo https://github.com/max-cohen54/AD_trigger_training/blob/main/L1AD/software/VAE_GAN/L1_VAE_Analyzer_FDL_GAN_ALT.ipynb
# 1: Added GAN loss to 
# 2: Various parametric sweeps
# 3: Better file naming convention and varied clipnorm
# Notes: Smaller clipnorm ~ 0.1 tended to bring down the losses rather than blowing up.
# Keeping clipnorm to 0.1 in future trainings
# 4: Varying gamma maxes
# 5: gamma = 0 sanity check
# 6: Changed Discriminator to 8, 2 for hidden layers as in https://github.com/max-cohen54/AD_trigger_training/blob/main/L1AD/software/VAE_GAN/L1_VAE_Analyzer_FDL_GAN_ALT.ipynb
## No longer sweeping for now.


early_stopping = EarlyStopping(patience=STOP_PATIENCE, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=REDUCE_LR_FACTOR, patience=LR_PATIENCE, verbose=1)

# Sweep definition: keyword of VAE_GAN_Model to vary, and the values to try.
parameters = [0]
parameters_key = "max_gamma"
SAVE_PATH = SAVE_PATH + parameters_key + "/"

n_train = NUM_TRAIN # Train at least 3 models per parameter

# Baseline model settings taken from the configuration cell; the swept keyword
# is overridden per variant below.
base_model_kwargs = dict(
    cycle_length=CYCLE_LEN,
    min_beta=MIN_BETA,
    max_beta=MAX_BETA,
    min_gamma=MIN_GAMMA,
    max_gamma=MAX_GAMMA,
)

if train:
    # Fail fast instead of silently training identical models under different labels.
    if parameters_key not in base_model_kwargs:
        raise ValueError(
            f"Sweep key '{parameters_key}' is not one of the model settings "
            f"{sorted(base_model_kwargs)}; wire it into the training code before sweeping it."
        )

    for param in parameters:
        save_path = SAVE_PATH + f"{parameters_key}_{param}/"
        # Apply the swept value so each variant really differs from the baseline.
        model_kwargs = {**base_model_kwargs, parameters_key: param}

        for i in range(n_train):
            # Manually make the directories and file. Python can do it, but its cleaner to do it manually
            with open(SAVE_PATH + "out.txt", "a") as f:
                print(f"Variant: {parameters_key} = {param} TRAINING ITERATION {i} ~~~~~~~~~~~\n", file=f)

            # Start every run from a clean Keras state.
            tf.keras.backend.clear_session()

            enc = Qmake_encoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
            dec = Qmake_decoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
            disc = Qmake_discriminator(INPUT_SZ, DISC_H1_SZ, DISC_H2_SZ) # Testing out these values for now

            # NOTE(review): computed but not passed to the model -- confirm whether it should be.
            steps_per_epoch = X_train.shape[0] // BATCH_SIZE
            vae = VAE_GAN_Model(enc, dec, disc, **model_kwargs)
            opt = keras.optimizers.Adam(learning_rate=LR)
            # The optimizer only takes effect once the model is compiled with it
            # (same call as in the simple training loop).
            vae.compile(optimizer=opt)
            history = vae.fit(
                x=X_train,
                validation_split=VAL_SPLIT,
                epochs=NUM_EPOCHS,
                batch_size=BATCH_SIZE,
                callbacks=[early_stopping, reduce_lr],
                shuffle=SHUFFLE_BOOL,
            )

            # Make loss plot: one curve per recorded metric, validation curves dashed.
            fig, ax = plt.subplots(figsize=(12, 8))
            for key, val in history.history.items():
                if key == 'lr':
                    # Skip the 'lr' entry so only the loss-like curves are drawn.
                    continue
                ax.plot(val, label=key,
                        linestyle="dashed" if key.startswith('val') else "solid")

            # Customize the plot
            ax.set_title(f'Variant: {parameters_key} = {param} Training and Validation Losses Run: {i}')
            ax.set_xlabel('Epoch')
            ax.set_ylabel('Loss')
            ax.legend()
            ax.grid(True)
            ax.set_yscale('log')

            # Iterative training.
            # save_path = save_path + f"n_{i}/" # As of 7/8/25. Should be synced with vae0_analysis
            if save:
                iter_save_path = save_path + f"n_{i}/"

                # Save progress to main out file
                with open(SAVE_PATH + "out.txt", "a") as f:
                    print(f"SAVING Variant: {parameters_key} = {param} TRAINING ITERATION {i} ~~~~~~~~~~~\n Save path: {iter_save_path}\n", file=f, flush = True)

                # Save weights to iter specific folder
                vae.save_weights(filepath=iter_save_path, save_format='tf')
                # Now save the histories
                with open(iter_save_path + "training_history.pkl", 'wb') as f:
                    pkl.dump(history.history, f)
                fig.savefig(iter_save_path + parameters_key + f"_{param}.png")

            # Release the figure; otherwise every run of the sweep keeps one alive in memory.
            plt.close(fig)




Plot Loss vs epoch history.

In [None]:
import matplotlib.pyplot as plt
import pickle as pkl

# Re-plot the saved training histories, one figure per run.
# NOTE(review): SAVE_PATH and NUM_TRAIN are whatever the most recently executed
# training cell left behind (the sweep cell redefines both), so this cell reads
# from SAVE_PATH/n_<i>/ of that cell -- confirm it points at the intended runs.
for i in range(NUM_TRAIN):
    save_path = SAVE_PATH + f"n_{i}/"
    # `history` is the dict pickled from `History.history` by the training cells.
    # pickle can execute arbitrary code on load: only open files written by this notebook.
    with open(save_path + 'training_history.pkl', 'rb') as f:
        history = pkl.load(f)

    # Create a new figure
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot every recorded series; validation curves are dashed. Iterating over
    # the keys (instead of indexing fixed names) also works for histories that
    # lack some metrics, e.g. runs saved without a 'gamma' entry.
    for key, val in history.items():
        if key == 'lr':
            # Skip the 'lr' entry so only the loss-like curves are drawn.
            continue
        ax.plot(val, label=key,
                linestyle="dashed" if key.startswith('val') else "solid")

    # Customize the plot
    ax.set_title(f'Training and Validation Losses Run: {i}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)
    ax.set_yscale('log')

    # Show the plot
    plt.show()

In [None]:
# What do we want as an AD metric? The discriminator or the latent-space variables?