In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # on NERSC filelocking is not allowed
import h5py
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense
import tensorflow.keras.backend as K
from sklearn.metrics import roc_curve, auc
import sklearn.metrics as sk
import pickle as pkl
import pandas as pd

import sys
# Path to dir model.py lives in -------
# NOTE: This needs to be modified to where your repo lives, path to /repo/path/VAE_FS/models/
# If the jupyter notebook kernel is running from VAE_FS/models/ the
# line below is not needed
sys.path.append('/global/homes/j/jananinf/projs/VAE_FS/models/')

# import the custom models and functions
from models import Qmake_encoder_set_weights, Qmake_decoder_set_weights, Qmake_discriminator, VAE_GAN_Model
from data_and_eval_utils import load_preprocessed_snl, plot_rocs, calc_anomaly_dist, AD_score_KL, AD_score_CKL, get_truth_and_scores, eval_rocs, SIG_KEYS
# from models import VAE_Model_ATLAS_beta as NNmodel


# Pin this notebook to a single GPU so it can run alongside jobs on the other devices.
# Change GPU_INDEX (0, 1, 2, 3, ...) to select a different device.
GPU_INDEX = 1
gpus = tf.config.list_physical_devices('GPU')
if GPU_INDEX < len(gpus):
    tf.config.set_visible_devices(gpus[GPU_INDEX], 'GPU')
else:
    # Indexing past the end of `gpus` would raise IndexError on nodes with fewer
    # devices (or none); keep TensorFlow's default device visibility instead.
    print(f"GPU index {GPU_INDEX} not available ({len(gpus)} GPU(s) found); "
          "keeping TensorFlow's default device visibility.")

2025-08-11 16:00:44.375779: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Star-import the shared architecture / training constants. Later cells in this
# notebook read INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ, DISC_H1_SZ, DISC_H2_SZ, LR and
# NUM_TRAIN without importing them by name anywhere else.
# NOTE(review): a star import hides where these names come from; consider an
# explicit import list once the full set of required names is confirmed.
from gan_params import *
# Echo the imported constants so the configuration is recorded in the cell output.
print_base_params()

CONSTANTS IMPORTED:
            NUM_TRAIN      = 10 # Number of iterations to train for.
            # VAE Architecture
            INPUT_SZ       = 57
            H1_SZ          = 32 # Hidden layer 1 size
            H2_SZ          = 16 # "          " 2 "  "
            LATENT_SZ      = 3
            # Discriminator Architecture # 8, 2 is on ATLAS-VAE-GAN
            DISC_H1_SZ     = 8 # Size of first hidden layer of discriminator  
            DISC_H2_SZ     = 2 # "" second hidden layer ""
            # Training schedule and parameters
            NUM_EPOCHS     = 100
            STEPS_EPOCH    = 20 # Steps per epoch
            BATCH_SIZE     = 1024
            STOP_PATIENCE  = 40
            LR_PATIENCE    = 20
            LR             = 0.001 # Learning rate
            REDUCE_LR_FACTOR = 0.5
            VAL_SPLIT      = 0.2 # Validation split
            CYCLE_LEN      = 20
            SHUFFLE_BOOL   = True
            # Hyperparameters
            MIN_BETA       = 0
          

In [3]:
# Root of the project's I/O area (NERSC location).
home_path = "/global/cfs/cdirs/m2616/jananinf/projsIO/VAE_FS/"
# Directory under the root where the GAN training attempts are stored.
SAVE_PATH = f"{home_path}GAN_trainings/"

### Loss plots.

In [4]:
# Plot colour for every training-history entry. The validation version of an
# entry ("val_" + name) shares its colour so train/val curves pair up visually.
# The insertion order of this table drives the order of both `keys` and
# `color_key` below.
# Optional history entries ('raw_loss', 'beta', 'gamma', 'lr' and their
# validation versions) are deliberately left out.
#   raw_loss : reconstruction and kl_loss without beta weighting
_TRAIN_LOSS_COLORS = {
    'loss': 'k',                 # VAE (generator) total loss term
    'reco_loss': 'tab:blue',     # VAE loss term
    'kl_loss': 'crimson',        # VAE loss term
    'disc_loss': 'c',            # VAE loss due to discriminator "failure to fool disc"
    'w_kl_loss': 'tab:orange',   # kl_loss * beta
    'w_disc_loss': 'tab:green',  # disc_loss * gamma
    'd_loss': 'r',               # discriminator loss
}
_train_keys = list(_TRAIN_LOSS_COLORS)

# History keys: every training entry first, followed by the validation versions
# in the same order.
keys = _train_keys + ['val_' + name for name in _train_keys]

# Colour lookup: each training entry is immediately followed by its validation twin.
color_key = {
    prefix + name: colour
    for name, colour in _TRAIN_LOSS_COLORS.items()
    for prefix in ('', 'val_')
}


# # Generate cleaner legend
# proxy_lines = {}
# for key in keys:
#     base_key = key.replace('val_', '')  # Strip 'val_' to group them
#     if base_key not in proxy_lines and key in color_key:
#         proxy_lines[base_key] = Line2D([0], [0], 
#                                     color=color_key[key], 
#                                     lw=2, 
#                                     label=base_key)
# clean_leg = list(proxy_lines.values())
# for att_n in range(6, 38): # plot all attempts. most recent is 18.
#     att_path = SAVE_PATH + f"attempt{att_n}/"

#     # Make folder for loss plots if it doesn't exist
#     plot_dir = os.path.join(SAVE_PATH, f"loss_plots/attempt{att_n}/")
#     os.makedirs(plot_dir, exist_ok=True)
    
#     for i in range(10): # Currently only training 10 models at a time.
#         save_path = att_path + f"n_{i}/"
#         with open(save_path + 'training_history.pkl', 'rb') as f:
#             history = pkl.load(f)
    
        
#         # Plot training losses
#         # fig, (ax, ax2) = plt.subplots(nrows=2, sharex=True, figsize=(8,10))
#         fig = plt.figure(figsize=(12, 8))
#         gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1])  # 3:1 means top gets 75%, bottom 25%

#         ax = fig.add_subplot(gs[0])
#         ax2 = fig.add_subplot(gs[1], sharex=ax)

#         # Calculate fractional contributions to total VAE loss
#         loss = np.array(history['loss'])
#         reco_loss = np.array(history['reco_loss'])
#         beta = np.array(history['beta'])
#         reco_loss_frac = (reco_loss * (1 - beta))/loss

#         w_kl_loss_frac = np.array(history['w_kl_loss'])/loss
#         w_disc_loss_frac = np.array(history['w_disc_loss'])/loss
#         ax2.plot(reco_loss_frac, label='reco_loss_frac')
#         ax2.plot(w_kl_loss_frac, label='w_kl_loss_frac')
#         ax2.plot(w_disc_loss_frac, label='w_disc_loss_frac')

#         # Tweak fractional plot
#         # ax2.set_ylim((0,1))
#         ax2.set_ylabel('Approximate\nTotal VAE Loss fraction')
#         # ax2.tick_params(axis='y', labelcolor='b')
#         ax2.legend()
#         ax2.grid()
#         ax2.set_xlabel('Epoch')

#         for key in keys:
#             if key == 'lr' or history.get(key) == None:
#                 continue
#             ax.plot(np.abs(history[key]),
#                      label=key, 
#                      linestyle = "dashed" if key[0:3] == 'val' else "solid",
#                      marker= "x" if key[0:3] == 'val' else "o",
#                      markersize=6.5,
#                      color=color_key[key])
    
#         # Customize the plot
#         ax.set_title(f'Training and Validation Losses, Attempt: {att_n} Run: {i}')
#         ax.set_ylabel('Loss')
#         # ax.tick_params(axis='y', labelcolor='r')
#         # ax.legend()
#         ax.grid(True)
#         ax.set_yscale('log')
#         ax.legend(handles=clean_leg, title="○ = train, x = val")
#         plt.savefig(SAVE_PATH + f"loss_plots/attempt{att_n}/" + f"loss_attempt_{att_n}_run_{i}.png", bbox_inches='tight')
#         # plt.show()
#         plt.close(fig)
#     print(f"Attempt {att_n} plotting complete!")

In [5]:
# Load the preprocessed SNL datasets through the project helper.
# NOTE(review): later cells call data.items(), so this is presumably a dict-like
# mapping of dataset name -> samples — confirm against data_and_eval_utils.
data = load_preprocessed_snl()
# X_train = data['X_train']

Data loaded from preprocessed_SNL_data.h5


##### Calculate Anomaly scores

After inspecting the graphs, a few notable models remain.

In [6]:
 # mins 88, 90, 89, 79 for AUC. 
# # 16 did the best AUC and I think also has higher TPR @ target FPR

Generate ROC curves for all trained iterations of the model. Save them and
generate a list of models with their AUC so they can be ranked later.

In [None]:
# Build a single VAE-GAN with the architecture constants from gan_params. The
# weights of every trained iteration are loaded into this same instance in turn.
new_enc = Qmake_encoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
new_dec = Qmake_decoder_set_weights(INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ)
new_disc = Qmake_discriminator(INPUT_SZ, DISC_H1_SZ, DISC_H2_SZ)
new_VAE = VAE_GAN_Model(new_enc, new_dec, new_disc)
opt = keras.optimizers.Adam(learning_rate=LR)  # compiling with an optimizer silences benign warnings
new_VAE.compile(optimizer=opt)

FIG_SAVE_PATH = SAVE_PATH + "roc_plots/"
PKL_SAVE_PATH = SAVE_PATH + "temp_pkls/"
ATTEMPTS = range(17, 28)   # attempts evaluated by this run (the full set is split across runs)
MAKE_ROC_PLOTS = False     # set True to also save one ROC figure per iteration

# Create the output directory up front so a missing folder cannot make the
# pickle dump fail after the expensive evaluation has already run.
os.makedirs(PKL_SAVE_PATH, exist_ok=True)

for att_n in ATTEMPTS:
    # Results are collected per attempt and written to one pickle per attempt.
    roc_results = {}   # "Attempt{att_n}_iter_{i}" -> {signal key: AUC}
    bad_iters = {}     # "Attempt{att_n}" -> list of iterations eval_rocs rejected

    fig_save_path = FIG_SAVE_PATH + f"attempt{att_n}/"
    if MAKE_ROC_PLOTS:
        os.makedirs(fig_save_path, exist_ok=True)

    for i in range(NUM_TRAIN):
        save_path = SAVE_PATH + f"attempt{att_n}/n_{i}/"

        new_VAE.load_weights(save_path)
        just_enc = new_VAE.get_layer("encoder")  # We only need encoder output

        roc_perfs = eval_rocs(just_enc, data, AD_score_CKL)

        if roc_perfs is None:
            print(f"Bad Iteration. Attempt: {att_n}, Iteration: {i}.")
            # Create the list on first use, then append the failed iteration.
            bad_iters.setdefault(f"Attempt{att_n}", []).append(i)
            continue

        # Keep only the AUC of each signal for later ranking.
        roc_results[f"Attempt{att_n}_iter_{i}"] = {k: roc_perfs[k]['auc'] for k in SIG_KEYS}

        if MAKE_ROC_PLOTS:
            roc_fig = plot_rocs(roc_perfs, f"ROC Curves. CKL as Anomaly Score. Attempt: {att_n}, Iter: {i}")
            roc_fig.savefig(fig_save_path + f"roc_att{att_n}_iter{i}.png", bbox_inches='tight')
            plt.close(roc_fig)  # avoid accumulating open figures across the loop
            print(f"Roc curve plotted and saved. for Attempt: {att_n}, iter: {i}!")

        print(f"Completed Attempt: {att_n} Iter: {i}")

    # Pickle layout: [bad_iters, roc_results]
    with open(PKL_SAVE_PATH + f"roc_results_att_{att_n}.pkl", "wb") as pkl_file:
        pkl.dump([bad_iters, roc_results], pkl_file)

# NOTE(review): after this loop `just_enc` refers to the encoder holding the weights
# of the LAST iteration loaded above (good or bad). The anomaly-score cell below
# reuses it, so reload the intended model's weights before running that cell.

 5621/25000 [=====>........................] - ETA: 13s  

In [None]:

# Display labels (matplotlib mathtext) for the datasets, in plotting order.
# NOTE(review): labels are matched to the anomaly scores by position, so this order
# must agree with the iteration order of `data` — presumably
# [leptoquark, Ato4l, hChToTauNu, hToTauTau, X_train, X_test]; confirm against the loader.
# Raw strings keep the LaTeX backslashes literal; "\ell" and "\pm" in a normal
# string are invalid escape sequences and trigger a warning on recent Python versions.
data_names_tex = [
    "Leptoquark",
    r"$A\rightarrow 4\ell$",
    r"$h^{\pm}\rightarrow\tau \nu$",
    r"$h^0\rightarrow\tau\tau$",
    "Training Set (BG)",  # Background
    "Test Set (BG)",      # Background
]

# One anomaly-score result per dataset, in the iteration order of `data`.
# Scores are computed with whatever encoder `just_enc` currently refers to,
# using the clipped-KL score function.
anomaly_scores = [
    calc_anomaly_dist(samples, just_enc, AD_score_CKL)
    for _name, samples in data.items()
]


In [None]:
# Plot settings for the clipped-KL (CKL) anomaly score.
bin_n = 125          # number of histogram bins
xlims = (0, 40)
ylims = (0, 0.03)
# bin_n bins need bin_n + 1 edges; linspace(..., bin_n) would give only bin_n - 1 bins.
bins = np.linspace(xlims[0], xlims[1], bin_n + 1)
xlabel = "Clipped KL"

# Alternative settings kept for reference:
#   * plain KL divergence: bin_n = 125, xlims = (0, 40), ylims = (0, 0.0125),
#     xlabel = "KL Divergence"
#   * zoom around the CKL ROC threshold (ckl_roc_threshold = 161.84): bin_n = 10,
#     xlims = (ckl_roc_threshold - 10, ckl_roc_threshold + 300), ylims = (0, 0.01)

# Overlay one normalised step histogram per dataset. Labels and scores are paired
# by position, so `data_names_tex` must be in the same order as `anomaly_scores`.
fig, ax = plt.subplots()
for label, scores in zip(data_names_tex, anomaly_scores):
    ax.hist(scores, bins=bins, label=label, histtype="step", density=True)

ax.legend(loc="upper right")
ax.set_xlabel(xlabel)
ax.set_ylabel("Density")
ax.grid(True)
ax.set_ylim(ylims)
ax.set_xlim(xlims)
ax.set_title("Anomaly Score Distribution Across Datasets")
plt.show()  # render the figure without echoing the Text repr of the last call