In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # on NERSC filelocking is not allowed
import h5py
import tensorflow as tf
# Make notebook run on other GPUS. GPT's solution ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# gpus = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(gpus[2], 'GPU')  # change the index (0, 1, 2 or 3) to pick a GPU
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# import tensorflow.math as tfmath
import tensorflow.keras as keras
# from scipy.optimize import curve_fit
# from tensorflow.keras import layers, Model
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# from tensorflow.keras.models import load_model
from sklearn.metrics import roc_curve, auc
import sklearn.metrics as sk
# from tensorflow.keras.models import Model
from tensorflow.keras.layers import PReLU, Input, LSTM, Flatten, Concatenate, Dense, Conv2D, TimeDistributed, MaxPooling2D, LeakyReLU, ReLU, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam, SGD
# from tensorflow.keras.metrics import Precision
# # from qkeras import QActivation, QDense, QConv2D, QBatchNormalization, QConv2DBatchnorm # These don't seem to be used
# # from qkeras import quantized_relu, quantized_bits
from tensorflow.keras.regularizers import l1, l2, l1_l2


## import the custom models and functions
import sys
# NOTE: This needs to be modified to where your repo lives, path to /repo/path/VAE_FS/models/
sys.path.append('/global/homes/j/jananinf/projs/VAE_FS/models/') 
from models import VAE_Model, Qmake_encoder_set_weights, Qmake_decoder_set_weights

In [None]:
# Load data
home_path = "/global/cfs/cdirs/m2616/jananinf/projsIO/VAE_FS/" # Updated to NERSC
file_path = f"{home_path}preprocessed_SNL_data.h5"
# Every dataset is read fully into memory (`[:]`) while the file is open, so
# the arrays stay usable after the with-block has closed the file.
with h5py.File(file_path, 'r') as hf:
    (
        X_train,
        X_test,
        Ato4l_data,
        hToTauTau_data,
        hChToTauNu_data,
        leptoquark_data,
    ) = (
        hf[key][:]
        for key in (                 # Shapes (as recorded by the original author):
            'X_train',               # (3200000, 57)
            'X_test',                # (800000,  57)
            'Ato4l_data',            # (55969,   57) Signal data? (original author's open question)
            'hToTauTau_data',        # (691283,  57)
            'hChToTauNu_data',       # (760272,  57)
            'leptoquark_data',       # (340544,  57)
        )
    )
    print("Data loaded from preprocessed_SNL_data.h5")

In [None]:
# Sizes handed to the encoder/decoder builders.
# INPUT_SZ matches the 57 columns noted for the loaded arrays; H1_SZ / H2_SZ
# are presumably the two hidden-layer widths and LATENT_SZ the latent
# dimension -- TODO confirm against models.py.
INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ = 57, 32, 16, 3

enc = Qmake_encoder_set_weights(
    INPUT_SZ,
    H1_SZ,
    H2_SZ,
    LATENT_SZ,
)
enc.summary()

In [None]:
# Build the decoder from the same four sizes used for the encoder, then print
# its layer summary.
dec = Qmake_decoder_set_weights(
    INPUT_SZ,
    H1_SZ,
    H2_SZ,
    LATENT_SZ,
)
dec.summary()

In [None]:
# def _custom_MSE(reconstruction, data):
#     # "We use a dataset with standardized p_T as a target so that all quantities are O(1)" arXiv: 2108.03986 

#     # Q: is the input also standardized?
    
#     loss = keras.losses.mse(data, reconstruction)
#     return loss



### Training

In [None]:
# Training hyper-parameters.
NUM_EPOCHS, BATCH_SIZE = 100, 16384
# Patience values (in epochs) consumed by the EarlyStopping and
# ReduceLROnPlateau callbacks respectively.
STOP_PATIENCE, LR_PATIENCE = 15, 10

# Number of whole batches in the full training array.
# NOTE(review): fit() is called with validation_split=0.2, so Keras trains on
# fewer samples per epoch than X_train.shape[0] -- confirm that this is the
# step count VAE_Model expects.
steps_per_epoch = X_train.shape[0] // BATCH_SIZE

vae = VAE_Model(
    enc,
    dec,
    steps_per_epoch=steps_per_epoch,
    cycle_length=10,
    min_beta=0.1,
    max_beta=0.8,
)
opt = keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1000)
# No loss is passed to compile(); presumably VAE_Model computes its own loss
# -- TODO confirm. (Original note: not sure what weighted_mse is doing.)
vae.compile(optimizer=opt)



In [None]:
# Both callbacks watch the validation loss ('val_loss' is also the default
# quantity monitored by EarlyStopping, written out here for clarity).
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=STOP_PATIENCE,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=LR_PATIENCE,
    factor=0.5,
    verbose=1,
)

In [None]:
# Set to False to skip training; `history` is only defined when this is True.
train = True
if train:
    history = vae.fit(
        x=X_train,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        validation_split=0.2,
        shuffle=True,
        callbacks=[early_stopping, reduce_lr],
    )


In [None]:
# Flip to True to write the current weights to `save_path`.
save = False
# Last save is in attempt 1. New save should go to attempt 2
# (`save_path` is also the location the reload cell reads from.)
save_path = f"{home_path}/VAE_trainings/attempt1/"
if save:
    vae.save_weights(save_path, save_format='tf')

# Attempt History. The original code for each folder should also be tied to the commits. 
# 0: no weighted MSE, no call_backs
# 1: adding ReduceLROnPlateau and early_stopping and the test_step

In [None]:
# Plot the per-epoch training (solid) and validation (dashed) losses.
#
# `history` is only created by the training cell when `train` is True, so the
# plot is skipped otherwise instead of failing with a NameError on a fresh
# "Restart & Run All". matplotlib.pyplot is already imported as `plt` in the
# first cell, so it is not re-imported here.
if train:
    # Per-epoch loss values recorded by fit()
    total_loss = history.history['loss']
    reco_loss = history.history['reconstruction_loss']
    kl_loss = history.history['kl_loss']
    val_total_loss = history.history['val_loss']
    val_reco_loss = history.history['val_reconstruction_loss']
    val_kl_loss = history.history['val_kl_loss']

    fig, ax = plt.subplots(figsize=(12, 8))

    # (values, legend label, colour, line style); each training curve shares
    # its colour with the matching validation curve.
    loss_curves = (
        (total_loss, 'Total Loss', 'blue', '-'),
        (reco_loss, 'Reconstruction Loss', 'green', '-'),
        (kl_loss, 'KL Loss', 'red', '-'),
        (val_total_loss, 'Val Total Loss', 'blue', '--'),
        (val_reco_loss, 'Val Reconstruction Loss', 'green', '--'),
        (val_kl_loss, 'Val KL Loss', 'red', '--'),
    )
    for values, label, color, linestyle in loss_curves:
        ax.plot(values, label=label, color=color, linestyle=linestyle)

    # Customize the plot
    ax.set_title('Training and Validation Losses')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_yscale("log")
    ax.legend()
    ax.grid(True)

    # Show the plot
    plt.show()
else:
    print("Skipping loss plot: no training history because train is False")

##### Reload from the latest save

In [None]:
# Rebuild an untrained encoder/decoder pair with the same sizes, wrap them in
# a fresh VAE_Model and restore the weights stored under `save_path`.
new_enc = Qmake_encoder_set_weights(
    INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ,
)
new_dec = Qmake_decoder_set_weights(
    INPUT_SZ, H1_SZ, H2_SZ, LATENT_SZ,
)
new_VAE = VAE_Model(new_enc, new_dec)
new_VAE.load_weights(save_path)

# We only need encoder output, so pull the sub-model registered as "encoder".
just_enc = new_VAE.get_layer(name="encoder")

##### Encode data, calculate anomaly score and plot ROC

In [None]:
# Helper functions. Each one is intended to be passed as the AD_metric argument of calc_anomaly_scores.
def AD_score_KL(z_mean, z_log_var):
    """
    KL-divergence anomaly score.

    Averages -0.5 * (1 + z_log_var - z_mean**2 - exp(z_log_var)) over every
    element of the inputs and returns that single value.
    Compare with eq. 2 of arXiv:2108.03986, where z_log_var = log(sigma**2).
    """
    per_element = 1 + z_log_var - (z_mean) ** 2 - np.exp(z_log_var)
    return np.mean(-0.5 * per_element)


def AD_score_CKL(z_mean, _): # z_log_var not used
    """
    Anomaly score built from the latent mean only: the average of z_mean**2
    over every element of z_mean. The second argument is accepted so that all
    AD metrics share one call signature, and is ignored.
    """
    squared_mean = z_mean ** 2
    return np.mean(squared_mean)

def AD_score_Rz(z_mean, z_log_var):
    """
    R_z anomaly score: z_mean**2 / sigma**2, with sigma**2 = exp(z_log_var),
    summed over every element of the inputs.

    The sum makes this metric return a single value per call, like
    AD_score_KL and AD_score_CKL, so the result can be stored as one score
    per event. For scalar inputs the sum is a no-op and the value is
    unchanged from the un-summed expression.
    """
    return np.sum(z_mean**2/np.exp(z_log_var))

def calc_anomaly_scores(data, encoder: "keras.Model", AD_metric, debug = True):
    """
    Encode `data` and reduce every event to one anomaly score.

    Parameters:
    -----------
    data: events to score; passed unchanged to `encoder.predict`.
    encoder: model whose `predict` returns three outputs. Per the original
    author's note the order is z_mean, z_log_var, z (by design of the
    encoder) -- confirm against models.py.
    AD_metric: callable `AD_metric(z_mean, z_log_var)` evaluated once per
    event with that event's latent mean and log-variance.
    debug: Optional bool to skip latent space vectors that produce infinities.
    Currently set to true as it seems only 2 specific cases are affected.
    A skipped event keeps a score of 0.

    Returns:
    --------
    np.ndarray of shape (len(data),) holding one score per event.
    """
    # predict() on a model with several outputs gives one array per output,
    # which is why stacking the result shows a leading axis of length 3
    # (3, len(data), latent size). Output 0 is z_mean and output 1 is
    # z_log_var. Indexing works for both a list of arrays and a stacked array.
    # The previous code kept only output 0 and then read its first two latent
    # components as if they were z_mean and z_log_var.
    encoded = encoder.predict(data)
    z_mean_all, z_log_var_all = encoded[0], encoded[1]

    scores = np.zeros(len(data))
    for i, (z_mean, z_log_var) in enumerate(zip(z_mean_all, z_log_var_all)):
        # np.sum is a no-op for metrics that already return a scalar and
        # collapses a per-latent-dimension result into one number.
        score = np.sum(AD_metric(z_mean, z_log_var))
        if debug and np.isinf(score):
            print("RUNTIME WARNING: inf encountered. Skipping these values\n"
                  + f"z_mean: {z_mean}\n"
                  + f"z_log_var: {z_log_var}")
            continue
        scores[i] = score

    return scores

##### Calculate Anomaly scores

In [None]:
def get_truth_and_scores(encoder, AD_metric, debug=True):
    """
    Build the label and score arrays needed for one ROC curve per signal.

    The background sample (X_test) is scored once and paired with each signal
    sample in turn.

    Parameters:
    -----------
    encoder: encoder model forwarded to calc_anomaly_scores.
    AD_metric: anomaly-score function forwarded to calc_anomaly_scores.
    debug: forwarded to calc_anomaly_scores for both the background and the
    signal samples.

    Returns:
    --------
    (truths, scores): two lists with one entry per signal, in the order
    leptoquark, Ato4l, hChToTauNu, hToTauTau. Each truths entry is zeros for
    the background events followed by ones for the signal events; each scores
    entry is the background scores followed by that signal's scores.
    """
    signal_data = [leptoquark_data, Ato4l_data, hChToTauNu_data, hToTauTau_data]

    # The background is shared by every signal, so it is scored only once.
    # `debug` is forwarded here as well; previously this call always used
    # calc_anomaly_scores' default regardless of the caller's choice.
    bg_score = calc_anomaly_scores(X_test, encoder, AD_metric, debug=debug)
    bg_truth = np.zeros(len(X_test))

    # Generate Truth and score lists ready for ROC curve calculation
    truths = []
    scores = []
    for dat in signal_data:
        sig_score = calc_anomaly_scores(dat, encoder, AD_metric, debug=debug)
        truths.append(np.concatenate((bg_truth, np.ones(len(dat)))))
        scores.append(np.concatenate((bg_score, sig_score)))

    return (truths, scores)

In [None]:
def plot_rocs(truths, scores, fig_title):
    """
    Draw one ROC curve per signal on a log-log plot and print each signal's
    TPR at the target FPR.

    Parameters:
    -----------
    truths: sequence of label arrays, one per signal, passed to roc_curve as
    the true labels.
    scores: sequence of anomaly-score arrays matching `truths` entry by entry.
    fig_title: title of the figure.

    The i-th entry is labelled with the i-th signal name below, so the inputs
    must be ordered Leptoquark, A to 4L, h to Tau Nu, h to Tau Tau. Entries
    beyond the four names are ignored because zip stops at the shortest input.
    """
    target_fpr = 1e-5
    tpr_at_target = []
    # Raw strings keep the LaTeX backslashes literal; "\e" and "\p" are not
    # valid Python escape sequences in a normal string literal.
    signal_names_tex = [ # latex version
                    "Leptoquark"
                    , r"$A\rightarrow 4\ell$"
                    , r"$h^{\pm}\rightarrow\tau \nu$"
                    , r"$h^0\rightarrow\tau\tau$"
                    ]
    signal_names_hum = [ # human readable
                    "Leptoquark"
                    ,"A to 4L"
                    , "h to Tau Nu"
                    , "h to Tau Tau"
                    ]

    for truth, score, label in zip(truths, scores, signal_names_tex):
        fpr, tpr, _thresholds = roc_curve(truth, score)
        # Named auc_value so the imported sklearn.metrics.auc is not shadowed.
        auc_value = sk.roc_auc_score(truth, score)
        plt.plot(fpr, tpr, label=f"{label}: {round(auc_value, 3)}") # plot roc curve

        # TPR at the ROC point whose FPR is closest to the target
        idx = np.argmin(np.abs(fpr - target_fpr))
        tpr_at_target.append(tpr[idx])

    # Diagonal reference line and a marker at the target FPR
    plt.plot(np.linspace(0, 1, 1000), np.linspace(0, 1, 1000), "--")
    plt.vlines(target_fpr, 0, 1, colors="r", linestyles="dashed")

    # Plot tweaks
    plt.loglog()
    plt.legend()
    plt.grid()
    plt.xlabel("fpr")
    plt.ylabel("tpr")
    plt.title(fig_title)
    plt.show()

    # The label is built from target_fpr; the previous hard-coded "10e-5"
    # reads as 1e-4, not the 1e-5 actually used.
    for sig_nam, tpr_value in zip(signal_names_hum, tpr_at_target):
        print(f"{sig_nam} TPR @ FPR {target_fpr:.0e} (%): {tpr_value*100:.2f}")
    

In [None]:
# ROC curves with the KL-divergence score as the anomaly metric.
t, s = get_truth_and_scores(encoder=just_enc, AD_metric=AD_score_KL)
plot_rocs(truths=t, scores=s, fig_title="ROC Curves using $D_{KL}$ as Anomaly Score")

In [None]:
# ROC curves with the mean-only (CKL) score as the anomaly metric.
t, s = get_truth_and_scores(encoder=just_enc, AD_metric=AD_score_CKL)
plot_rocs(truths=t, scores=s, fig_title="ROC Curves using $CKL$ as Anomaly Score")

In [None]:
# ROC curves with the R_z score as the anomaly metric.
t, s = get_truth_and_scores(encoder=just_enc, AD_metric=AD_score_Rz)
plot_rocs(truths=t, scores=s, fig_title="ROC Curves using $R_z$ as Anomaly Score")