In [None]:
import numpy as np
import glob
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import h5py

from utils import *
from build_model import *

# Seed the NumPy and TensorFlow global random generators with one shared value.
# NOTE(review): `tf` is not imported explicitly in this notebook; it is
# presumably re-exported by one of the star imports above -- confirm.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Number of leading entries read from every HDF5 dataset in this cell.
n_per_sample = 10000


def _load_summed_over_last_axis(dataset, n_events):
    """Read the first `n_events` entries of `dataset`, cast them to float32 and
    sum over the last axis, keeping that axis with length 1."""
    events = tf.cast(dataset[:n_events], tf.float32)
    return np.sum(events, axis=-1, keepdims=True)


# 'HS' and 'PU' datasets of the HHbbbb file.
with h5py.File('HHbbbb.h5', 'r') as signal_file:
    X_HHbbbb_isHS = _load_summed_over_last_axis(signal_file['HS'], n_per_sample)
    X_HHbbbb_isPU = _load_summed_over_last_axis(signal_file['PU'], n_per_sample)

# 'data' dataset of the PJZ0 file.
with h5py.File('PJZ0.h5', 'r') as background_file:
    X_PJZ0 = _load_summed_over_last_axis(background_file['data'], n_per_sample)

# One shape per line, in the order HS, PU, PJZ0.
print(X_HHbbbb_isHS.shape, X_HHbbbb_isPU.shape, X_PJZ0.shape, sep='\n')

In [None]:
def _plot_event(event_idx, X, label):
    """Forward to `plot_layers` with `n_layers=1`, returning whatever it returns."""
    return plot_layers(event_idx=event_idx, X=X, label=label, n_layers=1)


# Event 1 of the HHbbbb sample, first as HS + PU, then HS alone.
_plot_event(1, X_HHbbbb_isHS + X_HHbbbb_isPU, '[HHbbbb, PU=200]')
_plot_event(1, X_HHbbbb_isHS, '[HHbbbb, PU=0]')
# Event 2 of the PJZ0 sample.
_plot_event(2, X_PJZ0, '[QCD dijet, PU=200]')

In [None]:
# Augment the first PU image to `target_pu` and plot it next to the original.
target_pu = 100
pure_pu_event = X_HHbbbb_isPU[0]
x_augmented = augment_pu(image=pure_pu_event, target_pu=target_pu, shift_phi=True, threshold=1)

# NOTE(review): an earlier, commented-out variant of this cell unpacked four
# values from augment_pu (the augmented image plus total_e_before_removal,
# total_e_after_removal and total_e_scale) with shift_phi=False, and printed
# the three totals. Confirm augment_pu's current return signature before
# restoring that diagnostic.

plot_layers(event_idx=None, X=pure_pu_event, label='[Pure PU, 200]', n_layers=1)
plot_layers(event_idx=None, X=x_augmented, label=f'[Pure PU, aug. {target_pu}]', n_layers=1)

## Train VICReg

In [None]:
# Settings for the contrastive batch generator and the training loop.
batch_size = 2048
pu_min = 100
pu_max = 200
threshold = 1
# Steps per epoch: (number of HS events + number of PJZ0 events) // batch size.
n_contrastive_events = X_HHbbbb_isHS.shape[0] + X_PJZ0.shape[0]
steps_per_epoch = n_contrastive_events // batch_size

contrastive_generator_kwargs = dict(
    X_hs=X_HHbbbb_isHS,
    X_pu=X_HHbbbb_isPU,
    X_bkg=X_PJZ0,
    pu_min=pu_min,
    pu_max=pu_max,
    batch_size=batch_size,
    threshold=threshold,
)
gen_data_contrastive = generate_batch_for_contrastive(**contrastive_generator_kwargs)

# NOTE(review): input_shape, embedding_dim, projection_dim, c_inv, c_var and
# c_cov are not defined in this notebook; presumably they come from the star
# imports of `utils` / `build_model` -- confirm.
encoder = build_encoder(input_shape=input_shape, embedding_dim=embedding_dim)
projection_head = build_projection_head(embedding_dim=embedding_dim, projection_dim=projection_dim)

vicreg_model = VICRegModel(
    encoder=encoder,
    projection_head=projection_head,
    c_inv=c_inv,
    c_var=c_var,
    c_cov=c_cov,
)
vicreg_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.005)
vicreg_model.compile(optimizer=vicreg_optimizer)

encoder.summary()
print('\n\n')
projection_head.summary()

history = vicreg_model.fit(gen_data_contrastive, steps_per_epoch=steps_per_epoch, epochs=20)

# One curve per entry of the training history: total loss and its three terms.
loss_fig, axes = plt.subplots(figsize=(6, 4))
loss_curves = (
    ('loss', 'loss (total)'),
    ('loss_inv', 'invariance'),
    ('loss_var', 'variance'),
    ('loss_cov', 'covariance'),
)
for history_key, curve_label in loss_curves:
    axes.plot(history.history[history_key], label=curve_label)
axes.legend(loc="upper right")
# Optional log-scale view: axes.set_yscale('log'); axes.set_ylim((0.001, 100))
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')

In [None]:
# Weight file -> network whose weights are written to it.
vicreg_weight_files = {
    'weights_encoder.h5': encoder,
    'weights_projection_head.h5': projection_head,
}

# Delete any existing files first, then write the current weights.
for weights_path in vicreg_weight_files:
    if os.path.exists(weights_path):
        os.remove(weights_path)

for weights_path, trained_net in vicreg_weight_files.items():
    trained_net.save_weights(weights_path)

In [None]:
# Rebuild both networks, then restore the weights written by the save cell.
encoder_loaded = build_encoder(input_shape=input_shape, embedding_dim=embedding_dim)
projection_head_loaded = build_projection_head(embedding_dim=embedding_dim, projection_dim=projection_dim)

encoder_loaded.load_weights('weights_encoder.h5')
projection_head_loaded.load_weights('weights_projection_head.h5')

# Wrap the restored networks in a VICRegModel with the same three coefficients
# that were used for training.
vicreg_coefficients = dict(c_inv=c_inv, c_var=c_var, c_cov=c_cov)
vicreg_model_loaded = VICRegModel(
    encoder=encoder_loaded,
    projection_head=projection_head_loaded,
    **vicreg_coefficients,
)

## Train embedding classifier

In [None]:
# Settings for the classification batch generator and the training loops.
batch_size = 1024
pu_min = 100
pu_max = 101
# Steps per epoch: (number of HS events + number of PJZ0 events) // batch size.
steps_per_epoch = (X_HHbbbb_isHS.shape[0] + X_PJZ0.shape[0]) // batch_size
threshold = 1

gen_data_classification = generate_batch_for_classifier(X_hs=X_HHbbbb_isHS,
                                                        X_pu=X_HHbbbb_isPU,
                                                        X_bkg=X_PJZ0,
                                                        pu_min=pu_min,
                                                        pu_max=pu_max,
                                                        batch_size=batch_size,
                                                        threshold=threshold)

# --- No fine-tuning: the encoder restored from disk is passed with
# encoder_trainable=False and the classifier is trained for 20 epochs.
embedding_classifier_nofinetune = build_embedding_classifier(encoder=encoder_loaded, input_shape=input_shape, encoder_trainable=False)
embedding_classifier_nofinetune.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])
history_nofinetune = embedding_classifier_nofinetune.fit(gen_data_classification, steps_per_epoch=steps_per_epoch, epochs=20)

# --- Fine-tuning (tuning phase): one epoch with encoder_trainable=True.
# The fine-tuned classifier gets its OWN encoder instance, initialised from the
# same pretrained weight file. Reusing `encoder_loaded` here would let this
# phase update the encoder object that was also handed to
# `embedding_classifier_nofinetune`, so the "no fine-tuning" model (and the
# weights saved from it afterwards) would end up with a fine-tuned encoder.
# NOTE(review): this assumes build_embedding_classifier embeds the encoder
# object it is given rather than cloning it; if it clones, the separate
# instance is simply redundant.
encoder_finetune = build_encoder(input_shape=input_shape, embedding_dim=embedding_dim)
encoder_finetune.load_weights('weights_encoder.h5')
embedding_classifier_finetune = build_embedding_classifier(encoder=encoder_finetune, input_shape=input_shape, encoder_trainable=True)
embedding_classifier_finetune.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])
history_finetune1 = embedding_classifier_finetune.fit(gen_data_classification, steps_per_epoch=steps_per_epoch, epochs=1)

# --- Fine-tuning (non tuning phase): freeze the encoder layer again and train
# for the remaining 19 epochs. The model is re-compiled after changing
# `trainable` so that the change takes effect for training.
embedding_classifier_finetune.get_layer('encoder').trainable = False
embedding_classifier_finetune.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])
history_finetune2 = embedding_classifier_finetune.fit(gen_data_classification, steps_per_epoch=steps_per_epoch, epochs=19)

# Concatenate the two fine-tuning phases into one 20-epoch loss curve and
# compare it with the no-fine-tuning run.
history_finetune = history_finetune1.history['loss'] + history_finetune2.history['loss']
plt.figure(figsize=(6,4))
axes = plt.subplot(1,1,1)
axes.plot(history_nofinetune.history['loss'], label = 'loss (no fine-tuning)')
axes.plot(history_finetune, label='loss (with fine-tuning)')
axes.legend(loc="upper right")
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')
plt.show()

In [None]:
# Weight file -> classifier whose weights are written to it.
embedding_classifier_weight_files = {
    'weights_embedding_classifier_nofinetune.h5': embedding_classifier_nofinetune,
    'weights_embedding_classifier_finetune.h5': embedding_classifier_finetune,
}

# Delete any existing files first, then write the current weights.
for weights_path in embedding_classifier_weight_files:
    if os.path.exists(weights_path):
        os.remove(weights_path)

for weights_path, trained_classifier in embedding_classifier_weight_files.items():
    trained_classifier.save_weights(weights_path)

In [None]:
# Rebuild the two embedding classifiers and restore their saved weights.
# Each classifier gets its OWN encoder instance: if both were built on one
# shared encoder object, the second load_weights call would overwrite the
# encoder weights restored by the first, and the "no fine-tune" classifier
# would be evaluated with the fine-tuned encoder.
# NOTE(review): this assumes build_embedding_classifier embeds the encoder
# object it is given rather than cloning it; if it clones, the separate
# instance is simply redundant.
encoder_loaded = build_encoder(input_shape=input_shape, embedding_dim=embedding_dim)
encoder_loaded.load_weights('weights_encoder.h5')

embedding_classifier_nofinetune_loaded = build_embedding_classifier(encoder=encoder_loaded, input_shape=input_shape)
embedding_classifier_nofinetune_loaded.load_weights('weights_embedding_classifier_nofinetune.h5')

encoder_finetune_loaded = build_encoder(input_shape=input_shape, embedding_dim=embedding_dim)
encoder_finetune_loaded.load_weights('weights_encoder.h5')

embedding_classifier_finetune_loaded = build_embedding_classifier(encoder=encoder_finetune_loaded, input_shape=input_shape)
embedding_classifier_finetune_loaded.load_weights('weights_embedding_classifier_finetune.h5')

## Train standalone classifier

In [None]:
# Settings for the classification batch generator and the training loop.
batch_size = 1024
pu_min = 100
pu_max = 101
threshold = 1
# Steps per epoch: (number of HS events + number of PJZ0 events) // batch size.
n_classifier_events = X_HHbbbb_isHS.shape[0] + X_PJZ0.shape[0]
steps_per_epoch = n_classifier_events // batch_size

classifier_generator_kwargs = dict(
    X_hs=X_HHbbbb_isHS,
    X_pu=X_HHbbbb_isPU,
    X_bkg=X_PJZ0,
    pu_min=pu_min,
    pu_max=pu_max,
    batch_size=batch_size,
    threshold=threshold,
)
gen_data_classification = generate_batch_for_classifier(**classifier_generator_kwargs)

standalone_classifier = build_standalone_classifier(input_shape=input_shape)
standalone_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.005)
standalone_classifier.compile(
    optimizer=standalone_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

history = standalone_classifier.fit(gen_data_classification, steps_per_epoch=steps_per_epoch, epochs=20)

# Training loss per epoch.
standalone_loss_fig, axes = plt.subplots(figsize=(6, 4))
axes.plot(history.history['loss'], label='loss')
axes.legend(loc="upper right")
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')
In [None]:
# Replace any existing weight file with the current standalone classifier weights.
standalone_weights_path = 'weights_standalone_classifier.h5'
if os.path.exists(standalone_weights_path):
    os.remove(standalone_weights_path)

standalone_classifier.save_weights(standalone_weights_path)

In [None]:
# Rebuild the standalone classifier and restore its saved weights.
standalone_weights_file = 'weights_standalone_classifier.h5'
standalone_classifier_loaded = build_standalone_classifier(input_shape=input_shape)
standalone_classifier_loaded.load_weights(standalone_weights_file)

## Evaluate

In [None]:
# Number of trailing entries read from every HDF5 dataset for evaluation.
# NOTE(review): the training cell reads the first 10000 entries of the same
# files; if a dataset holds fewer than 90000 entries, the two slices overlap
# and evaluation events would also have been used for training -- confirm the
# dataset sizes.
n_per_sample = 80000

# The arrays are summed over the last axis (kept with length 1), exactly as in
# the training-data cell, so that the evaluation inputs have the same layout as
# the inputs the classifiers were trained on. tf.reduce_sum keeps the results
# as TensorFlow tensors, as they were before this reduction was added.
# NOTE(review): if generate_dataset_for_classifier already performs this
# reduction, or the last axis already has length 1, summing here is a no-op.
with h5py.File('HHbbbb.h5', 'r') as f:
    X_HHbbbb_isHS = tf.reduce_sum(tf.cast(f['HS'][-n_per_sample:], tf.float32), axis=-1, keepdims=True)
    X_HHbbbb_isPU = tf.reduce_sum(tf.cast(f['PU'][-n_per_sample:], tf.float32), axis=-1, keepdims=True)

with h5py.File('PJZ0.h5', 'r') as f:
    X_PJZ0 = tf.reduce_sum(tf.cast(f['data'][-n_per_sample:], tf.float32), axis=-1, keepdims=True)

print(X_HHbbbb_isHS.shape)
print(X_HHbbbb_isPU.shape)
print(X_PJZ0.shape)

In [None]:
# PU values at which the classifiers are evaluated: 100, 120, ..., 200.
test_pu = list(range(100, 201, 20))
threshold = 1

# Per-PU inputs, labels and predictions; index i corresponds to test_pu[i].
X_test, Y_test = [], []
Y_pred_embedding_nofinetune = []
Y_pred_embedding_finetune = []
Y_pred_standalone = []

# Each classifier paired with the list that collects its predictions.
classifiers_and_outputs = (
    (embedding_classifier_nofinetune_loaded, Y_pred_embedding_nofinetune),
    (embedding_classifier_finetune_loaded, Y_pred_embedding_finetune),
    (standalone_classifier_loaded, Y_pred_standalone),
)

for pu in test_pu:
    x, y = generate_dataset_for_classifier(
        X_hs=X_HHbbbb_isHS,
        X_pu=X_HHbbbb_isPU,
        X_bkg=X_PJZ0,
        target_pu=pu,
        threshold=threshold,
    )
    X_test.append(x)
    Y_test.append(y)

    for classifier, collected_predictions in classifiers_and_outputs:
        collected_predictions.append(classifier.predict(x))
    # Progress marker: the PU value that has just been processed.
    print(pu)

In [None]:
# Embedding classifier without fine-tuning vs. the standalone classifier.
nofinetune_plot_inputs = dict(
    Y_test=Y_test,
    Y_pred_embedding=Y_pred_embedding_nofinetune,
    Y_pred_standalone=Y_pred_standalone,
    test_pu=test_pu,
)
plot_roc(**nofinetune_plot_inputs)
# Optional alternative view:
# plot_sig_eff_vs_pu_at_single_bkgeff(**nofinetune_plot_inputs, bkg_eff_list=[0.05,0.1,0.2,0.5])
plot_eff_vs_pu_at_single_threshold(**nofinetune_plot_inputs, threshold_by_target_bkgeff_pu=[0.001, 100])

In [None]:
# Embedding classifier with fine-tuning vs. the standalone classifier.
finetune_plot_inputs = dict(
    Y_test=Y_test,
    Y_pred_embedding=Y_pred_embedding_finetune,
    Y_pred_standalone=Y_pred_standalone,
    test_pu=test_pu,
)
plot_roc(**finetune_plot_inputs)
# Optional alternative view:
# plot_sig_eff_vs_pu_at_single_bkgeff(**finetune_plot_inputs, bkg_eff_list=[0.05,0.1,0.2,0.5])
plot_eff_vs_pu_at_single_threshold(**finetune_plot_inputs, threshold_by_target_bkgeff_pu=[0.001, 100])