In [None]:
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.metrics import roc_curve, auc
import qkeras
from qkeras import *
#import tensorflow_probability as tfp
#import keras_tuner
#from keras_tuner import Hyperband
import joblib

# data files

All input files store events already arranged on the calorimeter-region grid, indexed by (i, j) with shape (18, 14):<br>
i = 0 -> 17 corresponds to GCT_Phi = 0 -> 17<br>
j = 0 -> 13 corresponds to RCT_Eta = 4 -> 17

In [None]:
def load_calo_regions(path, dataset='CaloRegions'):
    """Read one HDF5 dataset as float32 and reshape it to (-1, 18, 14, 1).

    The file is opened in a ``with`` block so the handle is closed as soon as
    the data has been copied into memory.
    """
    with h5py.File(path, 'r') as h5_file:
        regions = h5_file[dataset][:].astype('float32')
    return np.reshape(regions, (-1, 18, 14, 1))


# Background files, one array per file (each file is read exactly once).
X_A = load_calo_regions('bkg2023/ZB_RunA.h5')
X_B = load_calo_regions('bkg2023/ZB_RunB.h5')
X_C = load_calo_regions('bkg2023/ZB_RunC.h5')
X_D = load_calo_regions('bkg2023/ZB_RunD.h5')
X_EphC = load_calo_regions('bkg2023/EZB0_RunC.h5')

# Pool used for the train/val/test split: RunB, RunC, RunD and EZB0_RunC
# (RunA is not part of the pool). np.concatenate copies, so X is independent
# of the per-file arrays.
X = np.concatenate((X_B, X_C, X_D, X_EphC))

print('X      shape: ' + str(X.shape))
print('X_A    shape: ' + str(X_A.shape))
print('X_B    shape: ' + str(X_B.shape))
print('X_C    shape: ' + str(X_C.shape))
print('X_D    shape: ' + str(X_D.shape))
print('X_EphC shape: ' + str(X_EphC.shape))

MC_files = [
    'sig2023/H_ToLongLived.h5',
    'sig2023/SUEP.h5',
    'sig2023/SUSYGGBBH.h5',
    'sig2023/TT.h5',
    'sig2023/VBFHto2C.h5',
]

MC = []                 # all events of each signal file
Acceptance_Flag = []    # raw 'AcceptanceFlag' dataset of each file (int32)
Acceptance_Filter = []  # boolean mask per file: True where the flag equals 1
MC_flag = []            # events of each file whose flag equals 1
for i, mc_path in enumerate(MC_files):
    # Read both datasets from a single open handle, then close the file.
    with h5py.File(mc_path, 'r') as h5_file:
        regions = h5_file['CaloRegions'][:].astype('float32')
        flags = h5_file['AcceptanceFlag'][:].astype('int32')
    MC.append(np.reshape(regions, (-1, 18, 14, 1)))
    Acceptance_Flag.append(flags)
    # Vectorised replacement of the per-event loop: one mask entry per event.
    accepted = np.ravel(flags)[:MC[i].shape[0]] == 1
    Acceptance_Filter.append(accepted)
    # Boolean indexing returns a copy, so MC_flag does not alias MC.
    MC_flag.append(MC[i][accepted])
    print('i = ' + str(i) + ': ' + str(MC_flag[i].shape) + ' / ' + str(MC[i].shape) + '; accepted ' + str(np.round(np.mean(Acceptance_Flag[i]), 4)))

In [None]:
# Fractions of X assigned to training and validation; the rest is the test set.
train_ratio, val_ratio = 0.5, 0.1
test_ratio = 1 - train_ratio - val_ratio

# Split off the test set first, then divide what is left into train and
# validation so that the overall fractions match the ratios above.
X_train_val, X_test = train_test_split(X, test_size=test_ratio, random_state=42)
val_share_of_remainder = val_ratio / (val_ratio + train_ratio)
X_train, X_val = train_test_split(
    X_train_val,
    test_size=val_share_of_remainder,
    random_state=42,
)
print(f'X_train shape: {X_train.shape}')
print(f'X_val   shape: {X_val.shape}')
print(f'X_test  shape: {X_test.shape}')
# The intermediate array is no longer needed.
del X_train_val

Take a look at some ZB statistics.

In [None]:
# X_A ... X_EphC were read from exactly these files (same dataset, same
# reshape) in the data-loading cell, so reuse them instead of opening every
# file a second time and holding a duplicate copy in memory.
ZB_A = X_A
ZB_B = X_B
ZB_C = X_C
ZB_D = X_D
EZB_C = X_EphC

print('ZeroBias2023A   shape: ' + str(ZB_A.shape))
print('ZeroBias2023B   shape: ' + str(ZB_B.shape))
print('ZeroBias2023C   shape: ' + str(ZB_C.shape))
print('ZeroBias2023D   shape: ' + str(ZB_D.shape))
print('EZeroBias2023C  shape: ' + str(EZB_C.shape))

# Per-region mean over all events of each run.
ZB_A_mean = np.mean(ZB_A, axis = 0)
ZB_B_mean = np.mean(ZB_B, axis = 0)
ZB_C_mean = np.mean(ZB_C, axis = 0)
ZB_D_mean = np.mean(ZB_D, axis = 0)
EZB_C_mean = np.mean(EZB_C, axis = 0)


def plot_mean_et(mean_map, title):
    """Draw one mean-ET heatmap on an 18 x 14 grid and show it.

    The Axes is created with ``fig.add_subplot`` on an empty figure. The
    previous ``plt.subplots`` + ``plt.subplot`` pair first created a
    full-figure Axes and then overlapped it, which leaves a stray empty frame
    on Matplotlib releases that no longer auto-remove overlapping Axes.

    Returns the (figure, axes) pair.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(2, 2, 2)
    sns.heatmap(mean_map.reshape(18, 14), vmin=0, vmax=mean_map.max(),
                cmap="Purples", cbar_kws={'label': 'ET (GeV)'}, ax=ax)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.set_title(title)
    plt.show()
    return fig, ax


for mean_map, title in (
    (ZB_A_mean, 'Mean Et (ZB2023RunA)'),
    (ZB_B_mean, 'Mean Et (ZB2023RunB)'),
    (ZB_C_mean, 'Mean Et (ZB2023RunC)'),
    (ZB_D_mean, 'Mean Et (ZB2023RunD)'),
    (EZB_C_mean, 'Mean Et (EphemeralZB2023RunC)'),
):
    fig, ax = plot_mean_et(mean_map, title)

In [None]:
# (events, histogram label, prefix of the printed mean) for every ZB sample.
zb_samples = (
    (ZB_A, '2023RunA', 'Mean ZB2023A pT = '),
    (ZB_B, '2023RunB', 'Mean ZB2023B pT = '),
    (ZB_C, '2023RunC', 'Mean ZB2023C pT = '),
    (ZB_D, '2023RunD', 'Mean ZB2023D pT = '),
    (EZB_C, '2023EphemeralRunC', 'Mean EphemeralZB2023C pT = '),
)

# Normalised, log-scale distribution of all region values, one curve per sample.
for sample_events, sample_label, _ in zb_samples:
    plt.hist(sample_events.reshape((-1)), bins=100, range=(0, 1024), density=1,
             label=sample_label, log=True, histtype='step')

plt.xlabel("ZB Et")
plt.legend(loc='best')
plt.show()

# Mean over every region of every event, per sample.
for sample_events, _, mean_prefix in zb_samples:
    print(mean_prefix + str(np.mean(sample_events.reshape(-1))))

# CNN AE (teacher model)

In [None]:
encoder_input = tf.keras.Input(shape=(18, 14, 1), name='input')

# Convolutional stack: conv -> relu -> 2x2 average pooling -> conv -> relu -> flatten.
features = layers.Conv2D(filters=20, kernel_size=(3, 3), strides=1, padding='same', name='conv2d_1')(encoder_input)
features = layers.Activation('relu', name='relu_1')(features)
features = layers.AveragePooling2D(pool_size=(2, 2), name='pool_1')(features)
features = layers.Conv2D(filters=30, kernel_size=(3, 3), strides=1, padding='same', name='conv2d_2')(features)
features = layers.Activation('relu', name='relu_2')(features)
features = layers.Flatten(name='flatten')(features)

# 80-unit latent representation.
encoder_output = layers.Dense(units=80, activation='relu', name='latent')(features)

encoder = tf.keras.models.Model(inputs=encoder_input, outputs=encoder_output)
encoder.summary()

In [None]:
# Expand the latent vector back to a 9 x 7 x 30 feature map.
expanded = layers.Dense(units=9*7*30, name='dense')(encoder_output)
expanded = layers.Reshape(target_shape=(9, 7, 30), name='reshape2')(expanded)
expanded = layers.Activation('relu', name='relu_3')(expanded)

# conv -> relu -> 2x upsampling -> conv -> relu
upsampled = layers.Conv2D(filters=30, kernel_size=(3, 3), strides=1, padding='same', name='conv2d_3')(expanded)
upsampled = layers.Activation('relu', name='relu_4')(upsampled)
upsampled = layers.UpSampling2D(size=(2, 2), name='upsampling')(upsampled)
upsampled = layers.Conv2D(filters=20, kernel_size=(3, 3), strides=1, padding='same', name='conv2d_4')(upsampled)
decoder = layers.Activation('relu', name='relu_5')(upsampled)

# Single-channel reconstruction; relu keeps the output non-negative.
decoder_output = layers.Conv2D(filters=1, kernel_size=(3, 3), activation='relu', strides=1, padding='same', name='output')(decoder)

In [None]:
# Autoencoder: encoder input through to the decoder's reconstruction.
teacher = tf.keras.Model(inputs=encoder_input, outputs=decoder_output)
teacher.summary()

In [None]:
# Adam optimiser with a mean-squared-error reconstruction loss.
teacher.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='mse',
)

# Training

In [None]:
# Autoencoder training: the input is also the target.
history = teacher.fit(
    x=X_train,
    y=X_train,
    validation_data=(X_val, X_val),
    epochs=40,
    batch_size=128,
)

In [None]:
# Training and validation loss per epoch from the most recent fit.
plt.figure(figsize=(15, 10))
axes = plt.subplot(2, 2, 1)
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    axes.plot(history.history[history_key], label=curve_label)
axes.legend(loc="upper right")
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')

# Save/load trained models

In [None]:
# Write the trained teacher to disk.
teacher.save(filepath='saved_models/teacher2023_aug1')

In [None]:
# Replace the in-memory teacher with the copy stored on disk.
teacher = tf.keras.models.load_model(filepath='saved_models/teacher2023_aug1')
teacher.summary()

In [None]:
# Save the in-memory student model (v2 path; the v1 path is kept for reference).
#student.save('saved_models/student2023_aug1_v1/')
# This cell sits above the cells that build and train the student, so on a
# fresh kernel `student` does not exist yet. Guard the call so that
# Restart & Run All does not stop here with a NameError.
if 'student' in globals():
    student.save('saved_models/student2023_aug1_v2/')
else:
    print("No in-memory 'student' model yet; skipping save. "
          "Re-run this cell after the student has been trained.")

In [None]:
# Only show TensorFlow log messages at ERROR level from here on.
tf.get_logger().setLevel('ERROR')

# Directory of the saved student; the v1 model is at
# 'saved_models/student2023_aug1_v1'.
student_model_dir = 'saved_models/student2023_aug1_v2' # 2023 v2
student = qkeras.utils.load_qmodel(student_model_dir)
student.summary()
student.get_config()

# Loss distribution

In [None]:
# Teacher reconstructions of the three background splits (train, val, test order).
X_train_predict_teacher, X_val_predict_teacher, X_test_predict_teacher = (
    teacher.predict(zb_split) for zb_split in (X_train, X_val, X_test)
)

# Teacher reconstructions of each signal sample: all events, then the
# acceptance-filtered events.
MC_predict_teacher, MC_flag_predict_teacher = [], []
for all_events, accepted_events in zip(MC, MC_flag):
    MC_predict_teacher.append(teacher.predict(all_events))
    MC_flag_predict_teacher.append(teacher.predict(accepted_events))

In [None]:
def loss(y_true, y_pred, choice='mse'):
    """Return the per-sample reconstruction loss.

    Parameters
    ----------
    y_true, y_pred : array-like
        Arrays of identical shape whose first axis indexes the samples.
    choice : str
        Loss to compute. Only 'mse' is implemented.

    Returns
    -------
    numpy.ndarray
        Mean squared error averaged over every axis except the first, i.e. one
        value per sample. For the (N, 18, 14, 1) arrays used in this notebook
        this is the mean over axes (1, 2, 3).

    Raises
    ------
    ValueError
        If ``choice`` is not a supported loss name. Previously the function
        fell through and returned None, which only surfaced later as an
        unrelated error.
    """
    if choice != 'mse':
        raise ValueError("unsupported loss choice: %r (only 'mse' is implemented)" % (choice,))
    squared_error = (np.asarray(y_true) - np.asarray(y_pred)) ** 2
    # Average over all non-batch axes so inputs of any rank are accepted.
    return np.mean(squared_error, axis=tuple(range(1, squared_error.ndim)))

In [None]:
# Per-event teacher MSE for the three background splits.
X_train_loss_teacher, X_val_loss_teacher, X_test_loss_teacher = (
    loss(zb_split, zb_reco, 'mse')
    for zb_split, zb_reco in (
        (X_train, X_train_predict_teacher),
        (X_val, X_val_predict_teacher),
        (X_test, X_test_predict_teacher),
    )
)

# Per-event teacher MSE for each signal sample (all events / accepted events).
MC_loss_teacher = [
    loss(MC[idx], MC_predict_teacher[idx], 'mse') for idx in range(len(MC))
]
MC_flag_loss_teacher = [
    loss(MC_flag[idx], MC_flag_predict_teacher[idx], 'mse') for idx in range(len(MC))
]

In [None]:
nbins = 80
rmin = 0
rmax = 100
score_range = (rmin, rmax)

# Background splits: filled, semi-transparent, log-scale histograms.
for zb_scores, zb_label in (
    (X_train_loss_teacher, 'train (ZeroBias)'),
    (X_val_loss_teacher, 'val (ZeroBias)'),
    (X_test_loss_teacher, 'test (ZeroBias)'),
):
    plt.hist(zb_scores, density=1, bins=nbins, alpha=0.1, label=zb_label,
             range=score_range, log=True)

# Accepted signal events: one step outline per sample, in MC_files order.
signal_styles = (
    ('H_ToLongLived', 'green'),
    ('SUEP', 'red'),
    ('SUSYGGBBH', 'blue'),
    ('TT', 'orange'),
    ('VBFHto2C', 'purple'),
)
for sample_idx, (signal_label, signal_color) in enumerate(signal_styles):
    plt.hist(MC_flag_loss_teacher[sample_idx], density=1, bins=nbins,
             label=signal_label, color=signal_color, histtype='step',
             range=score_range)

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel("Anomaly score (teacher)")
plt.show()

# Comparison between original and reconstructed inputs

In [None]:
#show_ZB = True
show_ZB = False   # True: show ZB test events; False: show accepted events of signal sample n
n = 3             # index into MC_files, used when show_ZB is False

# Select the event source once instead of branching on show_ZB in every panel.
if show_ZB:
    originals = X_test
    reconstructions = X_test_predict_teacher
    event_losses = X_test_loss_teacher
    header = 'ZB test'
else:
    originals = MC_flag[n]
    reconstructions = MC_flag_predict_teacher[n]
    event_losses = MC_flag_loss_teacher[n]
    header = str(MC_files[n])

# Events 280..299; the upper bound is clipped so a sample with fewer events
# does not raise an IndexError.
for i in range(280, min(300, len(originals))):
    original = originals[i].reshape(18, 14)
    reconstructed = reconstructions[i].reshape(18, 14)
    print(header + '\nloss = ' + str(event_losses[i]))

    # Start from an empty figure: plt.subplots() followed by plt.subplot()
    # leaves a stray full-figure Axes behind the panels on Matplotlib releases
    # that no longer auto-remove overlapping Axes.
    fig = plt.figure(figsize=(17, 17))
    panels = (
        ('Original', original),
        ('Reconstructed', reconstructed),
        ('abs(original-reconstructed)', np.absolute(reconstructed - original)),
    )
    for panel_idx, (panel_title, panel_data) in enumerate(panels, start=1):
        ax = fig.add_subplot(3, 3, panel_idx)
        # All three panels share the colour scale of the original event.
        sns.heatmap(panel_data, vmin=0, vmax=original.max(), cmap="Purples",
                    cbar_kws={'label': 'ET (GeV)'}, ax=ax)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_title(panel_title)

    plt.show()

# Knowledge Distillation (+ quantizing with QKeras)

In [None]:
# v1
x_in = layers.Input(shape=(252,), name="In")

x = QDense(15, kernel_quantizer=quantized_bits(2, 0, 1, alpha=1.0),
           use_bias=False, name='dense1')(x_in)
x = QBatchNormalization(beta_quantizer=quantized_bits(10, 2, 1, alpha='auto'),
                        gamma_quantizer=quantized_bits(10, 2, 1, alpha='auto'),
                        mean_quantizer=quantized_bits(10, 2, 1, alpha='auto'),
                        variance_quantizer=quantized_bits(10, 2, 1, alpha='auto'),
                        name = 'QBN1')(x)
x = QActivation('quantized_relu(5, 2)', name='relu1')(x)
x = QDense(1, kernel_quantizer=quantized_bits(4, 0, 1, alpha=1.0),
           use_bias=False, name='output')(x)

student = tf.keras.models.Model(x_in, x)
student.summary()
student.compile(optimizer = 'adam', loss = 'mse')

In [None]:
# v2
x_in = layers.Input(shape=(252,), name="In")
x = layers.Reshape((18,14,1), name='reshape')(x_in)

x = QConv2D(3,(3,3), strides=2, padding="valid", use_bias=False,
            kernel_quantizer=quantized_bits(16,4,1,alpha='auto'), name='conv')(x)
x = QActivation('quantized_relu(16,4)', name='relu1')(x)
x = layers.Flatten(name='flatten')(x)
x = QDense(20, kernel_quantizer=quantized_bits(16,4,1,alpha='auto'),
           use_bias=False, name='dense1')(x)
x = QActivation('quantized_relu(16,4)', name='relu2')(x)
x = QDense(1, kernel_quantizer=quantized_bits(16,2,1,alpha='auto'),
           use_bias=False, name='output')(x)

student = tf.keras.models.Model(x_in, x)
student.summary()
student.compile(optimizer = 'adam', loss = 'mse')

In [None]:
# Distillation: the student learns to predict the teacher's per-event loss.
# Events are flattened to (N, 252) to match the student's declared
# Input(shape=(252,)); the earlier (N, 252, 1) shape only worked because the
# functional model squeezed the trailing axis implicitly.
history = student.fit(X_train.reshape((-1, 252)), X_train_loss_teacher,
                      epochs = 30,
                      validation_data = (X_val.reshape((-1, 252)), X_val_loss_teacher),
                      batch_size = 1024)

In [None]:
# Training and validation loss per epoch from the most recent fit.
plt.figure(figsize=(15, 10))
axes = plt.subplot(2, 2, 1)
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    axes.plot(history.history[history_key], label=curve_label)
# A log y-axis can be enabled with: axes.set_yscale(value = "log")
axes.legend(loc="upper right")
axes.set_xlabel('Epoch')
axes.set_ylabel('Loss')

In [None]:
# Student scores for the background splits and each signal sample.
# Events are flattened to (N, 252) to match the student's declared
# Input(shape=(252,)) rather than relying on an implicit squeeze of a
# trailing length-1 axis.
X_train_loss_student = student.predict(X_train.reshape((-1, 252)))
X_val_loss_student = student.predict(X_val.reshape((-1, 252)))
X_test_loss_student = student.predict(X_test.reshape((-1, 252)))
MC_loss_student = []
MC_flag_loss_student = []
for i in range(len(MC)):
    MC_loss_student.append(student.predict(MC[i].reshape((-1, 252))))
    MC_flag_loss_student.append(student.predict(MC_flag[i].reshape((-1, 252))))

In [None]:
nbins = 60
rmin = 0
rmax = 60
score_range = (rmin, rmax)

# Background test split only (train/val histograms are not drawn here).
plt.hist(X_test_loss_student, density=1, bins=nbins, alpha=0.1, label='ZB23',
         range=score_range, log=True)

# Accepted signal events: one step outline per sample, in MC_files order.
signal_styles = (
    ('H_ToLongLived', 'green'),
    ('SUEP', 'red'),
    ('SUSYGGBBH', 'blue'),
    ('TT', 'orange'),
    ('VBFHto2C', 'purple'),
)
for sample_idx, (signal_label, signal_color) in enumerate(signal_styles):
    plt.hist(MC_flag_loss_student[sample_idx], density=1, bins=nbins,
             label=signal_label, color=signal_color, histtype='step',
             range=score_range)

plt.title('CICADA_v3_v2')
plt.legend(loc='best')
plt.xlabel("Score")
plt.show()

In [None]:
# Student scores for every background file separately. Events are flattened to
# (N, 252) to match the student's declared Input(shape=(252,)) rather than
# relying on an implicit squeeze of a trailing length-1 axis.
X_test_A_loss_student = student.predict(X_A.reshape((-1, 252)))
X_test_B_loss_student = student.predict(X_B.reshape((-1, 252)))
X_test_C_loss_student = student.predict(X_C.reshape((-1, 252)))
X_test_D_loss_student = student.predict(X_D.reshape((-1, 252)))
X_test_EphC_loss_student = student.predict(X_EphC.reshape((-1, 252)))

In [None]:
nbins = 100
rmin = 0
rmax = 25

# Student score distribution per background file (normalised, log scale).
for run_scores, run_label in (
    (X_test_A_loss_student, 'ZB23A'),
    (X_test_B_loss_student, 'ZB23B'),
    (X_test_C_loss_student, 'ZB23C'),
    (X_test_D_loss_student, 'ZB23D'),
    (X_test_EphC_loss_student, 'EZB23C'),
):
    plt.hist(run_scores, density = 1, bins = nbins, label = run_label,
             range = (rmin, rmax), log = True, histtype = 'step')

plt.title('CICADA_v3_v2, scores on different ZB runs')
# Single legend call: the two anchored legends that used to precede this one
# were replaced by it before the figure was shown, so they had no effect.
plt.legend(loc='best')
plt.xlabel("Score")
plt.show()

# ROC plotting

### Assigning labels and arranging for ROC plotting

In [None]:
# True: use only acceptance-filtered signal events; False: use all signal events.
signal_acceptance_flag = True

# Choose the signal event arrays and score lists once, based on the flag.
if signal_acceptance_flag:
    roc_signal_events = MC_flag
    roc_signal_teacher = MC_flag_loss_teacher
    roc_signal_student = MC_flag_loss_student
else:
    roc_signal_events = MC
    roc_signal_teacher = MC_loss_teacher
    roc_signal_student = MC_loss_student

# Labels: background (y = 0) is the ZB test split, each signal sample is y = 1.
Y_bkg = np.zeros((X_test.shape[0], 1))
Y_sig = [np.ones((roc_signal_events[idx].shape[0], 1)) for idx in range(len(MC))]

# Per signal sample: signal entries first, background entries after.
# True labels
Y_true = [np.concatenate((Y_sig[idx], Y_bkg)) for idx in range(len(MC))]
# Model scores
Y_teacher = [
    np.concatenate((roc_signal_teacher[idx], X_test_loss_teacher))
    for idx in range(len(MC))
]
Y_student = [
    np.concatenate((roc_signal_student[idx], X_test_loss_student))
    for idx in range(len(MC))
]

### Teacher model ROC

In [None]:
plt.figure(figsize = (13, 13))
axes = plt.subplot(2, 2, 1)

# Per-sample ROC results. Entries of fpr_teacher are stored already multiplied
# by 28.61, the factor that puts them on the 'Trigger Rate (MHz)' axis
# (NOTE(review): presumably the zero-bias rate in MHz - confirm).
fpr_teacher = []
tpr_teacher = []
thresholds_teacher = []
roc_auc_teacher = []

# (legend name, colour) per signal sample, in MC_files order.
signal_styles = (
    ('H_ToLongLived', 'green'),
    ('SUEP', 'red'),
    ('SUSYGGBBH', 'blue'),
    ('TT', 'orange'),
    ('VBFHto2C', 'purple'),
)

for i in range(len(MC)):
    # roc_curve returns fresh arrays, so no placeholder allocation is needed.
    fpr_i, tpr_i, thresholds_i = roc_curve(Y_true[i], Y_teacher[i])
    # The AUC is computed on the unscaled false-positive rate.
    roc_auc_teacher.append(auc(fpr_i, tpr_i))
    fpr_teacher.append(fpr_i * 28.61)
    tpr_teacher.append(tpr_i)
    thresholds_teacher.append(thresholds_i)
    # Only samples with a configured style are drawn.
    if i < len(signal_styles):
        signal_name, signal_color = signal_styles[i]
        axes.plot(fpr_teacher[i], tpr_teacher[i], linestyle = '-', lw = 1.5, color = signal_color,
                  label = '%s (AUC = %.5f)' % (signal_name, roc_auc_teacher[i]))

axes.plot([0.003, 0.003], [0, 1], linestyle = '--', lw = 1, color = 'black', label = 'Trigger rate = 3 kHz')
axes.set_xlim([0.0001, 28.61])
axes.set_ylim([0.000001, 1.0])
axes.set_xscale(value = "log")
axes.set_yscale(value = "log")
axes.set_xlabel('Trigger Rate (MHz)',size=15)
axes.set_ylabel('Signal Efficiency',size=15)
axes.set_title('Teacher Network',size=15)
axes.legend(loc='center left', bbox_to_anchor=(0.26, 0.3),fontsize=12)
plt.show()

### Student model ROC

In [None]:
plt.figure(figsize = (13, 13))
axes = plt.subplot(2, 2, 1)

# Per-sample ROC results. Entries of fpr_student are stored already multiplied
# by 28.61, the factor that puts them on the 'Trigger Rate (MHz)' axis
# (NOTE(review): presumably the zero-bias rate in MHz - confirm).
fpr_student = []
tpr_student = []
thresholds_student = []
roc_auc_student = []

# (legend name, colour) per signal sample, in MC_files order.
signal_styles = (
    ('H_ToLongLived', 'green'),
    ('SUEP', 'red'),
    ('SUSYGGBBH', 'blue'),
    ('TT', 'orange'),
    ('VBFHto2C', 'purple'),
)

for i in range(len(MC)):
    # roc_curve returns fresh arrays, so no placeholder allocation is needed.
    fpr_i, tpr_i, thresholds_i = roc_curve(Y_true[i], Y_student[i])
    # The AUC is computed on the unscaled false-positive rate.
    roc_auc_student.append(auc(fpr_i, tpr_i))
    fpr_student.append(fpr_i * 28.61)
    tpr_student.append(tpr_i)
    thresholds_student.append(thresholds_i)
    # Only samples with a configured style are drawn.
    if i < len(signal_styles):
        signal_name, signal_color = signal_styles[i]
        axes.plot(fpr_student[i], tpr_student[i], linestyle = '-', lw = 1.5, color = signal_color,
                  label = '%s (AUC = %.5f)' % (signal_name, roc_auc_student[i]))

axes.plot([0.003, 0.003], [0, 1], linestyle = '--', lw = 1, color = 'black', label = 'Trigger rate = 3 kHz')
axes.set_xlim([0.0001, 28.61])
axes.set_ylim([0.000001, 1])
axes.set_xscale(value = "log")
axes.set_yscale(value = "log")
axes.set_xlabel('Trigger Rate (MHz)',size=15)
axes.set_ylabel('Signal Efficiency',size=15)
axes.set_title('CICADA_v3_v2, signal(Run3) vs ZB(2023)',size=15)
axes.legend(loc='center left', bbox_to_anchor = (0.26, 0.3),fontsize=12)
plt.show()

# cross-validation

In [None]:
# True: use only acceptance-filtered signal events; False: use all signal events.
signal_acceptance_flag = True

# Choose the signal event arrays and student scores once, based on the flag.
if signal_acceptance_flag:
    cv_signal_events = MC_flag
    cv_signal_student = MC_flag_loss_student
else:
    cv_signal_events = MC
    cv_signal_student = MC_loss_student

# Labels: background (y = 0) is the ZB test split, each signal sample is y = 1.
Y_bkg = np.zeros((X_test.shape[0], 1))
Y_sig = [np.ones((cv_signal_events[idx].shape[0], 1)) for idx in range(len(MC))]

# Per signal sample: signal entries first, background entries after.
Y_true = [np.concatenate((Y_sig[idx], Y_bkg)) for idx in range(len(MC))]
Y_student = [
    np.concatenate((cv_signal_student[idx], X_test_loss_student))
    for idx in range(len(MC))
]

def kfold(y, k):
    """Cut ``y`` into ``k`` consecutive folds of equal length.

    Each fold holds ``y.shape[0] // k`` leading-axis entries, taken in order
    and without shuffling. Entries left over when the length is not a multiple
    of ``k`` are dropped. Returns a list of ``k`` slices of ``y``.
    """
    fold_size = y.shape[0] // k
    return [y[fold * fold_size:(fold + 1) * fold_size] for fold in range(k)]

kf=10   # number of folds
X_test_loss_model = X_test_loss_student
Y_model = Y_student
if signal_acceptance_flag == False:
    MC_loss_model = MC_loss_student
else:
    MC_loss_model = MC_flag_loss_student

# The background folds are shared by every signal sample.
X_test_loss_model_kf=kfold(X_test_loss_model,kf)
Y_bkg_kf=kfold(Y_bkg,kf)

plt.figure(figsize = (16, 16))
axes = plt.subplot(2, 2, 1)

# ROC over the full (unfolded) sample, per signal. Entries of fpr are stored
# already multiplied by 28.61, the factor that puts them on the
# 'Trigger Rate (MHz)' axis (NOTE(review): presumably the zero-bias rate in
# MHz - confirm).
fpr = []
tpr = []
thresholds = []
roc_auc = []

# (legend name, colour) per signal sample, in MC_files order.
signal_styles = (
    ('H_ToLongLived', 'green'),
    ('SUEP', 'red'),
    ('SUSYGGBBH', 'blue'),
    ('TT', 'orange'),
    ('VBFHto2C', 'purple'),
)

for j in range(len(MC)):
    # roc_curve returns fresh arrays, so no placeholder allocation is needed.
    fpr_j, tpr_j, thresholds_j = roc_curve(Y_true[j], Y_model[j])
    roc_auc.append(auc(fpr_j, tpr_j))
    fpr.append(fpr_j * 28.61)
    tpr.append(tpr_j)
    thresholds.append(thresholds_j)

    # Pair the i-th signal fold with the i-th background fold.
    MC_loss_model_kf=kfold(MC_loss_model[j],kf)
    Y_sig_kf=kfold(Y_sig[j],kf)
    Y_true_kf = [np.concatenate((Y_sig_kf[i], Y_bkg_kf[i])) for i in range(kf)]
    Y_model_kf = [np.concatenate((MC_loss_model_kf[i], X_test_loss_model_kf[i])) for i in range(kf)]

    # Common false-positive-rate grid onto which every fold is interpolated.
    # It is rebuilt for each signal because it is rescaled in place below.
    fpr_mean=np.linspace(0,1,10000000)
    tpr_kf=[]
    fpr_kf=[]
    thresholds_kf=[]
    roc_auc_kf=[]
    tpr_total=[]
    for i in range(kf):
        fold_fpr, fold_tpr, fold_thresholds = roc_curve(Y_true_kf[i], Y_model_kf[i])
        fpr_kf.append(fold_fpr)
        tpr_kf.append(fold_tpr)
        thresholds_kf.append(fold_thresholds)
        roc_auc_kf.append(auc(fold_fpr, fold_tpr))

        interp_tpr=np.interp(fpr_mean, fold_fpr, fold_tpr)
        interp_tpr[0]=0.0   # pin the curve to the origin
        tpr_total.append(interp_tpr)

    tpr_mean=np.mean(tpr_total, axis=0)
    tpr_mean[-1]=1.0        # pin the curve to (1, 1)
    roc_auc_mean=auc(fpr_mean,tpr_mean)
    roc_auc_std=np.std(roc_auc_kf)

    # One-standard-deviation band across folds, clipped to [0, 1].
    tpr_std=np.std(tpr_total, axis=0)
    tpr_up=np.minimum(tpr_mean+tpr_std,1)
    tpr_down=np.maximum(tpr_mean-tpr_std,0)

    # Rescale only after the AUC has been computed on the unscaled grid.
    fpr_mean *= 28.61

    # Only samples with a configured style are drawn.
    if j < len(signal_styles):
        signal_name, signal_color = signal_styles[j]
        # Raw string: '\p' inside '$\pm$' is an invalid escape sequence in a
        # normal string literal and triggers a warning on current Python.
        axes.plot(fpr_mean, tpr_mean, linestyle = '-', lw = 1.5, color = signal_color,
                  label = r'%s (AUC = %.5f $\pm$ %0.5f)' % (signal_name, roc_auc_mean, roc_auc_std))
        axes.fill_between(fpr_mean, tpr_down, tpr_up, color=signal_color, alpha=0.1)

axes.plot([0.003, 0.003], [0, 1], linestyle = '--', lw = 1, color = 'black', label = 'Trigger rate = 3 kHz')
axes.set_xlim([0.0001, 28.61])
axes.set_ylim([0.000001, 1])
axes.set_xscale(value = "log")
axes.set_yscale(value = "log")
axes.set_xlabel('Trigger Rate (MHz)',size=17)
axes.set_ylabel('Signal Efficiency',size=17)
axes.set_title('CICADA_v3_v2, signal(Run3) vs ZB(2023)',size=17)
axes.legend(loc='center left', bbox_to_anchor = (0.28, 0.3),fontsize=11)
plt.show()