In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
NOISE_DIM = 128 # 64 in Gitlab
TESTING= False
BATCH_SIZE = 64
SAMPLE_SIZE= 50000
BINS = 25

In [3]:
filenames = {
    "herwig": "../GAN-data/events_anomalydetection_DelphesHerwig_qcd_features.h5",
    "pythiabg": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_qcd_features.h5",
    "pythiasig": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_Wprime_features.h5"
}

datatypes = ["herwig", "pythiabg", "pythiasig"]

train_features = ["ptj1", "etaj1", "mj1", "ptj2", "etaj2", "phij2", "mj2", "tau21j1", "tau21j2"]
condition_features = ["mjj"]

features = train_features + condition_features
GEN_DIM = NOISE_DIM + len(condition_features)
DISC_DIM = len(features)

In [4]:
def cut_data(uncut_data, pTmin = 1200, etamax = 2.5):
    # Column 0: ptj1
    # Column 1: etaj1
    # Column 3: ptj2
    # Column 4: etaj2
    return uncut_data[((uncut_data[:,0] > pTmin) & (np.abs(uncut_data[:,1]) < etamax)) | ((uncut_data[:,3] > pTmin) & (np.abs(uncut_data[:,4]) < etamax))]

np_bg_SB = np.load('..\data\processed\\np_bg_SB_2.npy')
np_bg_SR = np.load('..\data\processed\\np_bg_SR_2.npy')
np_sig_SR = np.load('..\data\processed\\np_sig_SR_2.npy')

np_sig_SR_labeled = np.copy(np_sig_SR)
np_bg_SR_labeled = np.copy(np_bg_SR)

np_sig_SR_labeled = np.append(np_sig_SR_labeled,np.ones([len(np_sig_SR),1]),1)
np_bg_SR_labeled = np.append(np_bg_SR_labeled,np.zeros([len(np_bg_SR),1]),1)
np_combined_SR = np.concatenate((np_bg_SR, np_sig_SR), axis = 0)
np_combined_SR_labeled = np.concatenate((np_sig_SR_labeled,np_bg_SR_labeled),axis=0)

gc.collect()

31

In [5]:
np_sig_SR.shape[0]/np_bg_SR.shape[0]

0.6205640442400567

In [6]:
gen_model = tf.keras.models.load_model('..\Results\cdijetgan\saverun4\models\epoch1000-generator.h5')



In [7]:
def generate_gan(generator, realdata):


    labels = sample_fake(refdata = realdata, size = SAMPLE_SIZE) # Sample mjj from the existing distribution of mjj for comparison
    labels_scaled = scaler_mjj.transform(labels.reshape(-1,1))
    
    fakedata_uncut_unscaled = generator(tf.concat([tf.random.uniform((SAMPLE_SIZE, NOISE_DIM)), labels_scaled], 1), training=False)
    fakedata_uncut = np.concatenate((scaler.inverse_transform(fakedata_uncut_unscaled), labels.reshape(-1,1)), axis = 1)
   

    # At least one jet has pT > 1200 and |eta| < 2.5
    fakedata = cut_data(fakedata_uncut)

    # mjj = sqrt(Ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
    fakedata_mjj = mjj(fakedata)

    return fakedata
def mjj(output):
    pt1 = output[:,0]
    eta1 = output[:,1]
    m1 = output[:,2]
    pt2 = output[:,3]
    eta2 = output[:,4]
    phi2 = output[:,5]
    m2 = output[:,6]
    ejj = np.sqrt((pt1 * np.cosh(eta1))**2 + m1**2) + np.sqrt((pt2 * np.cosh(eta2))**2 + m2**2)
    pxjj = pt1 + pt2 * np.cos(phi2)
    pyjj = pt2 * np.sin(phi2)
    pzjj = pt1 * np.sinh(eta1) + pt2 * np.sinh(eta2)
    return np.sqrt(ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
def sample_fake(refdata = np_bg_SR, size = BATCH_SIZE):
    rand_idx = np.random.choice(refdata.shape[0], size = size)
    return refdata[rand_idx, -1].reshape((-1,1))
def sample_data(refdata = np_combined_SR_labeled,size= 10000):
    rand_idx = np.random.choice(refdata.shape[0], size)
    return refdata[rand_idx, :]


In [8]:
real_data = sample_data(size=100000)

In [25]:
np_bg_SB_trimmed = np.delete(np_bg_SB, [i for i in range(np_bg_SB.shape[0] % (BATCH_SIZE * 4))], axis = 0)

# Normalize inputs between -1 and 1, mjj between 0 and 1
scaler = MinMaxScaler((-1,1)).fit(np_bg_SB_trimmed[:,:-1])
scaler_mjj = MinMaxScaler((0,1)).fit(np_bg_SB_trimmed[:,-1].reshape(-1,1))
np_bg_SB_scaled = np.concatenate((scaler.transform(np_bg_SB_trimmed[:,:-1]), scaler_mjj.transform(np_bg_SB_trimmed[:,-1].reshape(-1,1))), axis = 1)

In [10]:
generated_data = generate_gan(gen_model,np_combined_SR)
generated_data2 = generate_gan(gen_model,np_combined_SR)
generated_data = np.concatenate((generated_data,generated_data2), axis = 0)
generated_data_labeled  = np.copy(generated_data)
generated_data_labeled = np.append(generated_data_labeled,np.zeros([len(generated_data_labeled),1]),1)


In [11]:
np_bg_SR_labeled  = np.copy(np_bg_SR)
np_bg_SR_labeled  = np.append(np_bg_SR_labeled ,np.zeros([len(np_bg_SR_labeled) ,1]),1)
np_sig_SR_labeled  = np.copy(np_sig_SR)
np_sig_SR_labeled  = np.append(np_sig_SR_labeled ,np.ones([len(np_sig_SR_labeled) ,1]),1)

In [12]:
gen_data_df = pd.DataFrame(generated_data_labeled, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])
np_sig_df = pd.DataFrame(np_sig_SR_labeled, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [13]:
gen_data_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1723.593219,-0.447040,231.848995,1632.559321,-1.270426,3.183638,177.785872,0.485057,0.901190,3518.852295,0.0
1,1254.255195,0.586991,356.887180,1236.716592,-1.228783,3.079381,35.761310,0.309715,0.462774,3539.431885,0.0
2,1320.811424,0.573153,144.292481,1053.669343,-1.414499,2.826740,83.105907,0.764269,0.819507,3532.048584,0.0
3,1535.654695,0.073059,337.541149,1456.906301,1.130604,3.195737,483.767412,0.778110,0.506943,3374.443848,0.0
4,1436.127038,1.300800,138.939476,850.611133,-0.725379,3.324772,319.502633,0.484082,0.397005,3441.649170,0.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1429.223118,-0.767454,70.913082,1413.135724,0.618603,3.149499,100.440952,0.839759,0.462240,3481.821777,0.0
99996,1913.059458,-0.072881,56.876220,1757.349127,-0.582130,3.152659,185.348648,0.474615,0.296354,3638.103760,0.0
99997,1404.281612,0.645874,120.390235,1243.818380,-0.865011,3.168753,136.292011,0.794827,0.366358,3332.172607,0.0
99998,1705.343780,-0.557433,191.180824,1525.443643,0.300916,3.122352,81.460575,0.257720,0.680162,3378.238037,0.0


In [14]:
np_sig_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1914.942993,0.369530,105.035004,1583.804443,-0.185737,2.898982,461.574005,0.552809,0.121353,3662.211182,1.0
1,1684.598755,-0.523116,159.865997,1647.186768,0.110357,3.141156,514.883972,0.440781,0.299984,3586.710693,1.0
2,1789.997070,0.156652,93.665901,1569.509399,0.144243,3.235663,475.316986,0.136103,0.135523,3421.777344,1.0
3,1672.631348,-1.015185,116.327003,1568.322998,-0.350886,3.165926,561.236023,0.617014,0.294746,3536.982910,1.0
4,1431.694946,-0.700751,513.015991,1099.721313,0.945019,3.245961,108.752998,0.183145,0.456454,3481.573486,1.0
...,...,...,...,...,...,...,...,...,...,...,...
150591,1678.012939,0.827268,473.352997,1653.355347,0.978250,3.045114,111.844002,0.090573,0.308552,3409.779297,1.0
150592,1741.585083,-0.203934,96.165001,1728.791870,0.121508,3.133633,472.475006,0.202213,0.157020,3581.979492,1.0
150593,1289.501831,0.922850,115.719002,1153.867065,-0.919407,3.193555,489.053009,0.271544,0.203001,3622.836914,1.0
150594,1787.707764,0.032824,508.045013,1381.171143,0.933776,3.163839,91.104897,0.166132,0.588186,3546.809082,1.0


In [8]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      
]
def classifier_model():    
    model = tf.keras.Sequential()
    model.add(layers.Dense(64,input_dim = 4,activation ='relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[METRICS])
    return model


In [9]:
classifier = classifier_model()
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4

In [10]:

def learningCurveLoss(history):
    plt.figure(figsize=(10,8))
    plt.plot(history.history['loss'], linewidth=1)
    plt.plot(history.history['val_loss'], linewidth=1)
    plt.title('Model Loss over Epochs')
    plt.ylabel('Loss')
    #plt.ylim(0,5)
    plt.xlabel('Epoch')
    plt.legend(['training sample loss','validation sample loss'])
    plt.savefig('5_tag_learning_curve.png')
    plt.show()
    plt.close()
    #plt.savefig("Learning_Curve")
def plot_roc_curve(y_test, y_test_score):

    fpr, tpr, _ = roc_curve(y_test, y_test_score)
    auc_value = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr,label=' AUC = %.1f%%'%(auc_value*100.))
    plt.plot([0, 1], [0, 1], 'k-')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc='lower right')
    plt.show()

In [11]:
# split background in SR into testing and training. Testing is to inject signals
training_idx = np.random.randint(np_bg_SR_labeled.shape[0], size=(int)(np_bg_SR_labeled.shape[0]/2))
test_idx = np.random.randint(np_bg_SR_labeled.shape[0], size=(int)(np_bg_SR_labeled.shape[0]/2))
training, test = np_bg_SR_labeled[training_idx,:], np_bg_SR_labeled[test_idx,:]

In [12]:
training.shape

(121338, 11)

In [13]:
# other half of the bg combined with FULL signal
testing_sample = np.concatenate((np_sig_SR_labeled,test),axis=0)

In [14]:
testing_sample.shape

(271934, 11)

In [18]:
x_test = testing_sample[:,[2,6,7,8]]
sby_test = testing_sample[:,-1]
rfy_test = np.ones([len(x_test),1])

In [19]:
x_test.shape

(271934, 4)

In [20]:
ts_df = pd.DataFrame(testing_sample, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [21]:
ts_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1914.942993,0.369530,105.035004,1583.804443,-0.185737,2.898982,461.574005,0.552809,0.121353,3662.211182,1.0
1,1684.598755,-0.523116,159.865997,1647.186768,0.110357,3.141156,514.883972,0.440781,0.299984,3586.710693,1.0
2,1789.997070,0.156652,93.665901,1569.509399,0.144243,3.235663,475.316986,0.136103,0.135523,3421.777344,1.0
3,1672.631348,-1.015185,116.327003,1568.322998,-0.350886,3.165926,561.236023,0.617014,0.294746,3536.982910,1.0
4,1431.694946,-0.700751,513.015991,1099.721313,0.945019,3.245961,108.752998,0.183145,0.456454,3481.573486,1.0
...,...,...,...,...,...,...,...,...,...,...,...
271929,1449.061035,0.949814,427.515015,911.284607,-0.902628,2.916589,123.153000,0.641955,0.656351,3396.115234,0.0
271930,1710.843628,0.213326,180.125000,1626.670898,-0.250507,3.055198,402.979004,0.580612,0.308331,3480.578857,0.0
271931,1621.522949,0.225366,216.429993,1479.811768,-0.863732,3.083251,141.806000,0.607509,0.405151,3587.497314,0.0
271932,1457.715210,1.070146,100.156998,1332.523193,-0.480979,3.156274,134.921997,0.874045,0.716113,3677.205811,0.0


In [23]:
testing_sample[:,10]
unique, counts = np.unique(testing_sample[:,10], return_counts=True)
dict(zip(unique, counts))

{0.0: 121338, 1.0: 150596}

In [28]:
sb_ratio = np.logspace(-3,-0.205,5)
#sb_ratio = np.linspace(0,0.62,10)
mixedsb = []
generated_data = []
for i in sb_ratio:
    sampled_signal = np.random.choice(np_sig_SR_labeled.shape[0], (int)(i * training.shape[0]))
    combined = np.concatenate((np_sig_SR_labeled[sampled_signal,:],training), axis =0)
    gen = generate_gan(gen_model,combined)
    gen2 = generate_gan(gen_model,combined)
    gen_data = np.concatenate((gen,gen2),axis=0)
    generated_data_labeled  = np.copy(gen_data)
    generated_data_labeled = np.append(generated_data_labeled,np.zeros([len(generated_data_labeled),1]),1)
    mixedsb.append(sample_data(combined,100000))
    generated_data.append(generated_data_labeled)

In [29]:
dataset_label_df = pd.DataFrame(mixedsb[4], columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])


In [30]:
dataset_label_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1254.600098,-0.563055,99.529602,729.849121,1.913283,2.923359,53.182201,0.467114,0.497011,3576.400146,0.0
1,1593.913940,1.394608,183.125000,1449.924438,0.378416,3.172369,565.619019,0.849766,0.360669,3590.404541,0.0
2,1528.364502,-1.428399,327.631989,1381.010254,-0.299397,3.139722,151.988007,0.373334,0.741344,3414.918213,0.0
3,1517.183838,-0.613952,469.332001,1510.345581,0.529763,3.162623,94.184196,0.135307,0.283744,3598.461914,1.0
4,1642.973145,-0.073967,118.470001,1625.909058,-0.645671,3.147611,492.571014,0.487898,0.404778,3472.105225,1.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1370.128174,-1.430378,44.167599,1129.233765,0.215655,3.128694,695.244019,0.226933,0.382866,3615.487793,0.0
99996,1510.500488,0.953385,509.928986,1386.458374,-0.413353,3.195020,98.030998,0.361423,0.192808,3658.879395,1.0
99997,1465.834106,-0.492934,155.746002,1245.379150,1.097612,3.145136,81.159103,0.521240,0.436445,3611.982910,0.0
99998,1824.210571,0.745977,120.557999,1679.010986,1.091275,3.191449,672.534973,0.452797,0.240912,3670.796631,1.0


In [31]:
scaled_dataset = []
for i in range(len(mixedsb)):
    classifier_real = mixedsb[i][:,[2,6,7,8]]
    classifier_fake = generated_data[i][:,[2,6,7,8]]
    sblabel_real = mixedsb[i][:,10]
    sblabel_fake = generated_data[i][:,10]
    unscaled_data = np.concatenate((classifier_real,classifier_fake),axis=0)
    sblabel = np.concatenate((sblabel_real,sblabel_fake),axis = 0)
    scaler = StandardScaler().fit(unscaled_data)
    scaled_data = scaler.transform(unscaled_data)
    sblabel = sblabel.reshape(len(sblabel),1)
    rflabels = np.concatenate((np.ones([len(classifier_real),1]),np.zeros([len(classifier_fake),1])),axis=0)
    scaled_data = np.concatenate((scaled_data,sblabel),axis=1)
    scaled_dataset.append(np.concatenate((scaled_data,rflabels),axis=1))

In [32]:
print(generated_data[0].shape)
print(mixedsb[0].shape)
print(scaled_dataset[0].shape)
print(len(scaled_dataset))

(100000, 11)
(100000, 11)
(200000, 6)
5


In [33]:
scaled_dataset_label_df = pd.DataFrame(scaled_dataset[4], columns = ['m1','m2','tau21j1','tau21j2','sblabel','rflabel'])

In [34]:
scaled_dataset_label_df

Unnamed: 0,m1,m2,tau21j1,tau21j2,sblabel,rflabel
0,-1.246334,-1.368990,-0.409667,1.015285,0.0,1.0
1,-1.046928,-0.322792,0.895907,0.507934,0.0,1.0
2,-0.702225,-1.167267,-0.729633,1.924482,0.0,1.0
3,-0.364218,-1.285280,-1.541762,0.221688,1.0,1.0
4,-1.201154,-0.471928,-0.338753,0.672072,1.0,1.0
...,...,...,...,...,...,...
199995,1.232146,0.966919,1.227899,-0.828604,0.0,0.0
199996,1.318266,0.967124,1.189557,-0.828604,0.0,0.0
199997,0.771070,0.966813,0.829758,-0.828604,0.0,0.0
199998,1.221459,0.967287,1.351200,-0.828604,0.0,0.0


In [35]:
ypred4c = []
fp_4c, tp_4c,th_4c= [],[],[]
auc_list_4c = []
for i in range(len(mixedsb)): 
    x_train,x_val,y_train,y_val = train_test_split(scaled_dataset[i][:,0:4],scaled_dataset[i][:,-1], test_size = 0.2, random_state=42)
    _,_,sby_train,sby_val = train_test_split(scaled_dataset[i][:,0:4],scaled_dataset[i][:,-2], test_size = 0.2, random_state=42)
    classifier = classifier_model()
    history = classifier.fit(x_train, y_train, epochs=10, batch_size=128,validation_data=(x_val,y_val))
    learningCurveLoss(history)
    y_pred = classifier.predict(x_test)
    ypred4c.append(y_pred)
    fpr, tpr, thresholds = roc_curve(sby_test, y_pred)
    auc_value = auc(fpr,tpr)
    auc_list_4c.append(auc_value)
    fp_4c.append(fpr)
    tp_4c.append(tpr)
    th_4c.append(thresholds)
    

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10

KeyboardInterrupt: 