In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [5]:
NOISE_DIM = 128 # 64 in Gitlab
TESTING= False
BATCH_SIZE = 64
SAMPLE_SIZE= 50000
BINS = 25

In [6]:
filenames = {
    "herwig": "../GAN-data/events_anomalydetection_DelphesHerwig_qcd_features.h5",
    "pythiabg": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_qcd_features.h5",
    "pythiasig": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_Wprime_features.h5"
}

datatypes = ["herwig", "pythiabg", "pythiasig"]

train_features = ["ptj1", "etaj1", "mj1", "ptj2", "etaj2", "phij2", "mj2", "tau21j1", "tau21j2"]
condition_features = ["mjj"]

features = train_features + condition_features
GEN_DIM = NOISE_DIM + len(condition_features)
DISC_DIM = len(features)

In [7]:
def cut_data(uncut_data, pTmin = 1200, etamax = 2.5):
    # Column 0: ptj1
    # Column 1: etaj1
    # Column 3: ptj2
    # Column 4: etaj2
    return uncut_data[((uncut_data[:,0] > pTmin) & (np.abs(uncut_data[:,1]) < etamax)) | ((uncut_data[:,3] > pTmin) & (np.abs(uncut_data[:,4]) < etamax))]

np_bg_SB = np.load('..\data\processed\\np_bg_SB_2.npy')
np_bg_SR = np.load('..\data\processed\\np_bg_SR_2.npy')
np_sig_SR = np.load('..\data\processed\\np_sig_SR_2.npy')

np_sig_SR_labeled = np.copy(np_sig_SR)
np_bg_SR_labeled = np.copy(np_bg_SR)

np_sig_SR_labeled = np.append(np_sig_SR_labeled,np.ones([len(np_sig_SR),1]),1)
np_bg_SR_labeled = np.append(np_bg_SR_labeled,np.zeros([len(np_bg_SR),1]),1)
np_combined_SR = np.concatenate((np_bg_SR, np_sig_SR), axis = 0)
np_combined_SR_labeled = np.concatenate((np_sig_SR_labeled,np_bg_SR_labeled),axis=0)

gc.collect()

31

In [8]:
np_sig_SR.shape[0]/np_bg_SR.shape[0]

0.6205640442400567

In [9]:
gen_model = tf.keras.models.load_model('..\Results\cdijetgan\saverun4\models\epoch1000-generator.h5')



In [40]:
def generate_gan(generator, realdata):


    labels = sample_fake(refdata = realdata, size = SAMPLE_SIZE) # Sample mjj from the existing distribution of mjj for comparison
    labels_scaled = scaler_mjj.transform(labels.reshape(-1,1))
    
    fakedata_uncut_unscaled = generator(tf.concat([tf.random.uniform((SAMPLE_SIZE, NOISE_DIM)), labels_scaled], 1), training=False)
    fakedata_uncut = np.concatenate((scaler.inverse_transform(fakedata_uncut_unscaled), labels.reshape(-1,1)), axis = 1)
   

    # At least one jet has pT > 1200 and |eta| < 2.5
    fakedata = cut_data(fakedata_uncut)

    # mjj = sqrt(Ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
    fakedata_mjj = mjj(fakedata)

    return fakedata
def mjj(output):
    pt1 = output[:,0]
    eta1 = output[:,1]
    m1 = output[:,2]
    pt2 = output[:,3]
    eta2 = output[:,4]
    phi2 = output[:,5]
    m2 = output[:,6]
    ejj = np.sqrt((pt1 * np.cosh(eta1))**2 + m1**2) + np.sqrt((pt2 * np.cosh(eta2))**2 + m2**2)
    pxjj = pt1 + pt2 * np.cos(phi2)
    pyjj = pt2 * np.sin(phi2)
    pzjj = pt1 * np.sinh(eta1) + pt2 * np.sinh(eta2)
    return np.sqrt(ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
def sample_fake(refdata = np_bg_SR, size = BATCH_SIZE):
    rand_idx = np.random.choice(refdata.shape[0], size = size)
    return refdata[rand_idx, -1].reshape((-1,1))
def sample_data(refdata = np_combined_SR_labeled,size= 10000):
    rand_idx = np.random.choice(refdata.shape[0], size)
    return refdata[rand_idx, :]


In [41]:
real_data = sample_data(size=100000)

In [12]:
np_bg_SB_trimmed = np.delete(np_bg_SB, [i for i in range(np_bg_SB.shape[0] % (BATCH_SIZE * 4))], axis = 0)

# Normalize inputs between -1 and 1, mjj between 0 and 1
scaler = MinMaxScaler((-1,1)).fit(np_bg_SB_trimmed[:,:-1])
scaler_mjj = MinMaxScaler((0,1)).fit(np_bg_SB_trimmed[:,-1].reshape(-1,1))
np_bg_SB_scaled = np.concatenate((scaler.transform(np_bg_SB_trimmed[:,:-1]), scaler_mjj.transform(np_bg_SB_trimmed[:,-1].reshape(-1,1))), axis = 1)

In [13]:
generated_data = generate_gan(gen_model,np_combined_SR)
generated_data2 = generate_gan(gen_model,np_combined_SR)
generated_data = np.concatenate((generated_data,generated_data2), axis = 0)
generated_data_labeled  = np.copy(generated_data)
generated_data_labeled = np.append(generated_data_labeled,np.zeros([len(generated_data_labeled),1]),1)


In [42]:
np_bg_SR_labeled  = np.copy(np_bg_SR)
np_bg_SR_labeled  = np.append(np_bg_SR_labeled ,np.zeros([len(np_bg_SR_labeled) ,1]),1)
np_sig_SR_labeled  = np.copy(np_sig_SR)
np_sig_SR_labeled  = np.append(np_sig_SR_labeled ,np.ones([len(np_sig_SR_labeled) ,1]),1)

In [43]:
gen_data_df = pd.DataFrame(generated_data_labeled, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])
np_sig_df = pd.DataFrame(np_sig_SR_labeled, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [44]:
gen_data_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1642.760750,-1.025744,106.174972,1438.201699,0.031338,3.130930,321.065707,0.388924,0.423540,3419.760010,0.0
1,1781.470024,0.376664,67.810966,1750.465074,0.329847,3.169115,299.945280,0.665493,0.575450,3396.933594,0.0
2,1366.499094,0.570329,179.026325,442.670160,-2.228212,3.771356,24.635818,0.482289,0.745350,3598.440918,0.0
3,1324.982278,0.415817,750.147466,1292.371942,-1.231357,3.107160,51.926321,0.268194,0.681407,3657.390625,0.0
4,1919.748249,-0.180005,158.286049,1568.113896,-0.548647,3.353240,107.163969,0.646919,0.843282,3378.073730,0.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1558.378601,0.368532,132.528368,1520.764497,-0.590046,3.159183,569.003156,0.763448,0.517466,3402.349365,0.0
99996,1693.498343,-0.216648,208.085725,1713.780581,-0.862785,3.087981,163.642405,0.726286,0.288618,3456.394043,0.0
99997,1770.393668,-0.238184,497.459941,1698.880692,-0.497467,3.158949,155.995063,0.855192,0.827768,3420.027100,0.0
99998,1743.861253,0.351210,465.128546,1757.856130,0.229255,3.158819,49.746296,0.373978,0.647160,3436.368652,0.0


In [45]:
np_sig_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1914.942993,0.369530,105.035004,1583.804443,-0.185737,2.898982,461.574005,0.552809,0.121353,3662.211182,1.0
1,1684.598755,-0.523116,159.865997,1647.186768,0.110357,3.141156,514.883972,0.440781,0.299984,3586.710693,1.0
2,1789.997070,0.156652,93.665901,1569.509399,0.144243,3.235663,475.316986,0.136103,0.135523,3421.777344,1.0
3,1672.631348,-1.015185,116.327003,1568.322998,-0.350886,3.165926,561.236023,0.617014,0.294746,3536.982910,1.0
4,1431.694946,-0.700751,513.015991,1099.721313,0.945019,3.245961,108.752998,0.183145,0.456454,3481.573486,1.0
...,...,...,...,...,...,...,...,...,...,...,...
150591,1678.012939,0.827268,473.352997,1653.355347,0.978250,3.045114,111.844002,0.090573,0.308552,3409.779297,1.0
150592,1741.585083,-0.203934,96.165001,1728.791870,0.121508,3.133633,472.475006,0.202213,0.157020,3581.979492,1.0
150593,1289.501831,0.922850,115.719002,1153.867065,-0.919407,3.193555,489.053009,0.271544,0.203001,3622.836914,1.0
150594,1787.707764,0.032824,508.045013,1381.171143,0.933776,3.163839,91.104897,0.166132,0.588186,3546.809082,1.0


In [21]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      
]
def classifier_model():    
    model = tf.keras.Sequential()
    model.add(layers.Dense(64,input_dim = 4,activation ='relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[METRICS])
    return model


In [22]:
classifier = classifier_model()
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4

In [23]:

def learningCurveLoss(history):
    plt.figure(figsize=(10,8))
    plt.plot(history.history['loss'], linewidth=1)
    plt.plot(history.history['val_loss'], linewidth=1)
    plt.title('Model Loss over Epochs')
    plt.ylabel('Loss')
    #plt.ylim(0,5)
    plt.xlabel('Epoch')
    plt.legend(['training sample loss','validation sample loss'])
    plt.savefig('5_tag_learning_curve.png')
    plt.show()
    plt.close()
    #plt.savefig("Learning_Curve")
def plot_roc_curve(y_test, y_test_score):

    fpr, tpr, _ = roc_curve(y_test, y_test_score)
    auc_value = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr,label=' AUC = %.1f%%'%(auc_value*100.))
    plt.plot([0, 1], [0, 1], 'k-')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc='lower right')
    plt.show()

In [31]:
# split background in SR into testing and training. Testing is to inject signals
training_idx = np.random.randint(np_bg_SR_labeled.shape[0], size=(int)(np_bg_SR_labeled.shape[0]/2))
test_idx = np.random.randint(np_bg_SR_labeled.shape[0], size=(int)(np_bg_SR_labeled.shape[0]/2))
training, test = np_bg_SR_labeled[training_idx,:], np_bg_SR_labeled[test_idx,:]

In [46]:
training.shape

(121338, 11)

In [35]:
# other half of the bg combined with FULL signal
testing_sample = np.concatenate((np_sig_SR_labeled,test),axis=0)

In [47]:
testing_sample.shape

(271934, 11)

In [36]:
ts_df = pd.DataFrame(testing_sample, columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [37]:
ts_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1914.942993,0.369530,105.035004,1583.804443,-0.185737,2.898982,461.574005,0.552809,0.121353,3662.211182,1.0
1,1684.598755,-0.523116,159.865997,1647.186768,0.110357,3.141156,514.883972,0.440781,0.299984,3586.710693,1.0
2,1789.997070,0.156652,93.665901,1569.509399,0.144243,3.235663,475.316986,0.136103,0.135523,3421.777344,1.0
3,1672.631348,-1.015185,116.327003,1568.322998,-0.350886,3.165926,561.236023,0.617014,0.294746,3536.982910,1.0
4,1431.694946,-0.700751,513.015991,1099.721313,0.945019,3.245961,108.752998,0.183145,0.456454,3481.573486,1.0
...,...,...,...,...,...,...,...,...,...,...,...
271929,1359.633911,0.745962,200.356003,1108.893433,-0.877124,3.267900,142.541000,0.356389,0.610910,3324.132080,0.0
271930,1381.264038,1.317877,47.827900,1333.037720,-0.325582,3.098527,59.168598,0.706115,0.183702,3684.278320,0.0
271931,1655.104614,-0.269907,318.651001,1318.416382,1.028864,2.741848,365.652008,0.532545,0.607478,3616.794189,0.0
271932,1302.850708,-0.669769,133.298996,773.597778,1.623327,3.462919,242.063995,0.673634,0.603720,3484.684570,0.0


In [39]:
testing_sample[:,10]
unique, counts = np.unique(testing_sample[:,10], return_counts=True)
dict(zip(unique, counts))

{0.0: 121338, 1.0: 150596}

In [49]:
sb_ratio = np.logspace(-3,-0.205,5)
#sb_ratio = np.linspace(0,0.62,10)
mixedsb = []
generated_data = []
for i in sb_ratio:
    sampled_signal = np.random.choice(np_sig_SR.shape[0], (int)(i * training.shape[0]))
    combined = np.concatenate((np_sig_SR_labeled[sampled_signal,:],training), axis =0)
    gen = generate_gan(gen_model,combined)
    gen2 = generate_gan(gen_model,combined)
    gen_data = np.concatenate((gen,gen2),axis=0)
    generated_data_labeled  = np.copy(gen_data)
    generated_data_labeled = np.append(generated_data_labeled,np.zeros([len(generated_data_labeled),1]),1)
    mixedsb.append(sample_data(combined,100000))
    generated_data.append(generated_data_labeled)

In [50]:
dataset_label_df = pd.DataFrame(mixedsb[4], columns = ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])


In [51]:
dataset_label_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1540.473389,0.223300,103.888000,1491.122803,1.292159,3.136605,512.163025,0.320008,0.099555,3536.390869,1.0
1,1440.791992,-0.528059,284.596008,1422.056396,0.765371,3.124997,467.510010,0.481643,0.499015,3565.357910,0.0
2,1298.666016,-0.987181,437.763000,1142.459106,0.660296,3.133340,42.643600,0.466969,0.728824,3359.856934,0.0
3,1208.166504,-1.298273,60.212898,1172.483887,0.589808,3.139624,93.815804,0.617938,0.910200,3526.458008,0.0
4,1679.516602,0.333428,96.210403,1600.089722,-0.291809,3.105348,128.330002,0.049997,0.580430,3447.192139,1.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,2026.749268,-0.511651,236.503998,1019.270691,-1.821643,3.544259,35.225899,0.384620,0.741260,3485.869385,0.0
99996,1778.166504,0.485632,523.215027,1652.175171,0.783328,3.183554,108.714996,0.401237,0.527075,3549.722656,1.0
99997,1728.041626,0.538613,136.613007,1676.928589,-0.003937,3.138898,540.773010,0.243464,0.169897,3623.607666,1.0
99998,1732.292236,0.170773,133.610992,1573.427490,0.763055,3.056173,531.919983,0.490440,0.164551,3525.276123,1.0


In [47]:
scaled_dataset = []
for i in range(len(mixedsb)):
    classifier_real = mixedsb[i][:,[2,6,7,8]]
    classifier_fake = generated_data[i][:,[2,6,7,8]]
    sblabel_real = mixedsb_labeled[i][:,10]
    sblabel_fake = generateddata_labeled[i][:,10]
    unscaled_data = np.concatenate((classifier_real,classifier_fake),axis=0)
    sblabel = np.concatenate((sblabel_real,sblabel_fake),axis = 0)
    scaler = StandardScaler().fit(unscaled_data)
    scaled_data = scaler.transform(unscaled_data)
    labels = np.concatenate((np.ones([len(classifier_real),1]),np.zeros([len(classifier_fake),1])),axis=0)
    scaled_dataset.append(np.concatenate((scaled_data,labels),axis=1))

NameError: name 'mixedsb_labeled' is not defined

In [None]:
print(generated_data[0].shape)
print(mixedsb[0].shape)
print(scaled_dataset[i].shape)
print(len(scaled_dataset))

In [None]:
ypred4c = []
fp_4c, tp_4c,th_4c= [],[],[]
auc_list_4c = []
for i in range(len(mixedsb)): 
    classifier_real = mixedsb[i][:,[2,6,7,8]]
    classifier_fake = generated_data[i][:,[2,6,7,8]]
    
    real = np.append(classifier_real,np.ones([len(classifier_real),1]),1)
    fake = np.append(classifier_fake,np.zeros([len(classifier_fake),1]),1)
    dataset = np.concatenate((real,fake),axis=0)
    x_train,x_test,y_train,y_test = train_test_split(dataset[:,0:4],dataset[:,-1], test_size = 0.2, random_state=42)
    x_train,x_val,y_train,y_val= train_test_split(dataset[:,0:4],dataset[:,-1], test_size = 0.2, random_state=42)
    classifier = classifier_model_4features()
    history = classifier.fit(x_train, y_train, epochs=20, batch_size=128,validation_data=(x_val,y_val))
    learningCurveLoss(history)
    y_pred = classifier.predict(x_test)
    ypred4c.append(y_pred)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc_value = auc(fpr,tpr)
    auc_list_4c.append(auc_value)
    fp_4c.append(fpr)
    tp_4c.append(tpr)
    th_4c.append(thresholds)
    