In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import gc
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
NOISE_DIM = 128 # 64 in Gitlab
TESTING= False
BATCH_SIZE = 64
SAMPLE_SIZE= 50000
BINS = 25

In [3]:
filenames = {
    "herwig": "../GAN-data/events_anomalydetection_DelphesHerwig_qcd_features.h5",
    "pythiabg": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_qcd_features.h5",
    "pythiasig": "../GAN-data/events_anomalydetection_DelphesPythia8_v2_Wprime_features.h5"
}

datatypes = ["herwig", "pythiabg", "pythiasig"]

train_features = ["ptj1", "etaj1", "mj1", "ptj2", "etaj2", "phij2", "mj2", "tau21j1", "tau21j2"]
condition_features = ["mjj"]

features = train_features + condition_features
GEN_DIM = NOISE_DIM + len(condition_features)
DISC_DIM = len(features)

In [4]:
def cut_data(uncut_data, pTmin = 1200, etamax = 2.5):
    # Column 0: ptj1
    # Column 1: etaj1
    # Column 3: ptj2
    # Column 4: etaj2
    return uncut_data[((uncut_data[:,0] > pTmin) & (np.abs(uncut_data[:,1]) < etamax)) | ((uncut_data[:,3] > pTmin) & (np.abs(uncut_data[:,4]) < etamax))]

np_bg_SB = np.load('./data/processed/np_bg_SB_2.npy')
np_bg_SR = np.load('./data/processed/np_bg_SR_2.npy')
np_sig_SR = np.load('./data/processed/np_sig_SR_2.npy')

np_sig_SR_labeled = np.copy(np_sig_SR)
np_bg_SR_labeled = np.copy(np_bg_SR)
#add sb label
np_sig_SR_labeled = np.append(np_sig_SR_labeled,np.ones([len(np_sig_SR),1]),1)
np_bg_SR_labeled = np.append(np_bg_SR_labeled,np.zeros([len(np_bg_SR),1]),1)
np_combined_SR = np.concatenate((np_bg_SR, np_sig_SR), axis = 0)
np_combined_SR_labeled = np.concatenate((np_sig_SR_labeled,np_bg_SR_labeled),axis=0)

gc.collect()

9

In [5]:
gen_model = tf.keras.models.load_model('./Results/epoch300-generator.h5')



In [6]:
def generate_gan(generator, realdata):


    labels = sample_fake(refdata = realdata, size = SAMPLE_SIZE) # Sample mjj from the existing distribution of mjj for comparison
    labels_scaled = scaler_mjj.transform(labels.reshape(-1,1))
    fakedata_uncut_unscaled = generator(tf.concat([tf.random.uniform((SAMPLE_SIZE, NOISE_DIM)), labels_scaled], 1), training=False)
    fakedata_uncut = np.concatenate((scaler.inverse_transform(fakedata_uncut_unscaled), labels.reshape(-1,1)), axis = 1)

    # At least one jet has pT > 1200 and |eta| < 2.5
    fakedata = cut_data(fakedata_uncut)
    # mjj = sqrt(Ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
    fakedata_mjj = mjj(fakedata)
    return fakedata
def mjj(output):
    pt1 = output[:,0]
    eta1 = output[:,1]
    m1 = output[:,2]
    pt2 = output[:,3]
    eta2 = output[:,4]
    phi2 = output[:,5]
    m2 = output[:,6]
    ejj = np.sqrt((pt1 * np.cosh(eta1))**2 + m1**2) + np.sqrt((pt2 * np.cosh(eta2))**2 + m2**2)
    pxjj = pt1 + pt2 * np.cos(phi2)
    pyjj = pt2 * np.sin(phi2)
    pzjj = pt1 * np.sinh(eta1) + pt2 * np.sinh(eta2)
    return np.sqrt(ejj**2 - pxjj**2 - pyjj**2 - pzjj**2)
def sample_fake(refdata = np_combined_SR, size = BATCH_SIZE):
    rand_idx = np.random.choice(refdata.shape[0], size = size)
    return refdata[rand_idx, -1].reshape((-1,1))
def sample_data(refdata = np_combined_SR_labeled,size= 10000):
    rand_idx = np.random.choice(refdata.shape[0], size)
    return refdata[rand_idx, :]


In [15]:
np_bg_SB_trimmed = np.delete(np_bg_SB, [i for i in range(np_bg_SB.shape[0] % (BATCH_SIZE * 4))], axis = 0)

# Normalize inputs between -1 and 1, mjj between 0 and 1
scaler = MinMaxScaler((-1,1)).fit(np_bg_SB_trimmed[:,:-1])
scaler_mjj = MinMaxScaler((0,1)).fit(np_bg_SB_trimmed[:,-1].reshape(-1,1))
np_bg_SB_scaled = np.concatenate((scaler.transform(np_bg_SB_trimmed[:,:-1]), scaler_mjj.transform(np_bg_SB_trimmed[:,-1].reshape(-1,1))), axis = 1)

In [16]:
#add signal or background labels to "real" data
np_bg_SR_labeled  = np.copy(np_bg_SR)
np_bg_SR_labeled  = np.append(np_bg_SR_labeled ,np.zeros([len(np_bg_SR_labeled) ,1]),1)
np_sig_SR_labeled  = np.copy(np_sig_SR)
np_sig_SR_labeled  = np.append(np_sig_SR_labeled ,np.ones([len(np_sig_SR_labeled) ,1]),1)

In [17]:
np_bg_SR_labeled.shape

(242676, 11)

In [18]:
# split background in SR into testing and training. Testing is to inject signals
train_bg,test_bg,train_sb,test_sb =train_test_split(np_bg_SR_labeled[:,0:10],np.zeros([len(np_bg_SR_labeled) ,1]), test_size = 0.5, random_state=42)
sb = train_sb.reshape(len(train_sb),1)
training_dataset =  np.concatenate((train_bg,sb),axis=1)
test_dataset = np.concatenate((test_bg,sb),axis=1)
testing_sample = np.concatenate((np_sig_SR_labeled,test_dataset),axis=0)

In [19]:
train_bg_df = pd.DataFrame(training_dataset, columns =  ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [20]:
train_bg_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1478.963989,-0.087060,587.213013,1315.960449,-1.179599,3.145958,88.580399,0.117466,0.656521,3351.266357,0.0
1,1244.853760,-1.453661,109.531998,1224.353027,0.290794,3.111021,164.820007,0.622247,0.661712,3484.205566,0.0
2,1672.719604,0.379480,115.539001,1450.374268,-0.323801,3.158839,336.877014,0.647902,0.250641,3350.227051,0.0
3,1919.398315,0.021479,780.789978,1440.344116,0.595646,3.079967,113.733002,0.483610,0.921526,3625.344971,0.0
4,1671.050781,-0.474471,575.271973,1666.301025,-0.009151,3.078633,189.287994,0.205169,0.547426,3526.838135,0.0
...,...,...,...,...,...,...,...,...,...,...,...
121333,1567.139160,-0.666979,286.657990,1197.136475,0.685944,3.121261,97.951202,0.600849,0.713911,3414.934814,0.0
121334,1260.581299,1.373300,215.798996,1235.346558,-0.144256,3.096345,724.150024,0.317927,0.403231,3490.529053,0.0
121335,1325.088623,0.552296,139.050995,1275.960571,-1.087261,3.141881,38.967701,0.642636,0.608677,3531.351074,0.0
121336,1885.110962,-0.631907,92.702301,1640.640259,-0.542369,3.163306,85.492699,0.402115,0.427621,3525.103027,0.0


In [21]:
#inject signal into train dataset
sb_ratio = np.logspace(-3,-0.205,5)
#sb_ratio = np.linspace(0,0.62,10)
mixedsb = []
generated_data = []

for i in sb_ratio:
    sampled_signal = np.random.choice(np_sig_SR_labeled.shape[0], (int)(i * training_dataset.shape[0]))
    combined = np.concatenate((np_sig_SR_labeled[sampled_signal,:],training_dataset), axis =0) #inject signal (have sb label)
    
    #generate fake data
    gen = generate_gan(gen_model,combined) #50k events
    gen2 = generate_gan(gen_model,combined) #50k events
    gen_data = np.concatenate((gen,gen2),axis=0) #100k events
    #print(gen_data.shape)
    generated_data_labeled  = np.copy(gen_data)
    generated_data_labeled = np.append(generated_data_labeled,np.zeros([len(generated_data_labeled),1]),1) #create fake bg data with sb label
    generated_data.append(generated_data_labeled)
    
    mixedsb.append(sample_data(combined,100000)) #sample 100,000 events from injected signal+ bg


In [22]:
mixed_df = pd.DataFrame(mixedsb[4], columns =  ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [23]:
mixed_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1270.833252,-0.259898,406.126007,1221.439941,1.246697,3.089530,172.764008,0.342629,0.642710,3307.843262,0.0
1,1864.143799,-0.081016,114.033997,1515.708984,0.371670,3.024424,538.432007,0.325389,0.479810,3534.490967,1.0
2,1264.771118,-0.989593,91.161301,966.119080,1.120650,3.191941,51.256901,0.725381,0.664113,3562.451904,0.0
3,1245.471924,-0.089172,207.184006,1112.860107,1.645016,3.192005,299.756012,0.431744,0.258235,3337.204346,0.0
4,1203.139526,-1.140966,100.759003,1155.042114,0.648668,3.139835,494.944000,0.065871,0.191099,3456.160400,1.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,1220.059937,0.652252,74.109703,1006.534485,-1.397408,3.151838,48.737301,0.316178,0.773008,3488.199707,0.0
99996,1619.277954,0.240322,394.984009,1519.014526,-0.583430,3.143633,561.000000,0.167473,0.149722,3540.457031,0.0
99997,1359.847046,-0.423438,92.339996,1308.480103,1.077055,3.169245,498.334991,0.153354,0.523415,3517.327148,1.0
99998,1671.450195,0.170745,153.380997,1530.712891,-0.532533,3.131903,489.957001,0.309327,0.135606,3474.035889,1.0


In [24]:
unique, counts = np.unique(mixedsb[4][:,-1], return_counts=True)
dict(zip(unique, counts))

{0.0: 61430, 1.0: 38570}

In [25]:
fixed_gen = []
for i in generated_data:
    df =  pd.DataFrame(i, columns =  ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])
    del df['mjj']
    fixed_gen.append(df.to_numpy())

In [26]:
generated_df = pd.DataFrame(fixed_gen[4], columns =  ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','sb'])

In [27]:
generated_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,sb
0,1223.234089,0.567763,1127.937392,1712.499205,0.154751,4.017511,1129.350784,0.057110,0.524470,0.0
1,1207.028711,-0.168744,1287.121173,1548.733451,-0.499819,3.706205,920.890624,0.015403,0.928354,0.0
2,1229.307171,0.292340,1279.623253,1241.027127,-0.366420,3.747677,1086.636512,0.015309,0.803112,0.0
3,1243.130886,-0.130785,1246.979655,1281.025420,-0.753219,3.285466,279.115009,0.015389,0.951631,0.0
4,1209.443661,0.064011,1295.455502,1539.698562,-0.212279,3.567239,1183.796315,0.015226,0.272709,0.0
...,...,...,...,...,...,...,...,...,...,...
99995,1217.719398,0.752869,1174.963976,1575.853261,0.549349,3.716582,1116.088686,0.032617,0.186601,0.0
99996,1231.918028,0.248126,1242.376857,1466.951602,-0.191744,3.811757,843.401762,0.015940,0.972976,0.0
99997,1253.011127,-0.194880,1266.734836,1458.839275,-0.917994,3.805858,1048.810629,0.015493,0.212394,0.0
99998,1241.607203,0.046924,1269.092743,1426.517714,-0.214307,3.846486,698.123494,0.016332,0.725097,0.0


In [28]:
unique, counts = np.unique(generated_data[4][:,-1], return_counts=True)
dict(zip(unique, counts))

{0.0: 100000}

In [29]:
test_df = pd.DataFrame(testing_sample, columns =  ['pt1','eta1','m1','pt2','eta2','phi2','m2','tau21j1','tau21j2','mjj','sblabel'])

In [30]:
test_df

Unnamed: 0,pt1,eta1,m1,pt2,eta2,phi2,m2,tau21j1,tau21j2,mjj,sblabel
0,1914.942993,0.369530,105.035004,1583.804443,-0.185737,2.898982,461.574005,0.552809,0.121353,3662.211182,1.0
1,1684.598755,-0.523116,159.865997,1647.186768,0.110357,3.141156,514.883972,0.440781,0.299984,3586.710693,1.0
2,1789.997070,0.156652,93.665901,1569.509399,0.144243,3.235663,475.316986,0.136103,0.135523,3421.777344,1.0
3,1672.631348,-1.015185,116.327003,1568.322998,-0.350886,3.165926,561.236023,0.617014,0.294746,3536.982910,1.0
4,1431.694946,-0.700751,513.015991,1099.721313,0.945019,3.245961,108.752998,0.183145,0.456454,3481.573486,1.0
...,...,...,...,...,...,...,...,...,...,...,...
271929,1227.959229,1.449520,58.746700,1065.344727,-0.411850,2.799537,125.490997,0.600681,0.619995,3337.869141,0.0
271930,1447.491943,0.133693,382.397003,1304.911377,1.394691,3.029165,334.260010,0.637250,0.452088,3398.224365,0.0
271931,1283.751709,-0.036741,349.522003,1119.477783,1.684605,3.130704,56.165401,0.568723,0.568059,3403.787109,0.0
271932,1259.622437,-0.325118,221.408997,1240.384521,1.267860,3.103868,94.780098,0.469168,0.667305,3357.853516,0.0


In [31]:
unique, counts = np.unique(testing_sample[:,10], return_counts=True)
dict(zip(unique, counts))

{0.0: 121338, 1.0: 150596}

In [44]:
classifier_real = mixedsb[0][:,[2,6,7,8]]
sblabel_real = mixedsb[0][:,-1]

classifier_fake = generated_data[4][:,[2,6,7,8]]
sblabel_fake = generated_data[4][:,-1]

unscaled_data = np.concatenate((classifier_real,classifier_fake),axis=0)

scaler = StandardScaler().fit(unscaled_data)
scaled_data = scaler.transform(unscaled_data)


sblabel = np.concatenate((sblabel_real,sblabel_fake),axis = 0)
sblabel = sblabel.reshape(len(sblabel),1)
rflabels = np.concatenate((np.ones([len(classifier_real),1]),np.zeros([len(classifier_fake),1])),axis=0)

scaled_data = np.concatenate((scaled_data,sblabel),axis=1) #add sb label to scaled data (mixed)
scaled_data = np.concatenate((scaled_data,rflabels),axis=1) #add rf label to scaled data (mixed)


scaled_sb = testing_sample[:,-1].reshape(len(testing_sample),1) #sb label for test
scaled_test = scaler.transform(testing_sample[:,[2,6,7,8]]) #scale testing data
scaled_test_data = np.concatenate((scaled_test,scaled_sb),axis=1) #add sb label to testing data




In [45]:
scaled_df = pd.DataFrame(scaled_data, columns = ['m1','m2','tau21j1','tau21j2','sb','rf'])

In [46]:
scaled_df

Unnamed: 0,m1,m2,tau21j1,tau21j2,sb,rf
0,-1.137928,-1.199470,1.438068,0.020828,0.0,1.0
1,-1.291569,-1.138336,1.738705,-0.482054,0.0,1.0
2,-0.195136,-0.740001,-0.498754,0.349394,0.0,1.0
3,-0.858205,-0.725160,0.083600,-1.418093,0.0,1.0
4,-0.615885,-0.605930,0.910652,0.239888,0.0,1.0
...,...,...,...,...,...,...
199995,0.830649,1.322088,-0.836263,-1.274495,0.0,0.0
199996,0.958158,0.650624,-0.894240,1.632262,0.0,0.0
199997,1.004230,1.156423,-0.895794,-1.179152,0.0,0.0
199998,1.008690,0.292891,-0.892876,0.716002,0.0,0.0


In [34]:
### unscaled 
combined_dat = np.concatenate((classifier_real,classifier_fake),axis=0)
data = np.concatenate((combined_dat,sblabel),axis=1)
data = np.concatenate((data,rflabels),axis=1)
test_sb = testing_sample[:,-1].reshape(len(testing_sample),1)
test_data = testing_sample[:,[2,3,7,8]]
test_data = np.concatenate((test_data,test_sb),axis=1)

In [39]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      
]
def classifier_model():    
    model = tf.keras.Sequential()
    model.add(layers.Dense(64,input_dim = 4,activation ='relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(64,activation = 'relu'))
    model.add(layers.Dropout(0.1))
    model.add(layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[METRICS])
    return model

def learningCurveLoss(history):
    plt.figure(figsize=(10,8))
    plt.plot(history.history['loss'], linewidth=1)
    plt.plot(history.history['val_loss'], linewidth=1)
    plt.title('Model Loss over Epochs')
    plt.ylabel('Loss')
    #plt.ylim(0,5)
    plt.xlabel('Epoch')
    plt.legend(['training sample loss','validatio}n sample loss'])
    plt.savefig('5_tag_learning_curve.png')
    plt.show()
    plt.close()
    #plt.savefig("Learning_Curve")
def plot_roc_curve(y_test, y_test_score):

    fpr, tpr, _ = roc_curve(y_test, y_test_score)
    auc_value = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr,label=' AUC = %.1f%%'%(auc_value*100.))
    plt.plot([0, 1], [0, 1], 'k-')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc='lower right')
    plt.show()


In [47]:
classifier = classifier_model()
classifier.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 64)                320       
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_14 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 64)               

In [None]:
x_train,x_test,y_train,y_test = train_test_split(unscaled_data ,rflabels, test_size = 0.2, random_state=32) #rf label
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=32) # 0.25 x 0.8 = 0.2, #rflabel


sbx_test,sby_test = scaled_test_data[:,0:4],scaled_test_data[:,-1] #SB label
classifier = classifier_model()
history = classifier.fit(x_train, y_train, epochs=10, batch_size=64,validation_data=(x_val,y_val))
#learningCurveLoss(history)
#y_pred = classifier.predict(sbx_test) #predict on signal vs background in test sample
#fpr, tpr, thresholds = roc_curve(sby_test, y_pred)
#auc_value = auc(fpr,tpr)


Train on 120000 samples, validate on 40000 samples
Epoch 1/10
Epoch 2/10
 16576/120000 [===>..........................] - ETA: 4s - loss: 0.0509 - tp: 8293.0000 - fp: 104.0000 - tn: 8023.0000 - fn: 156.0000 - accuracy: 0.9843 - precision: 0.9876 - recall: 0.9815 - auc: 0.9986

In [57]:
(generated_data[4][:,-1]).shape

(100000,)