In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.metrics import roc_curve, roc_auc_score
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import Input
from keras.optimizers import Adam, AdamW
# Fix TensorFlow's global random seed so TF-side randomness starts from the same state on every run.
# NOTE(review): only TensorFlow is seeded here; no NumPy / Python `random` seed is set in this cell.
tf.random.set_seed(2)

import pyhf

In [None]:
# Folder that receives the figures and the results file written by the cells below.
OUT_DIR = "./MachineLearning/"
os.makedirs(name=OUT_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [None]:
# Event table used for training; later cells read its 'label' and 'scaleweight' columns
# plus the columns listed in train_features.
dataframe = pd.read_csv('dataframe.csv')

In [None]:
# Build per-class training weights: inside each class the 'scaleweight' values are
# divided by their class total, so signal and background each sum to 1 and carry
# equal total weight in the loss.
# label == 1 is signal and label == 0 is background (same convention as the
# 'sig' / 'bkg' histograms further down).
is_signal = dataframe['label'] == 1
is_background = dataframe['label'] == 0

# Rows that belong to neither class keep the neutral default weight of 1.
dataframe['train_weight'] = 1.
for class_mask in (is_signal, is_background):
    class_scaleweight = dataframe.loc[class_mask, 'scaleweight']
    dataframe.loc[class_mask, 'train_weight'] = class_scaleweight / class_scaleweight.sum()

# Sanity check: both sums should print as 1.0.
sum_w_sig = dataframe.loc[is_signal, 'train_weight'].sum()
sum_w_bkg = dataframe.loc[is_background, 'train_weight'].sum()
print(f'Sum of weights for training Signal {sum_w_sig:.3} and Background {sum_w_bkg:.3}')

In [None]:
# Columns fed to the network as input features, in the order the scaler and model see them.
train_features = [
    # missing transverse energy / transverse mass
    'etmiss', 'mtw',
    # leading lepton
    'leadleptPt', 'leadleptEta', 'leadleptE', 'leadleptPhi', 'Q_leadlep', 'leadleptID',
    # top-tagged large-R jets
    'n_TopLRjets',
    'leadTopLRjet_pt', 'leadTopLRjet_eta', 'leadTopLRjet_phi',
    'leadTopLRjet_m', 'leadTopLRjet_Tau32',
    # jets and b-jets
    'n_jets', 'leadjet_pt', 'leadjet_eta',
    'n_bjets', 'leadbjet_pt', 'leadbjet_eta',
    'ttbarMLR',
]

# Weight columns carried through the split: column 0 = scaleweight, column 1 = train_weight.
weights = [
    "scaleweight",
    "train_weight",
]

In [None]:
# Step 1: keep 1/3 of the events for training; the remaining 2/3 land temporarily
# in the *_val variables.
x_train, x_val, y_train, y_val, w_train_full, w_val_full = train_test_split(
    dataframe[train_features].values,
    dataframe['label'].values,
    dataframe[weights].values,
    train_size = 1/3,
    random_state = 9,
)
# Step 2: cut that remainder in half -> validation and test are each ~1/3 of all events.
x_val, x_test, y_val, y_test, w_val_full, w_test_full = train_test_split(
    x_val,
    y_val,
    w_val_full,
    test_size=1/2,
    random_state = 9,
)

# The scaler is fitted on the training split only and then applied unchanged to all
# three splits, so no validation/test statistics enter the standardisation.
scaler = StandardScaler()
scaler.fit(x_train)

x_train, x_val, x_test = (scaler.transform(split) for split in (x_train, x_val, x_test))


In [None]:
def weight_separation(w):
    """Split a two-column weight array into its individual columns.

    Parameters
    ----------
    w : 2-D array indexable as ``w[:, i]``
        Column 0 holds the scale weights, column 1 the training weights.

    Returns
    -------
    tuple
        ``(scaleweights, train)`` -- column 0 and column 1 of ``w``.
    """
    return w[:, 0], w[:, 1]

# For every split: *_scale holds the scaleweight column, the bare name holds the
# normalised training weights passed to Keras.
(w_train_scale, w_train), (w_val_scale, w_val), (w_test_scale, w_test) = (
    weight_separation(w_full) for w_full in (w_train_full, w_val_full, w_test_full)
)

In [None]:
# Fully connected binary classifier; the input width follows the number of feature columns.
model = Sequential([
    Input(shape=(x_train.shape[1],)),
    # Hidden layers
    Dense(75, activation='softsign'),
    Dropout(0.1),
    Dense(125, activation='softsign'),
    Dense(125, activation='relu'),
    Dense(75, activation='relu'),
    # Output layer: a single sigmoid unit, i.e. one score in [0, 1] per event
    Dense(1, activation='sigmoid'),
])

# weighted_metrics (rather than metrics) so the accuracy takes the sample weights into account.
model.compile(
    loss='binary_crossentropy',
    optimizer=AdamW(learning_rate=1e-3, weight_decay=1e-1),
    weighted_metrics=['accuracy'],
)
model.summary()

In [None]:
# Stop after 2 epochs without improvement of the monitored quantity (EarlyStopping's
# default, val_loss per the Keras docs) and roll the network back to the best epoch.
# Without restore_best_weights the model keeps the weights of the last epoch, i.e.
# the ones that triggered the stop rather than the best ones.
early_stopping = EarlyStopping(patience=2, restore_best_weights=True)

history = model.fit(
    x_train,
    y_train,
    sample_weight=w_train,
    validation_data=(x_val, y_val, w_val),  # validation loss is weighted as well
    epochs=100,
    batch_size=1024,
    callbacks=[early_stopping],
)

In [None]:
# One figure per tracked quantity (training vs validation curve), each saved to OUT_DIR.
for metric, axis_label in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    plt.plot(history.history[metric], label='train')
    plt.plot(history.history['val_' + metric], label='validation')
    plt.title(f'Model {metric}')
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.legend()
    plt.savefig(OUT_DIR + metric + ".png")
    plt.show()

In [None]:
# Network scores for the three splits, evaluated in the order train -> validation -> test.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features) for features in (x_train, x_val, x_test)
)

In [None]:
# Unit-area NN score distributions on the test split, background (label 0) vs
# signal (label 1). The bin edges produced by the first histogram are reused for
# the second so both share the same binning.
# The edges are kept in `score_edges` instead of `bin`, which would shadow the
# Python builtin bin() for the rest of the session.
score_edges = plt.hist(y_test_pred[y_test==0], bins=100, density=True, histtype='step', color='blue', label='bkg test')[1]
plt.hist(y_test_pred[y_test==1], bins=score_edges, density=True, histtype='step', color='red', label='sig test')

plt.xlabel('NN output')
plt.ylabel('Density')
plt.legend()
plt.savefig(OUT_DIR + "NN_test_output.png")
plt.show()

In [None]:
# Overlay the unit-area score distributions of the training and validation splits,
# separately for signal (label 1) and background (label 0), e.g. as an overtraining check.
# All four histograms share the bin edges created by the first call.
step_density = dict(density=True, histtype='step')

bins = plt.hist(y_train_pred[y_train==1], bins=100, color='blue', label='sig train', **step_density)
plt.hist(y_val_pred[y_val==1], bins=bins[1], color='red', label='sig val', **step_density)
bins = plt.hist(y_train_pred[y_train==0], bins=bins[1], color='green', label='bkg train', **step_density)
plt.hist(y_val_pred[y_val==0], bins=bins[1], color='orange', label='bkg val', **step_density)

plt.xlabel('NN output')
plt.legend()
plt.savefig(OUT_DIR + "NN_train_val_output.png")
plt.show()


In [None]:
# ROC curve and AUC on the validation split.
# No sample weights are passed, so every event counts equally here.
auc = roc_auc_score(y_val, y_val_pred)
fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)

plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.savefig(OUT_DIR + "ROC_curve.png")
plt.show()

In [None]:
def join_y_w(y, w):
    """Pair per-event predictions with their weights in a two-column DataFrame.

    Parameters
    ----------
    y : 1-D array-like
        Values stored in the ``'y_pred'`` column.
    w : 1-D array-like
        Values stored in the ``'scaleweights'`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``'y_pred'`` and ``'scaleweights'``, in that order.
    """
    return pd.DataFrame({'y_pred': y, 'scaleweights': w})

# The test split holds only a fraction of the simulated events (about 1/3 with the
# split used above), so its scale weights are multiplied by the inverse of that
# fraction to represent the full sample when compared with data.
# Deriving the factor from the actual sizes keeps it correct if the split changes.
test_scale_up = len(dataframe) / len(y_test)

# model.predict returns shape (n, 1); column 0 is the per-event score.
df_train = join_y_w(y_train_pred[:, 0], w_train_scale)
df_val = join_y_w(y_val_pred[:, 0], w_val_scale)
df_test = join_y_w(y_test_pred[:, 0], w_test_scale * test_scale_up)

In [None]:
# Display the test-split frame (NN score and scaled-up weight per event) as a quick visual check.
df_test

In [None]:
# Measured events; the file is space-separated, unlike the comma-separated training table.
data = pd.read_csv(
    'Output_ZPrimeBoostedAnalysis/data.csv',
    sep=" ",
)
data

In [None]:
# Standardise the data features with the scaler fitted on the training split, so the
# network sees data on the same scale as the simulation.
w_data = data["weight"].to_numpy()
x_data = scaler.transform(data[train_features].to_numpy())

In [None]:
# NN scores for the data events. `model` must still be the trained Keras network here.
y_data_pred = model.predict(x_data)

In [None]:
# Same two-column layout as df_test: NN score in 'y_pred', the data 'weight' column in 'scaleweights'.
df_data = join_y_w(y_data_pred[:, 0], w_data[:])

In [None]:
# Weighted NN-score spectra: data vs the scaled-up test split.
# The data histogram defines the bin edges, which are reused for the test split.
bins = plt.hist(
    df_data['y_pred'],
    bins=30,
    weights=df_data['scaleweights'],
    histtype='step',
    color='blue',
    label='data',
)
plt.hist(
    df_test['y_pred'],
    bins=bins[1],
    weights=df_test['scaleweights'],
    histtype='step',
    color='red',
    label='test',
)
plt.xlabel('NN output')
plt.ylabel('Events')
plt.legend()
plt.savefig(OUT_DIR + "NN_data_test_output.png")
plt.show()

In [None]:
# Binned inputs for the statistical fit: observed data (N), expected background (B)
# and expected signal (S) as a function of the NN score.
# The network output is a sigmoid, so the binning is fixed to [0, 1]. Taking the edges
# from the data's own min/max would silently drop simulated events scoring outside the
# range covered by data.
N, bins = np.histogram(df_data['y_pred'], bins=50, range=(0, 1), weights=df_data['scaleweights'])

# Boolean masks over the test split; df_test rows are in the same order as y_test.
is_bkg = y_test == 0
is_sig = y_test == 1

B = np.histogram(df_test.loc[is_bkg, 'y_pred'], bins=bins, weights=df_test.loc[is_bkg, 'scaleweights'])[0]
S = np.histogram(df_test.loc[is_sig, 'y_pred'], bins=bins, weights=df_test.loc[is_sig, 'scaleweights'])[0]


In [None]:
# Draw the pre-binned N, B and S spectra: each left bin edge is used as a sample and
# the bin content as its weight, which reproduces the histogram on the shared edges.
for counts, colour, tag in ((N, 'blue', 'data'), (B, 'green', 'bkg'), (S, 'red', 'sig')):
    plt.hist(bins[:-1], bins, weights=counts, histtype='step', color=colour, label=tag)

plt.xlabel('NN output')
plt.ylabel('Events')
plt.legend()
plt.savefig(OUT_DIR + "NN_NSB.png")
plt.show()

In [None]:
# Single-channel pyhf workspace: the signal template S is scaled by the free
# normalisation factor 'mu' (the parameter of interest), the background template B
# has no modifiers, and N is the observation.
model_spec = {'channels': [{'name': 'singlechannel',
              'samples': [
              {'name': 'signal','data': S.tolist(),
               'modifiers': [{'data': None, 'name': 'mu', 'type': 'normfactor'}]},
              {'name': 'bkg1','data': B.tolist(),
               'modifiers': []},
              ]
              }],
              "observations": [{ "name": "singlechannel", "data": N.tolist() }],
              "measurements": [{ "name": "Measurement", "config": {"poi": "mu", "parameters": []}}],
              "version": "1.0.0",
}

workspace = pyhf.Workspace(model_spec)
# Named stat_model (not model) so the trained Keras network stays available under
# `model` and the prediction cells above can be re-run after this one.
stat_model = workspace.model()

print("## Model")
print(f"  channels: {stat_model.config.channels}")
print(f"     nbins: {stat_model.config.channel_nbins}")
print(f"   samples: {stat_model.config.samples}")
print(f" modifiers: {stat_model.config.modifiers}")
print(f"parameters: {stat_model.config.parameters}")
print(f"par. order: {stat_model.config.par_order}")

print("\n## Model parameters")
print(f'   default: {stat_model.config.suggested_init()}')
print(f'    bounds: {stat_model.config.suggested_bounds()}')

# Observed bin counts followed by the model's auxiliary data. Named observations
# (not data) so the measured-events DataFrame loaded earlier is not overwritten.
observations = N.tolist() + stat_model.config.auxdata

# NOTE(review): these two settings are not passed to upper_limit below; they are kept
# for reference only.
test_stat = "qtilde"
test_poi = 1.

# Scan mu on a linear grid and extract the upper limit at level=0.05.
poi_values = np.linspace(0.01, 5, 500)
obs_limit, exp_limits = pyhf.infer.intervals.upper_limits.upper_limit(observations, stat_model, poi_values, level=0.05)
# exp_limits[2] is reported as the expected limit -- presumably the central (median)
# entry of the expected band; verify against the pyhf documentation.
print(f"\nObserved μ upper limit (obs): {obs_limit:.3f}, Expected μ upper limit {exp_limits[2]:.3f}")

In [None]:
# Persist the observed and expected limits next to the figures.
# The text contains the non-ASCII character 'μ', so the encoding is set explicitly;
# the platform default codec may not be able to encode it.
with open(OUT_DIR + "results.txt", "w", encoding="utf-8") as f:
    f.write(f"Observed μ upper limit (obs): {obs_limit:.3f}\n")
    f.write(f"Expected μ upper limit (exp): {exp_limits[2]:.3f}\n")