In [1]:
import sys
import os
import pathlib
from datetime import datetime
from time import time

import GPUtil
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as tfk
from tensorflow.python.client import device_lib

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.offline import init_notebook_mode

init_notebook_mode(connected = True)
pio.templates.default = "plotly_dark"

sys.path.append("../")

from pae.models.autoencoder import DenseAutoencoder
from pae.models.flows import MAF
from pae.models.nn import PaeBuilder

from pae.utils import load_json
from pae.loaders.LHCO import ScalarLoaderLHCO, DatasetBuilder

from pae.plotting import feature_plots, loss_plot, \
                        latent_space_plot

# Turn on memory growth for every GPU TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be the same on all GPUs.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be configured before the GPUs are initialized.
        print(err)


# Report the library versions and the second field (the device type) of
# every physical device entry.
devices = tf.config.list_physical_devices()
print("tensorflow", tf.__version__)
print("tensorflow-probability", tfp.__version__)
print("Available devices:", *(dev[1] for dev in devices))

# SEED = 100
# np.random.seed(SEED) 
# tf.random.set_seed(SEED)



2 Physical GPUs, 2 Logical GPUs
tensorflow 2.6.0
tensorflow-probability 0.14.0
Available devices: CPU GPU GPU


2021-10-17 18:15:36.884936: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-17 18:15:38.133460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 29386 MB memory:  -> device: 0, name: NVIDIA Tesla V100S-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2021-10-17 18:15:38.134999: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30538 MB memory:  -> device: 1, name: NVIDIA Tesla V100S-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability: 7.0


Attempting (1/1) to locate available GPU.
GPU [0] located!


list

In [2]:

# One log directory per run, named after the (second-resolution) start time.
timestamp = datetime.fromtimestamp(int(time())).isoformat()
run_dir = pathlib.Path(f"./logs/{timestamp}")
# parents=True: os.mkdir would fail whenever ./logs itself does not exist yet.
# exist_ok=True: re-running the cell within the same second is harmless.
run_dir.mkdir(parents=True, exist_ok=True)

# `tf.flags` is not available in TensorFlow 2.x and `tensorboard.main.main()`
# is the blocking command-line entry point, so the server is started through
# the programmatic API instead; `launch()` returns the URL immediately and the
# server keeps running in the background.
from tensorboard import program

tensorboard_server = program.TensorBoard()
tensorboard_server.configure(argv=[None, "--logdir", str(run_dir)])
tensorboard_url = tensorboard_server.launch()
print("TensorBoard:", tensorboard_url)

1634476448


In [None]:
# Build the train/test dataset from the two loader configurations.
x = ScalarLoaderLHCO.from_json("../pae/configs/loader/rnd_scalar_2j.json")
mjj = ScalarLoaderLHCO.from_json("../pae/configs/loader/rnd_scalar_mjj.json")
builder = DatasetBuilder(x, mjj)
# NOTE(review): fit_key='bkg' presumably selects the sample the preprocessing
# is fitted on -- confirm against DatasetBuilder.data_preparation.
builder.data_preparation(sample_sizes={'sig': 100, 'bkg': 20_000}, fit_key='bkg')
dataset = builder.make_dataset(train={'bkg': 10_000}, test={'sig': 100, 'bkg': 10_000})


# `write_html` emits an HTML document, so the files carry an .html extension
# (they were previously written as HTML under a misleading *.pdf name).
fig = feature_plots(dataset['x_train'], 'dijet')
fig.update_layout(title="Training features transformed")
fig.write_html(run_dir / "train_features.html")
fig = feature_plots(dataset['x_test'], 'dijet', color='coral')
fig.update_layout(title="Testing features transformed")
fig.write_html(run_dir / "test_features.html")

In [None]:
from pae.density import GMM, ConvKDE, KNNDensity, ExpnormFit

# Every density estimator is fitted on the same training dijet-mass sample.
fit_data = dataset["mjj_train"]

# Gaussian mixture model.
gmm_options = dict(n_components=200, covariance_type='full', max_iter=1_000, n_init=5)
gmm = GMM(**gmm_options)
gmm.fit(fit_data)

# Convolution-based kernel density estimate.
kde_options = dict(bw="silverman", kernel="box")
fftkde = ConvKDE(**kde_options)
fftkde.fit(fit_data)

# k-nearest-neighbour density estimate (constructor defaults).
knn = KNNDensity()
knn.fit(fit_data)

# Parametric fit (constructor defaults).
expn = ExpnormFit()
expn.fit(fit_data)

In [None]:
# Evaluate each fitted estimator on one common reference grid.
x_ref = np.linspace(1600, 8000, 1701)
y_gmm = gmm.evaluate(x_ref)
y_kde = fftkde.evaluate(x_ref)
y_knn = knn.evaluate(x_ref)
y_exp = expn.evaluate(x_ref)

# (curve values, legend name, line colour, dash style) for each estimator.
density_curves = (
    (y_gmm, 'GMM', 'greenyellow', 'dot'),
    (y_kde, 'FFTKDE', 'indianred', 'dash'),
    (y_knn, 'KNN', 'turquoise', 'longdashdot'),
    (y_exp, 'expnorm', 'indigo', 'solid'),
)

fig = go.Figure()
for curve_values, curve_name, curve_color, curve_dash in density_curves:
    fig.add_trace(
        go.Scatter(
            x=x_ref, y=curve_values, mode='lines', name=curve_name,
            line={'color': curve_color, 'width': 2, 'dash': curve_dash},
        )
    )
# Normalised histogram of the training masses, drawn alongside the fitted curves.
fig.add_trace(
    go.Histogram(
        x=dataset["mjj_train"].ravel(), nbinsx=600, histnorm='probability density',
        marker_color='steelblue', name='Histnorm',
    )
)
fig.update_layout(
    title_text='Dijet mass distribution and density estimation',
    xaxis_title_text=r'$$m_{jj}$$',
    yaxis_title_text=r'density',
)
fig.write_image(run_dir / "density_fit.pdf")

In [None]:
# Flattened training dijet masses, used as the x coordinate of every series.
data = dataset['mjj_train'].ravel()

data_key = 'mjj_train'

# Ask each fitted estimator for its weights on the selected sample.
w_gmm = gmm.get_weights(dataset[data_key])
w_kde = fftkde.get_weights(dataset[data_key])
w_expnorm = expn.get_weights(dataset[data_key])
w_knn = knn.get_weights(dataset[data_key])

# (weights, legend name, marker colour, marker symbol) for each estimator.
weight_series = (
    (w_gmm, 'GMM', 'greenyellow', 'diamond'),
    (w_kde, 'FFTKDE', 'indianred', 'star-square'),
    (w_expnorm, 'expnorm', 'indigo', 'circle'),
    (w_knn, 'KNN', 'turquoise', 'triangle-nw-dot'),
)

fig = go.Figure()
for series_weights, series_name, series_color, series_symbol in weight_series:
    fig.add_trace(
        go.Scattergl(
            x=data, y=series_weights,
            mode='markers', name=series_name, opacity=0.8,
            marker=dict(color=series_color, symbol=series_symbol),
        )
    )
fig.update_layout(
    title_text='Weights relative to dijetmass scatter plot',
    xaxis_title_text=r'$$m_{jj}$$',
    yaxis_title_text=r'weight',
    yaxis_type="log"
)
fig.write_image(run_dir / "weights_scatter.pdf")

In [None]:
# The FFTKDE histogram fixes the 20 bin edges; the other estimators reuse them.
n_kde, b = np.histogram(data, bins=20, weights=w_kde)
n_gmm, _ = np.histogram(data, bins=b, weights=w_gmm)
n_exp, _ = np.histogram(data, bins=b, weights=w_expnorm)
n_knn, _ = np.histogram(data, bins=b, weights=w_knn)

# (weighted counts, legend name, bar colour) in drawing order.
weighted_counts = (
    (n_kde, 'FFTKDE', 'indianred'),
    (n_gmm, 'GMM', 'yellowgreen'),
    (n_exp, 'expnorm', 'indigo'),
    (n_knn, 'KNN', 'turquoise'),
)

fig = go.Figure()
for bin_counts, series_name, bar_color in weighted_counts:
    # Bars are positioned at the left edge of each bin.
    fig.add_trace(
        go.Bar(x=b[:-1], y=bin_counts, name=series_name,
               marker=dict(color=bar_color))
    )
fig.update_layout(
    title_text=r'Weighted dijet mass bins',
    xaxis_title_text=r'$$m_{jj}$$',
    yaxis_title_text=r'Counts',
    yaxis_type="log",
    bargap=0.1
)
fig.write_image(run_dir / "reweighted_mass.pdf")

In [None]:
from sklearn.model_selection import KFold

# Shuffled 8-way split of the training sample (the name `fold5` is historical).
fold5 = KFold(n_splits=8, shuffle=True)
q = fold5.split(dataset["x_train"])
# Only the first fold is consumed. `x_train` / `x_valid` are used further down
# to index into dataset["x_train"] and the weights, i.e. they hold row indices
# rather than feature rows.
x_train, x_valid = next(q)
print(x_train.shape, x_valid.shape, sep="\n")

In [None]:
# Assemble the autoencoder + normalizing-flow pair through PaeBuilder.
builder = PaeBuilder()

# Constructor arguments handed to DenseAutoencoder.
ae_config = dict(
    input_dim=47,
    encoding_dim=10,
    units=[30, 20, 15],
    weight_reg=tfk.regularizers.L1L2(l1=1e-5, l2=1e-4),
    output_activation=tf.nn.sigmoid,
)
# Constructor arguments handed to MAF; n_dims equals the encoding_dim above.
nf_config = dict(
    n_dims=10,
    n_layers=5,
    units=[32] * 4,
)
# Each model gets its own Adam optimizer with its own learning rate.
optimizer_ae = dict(learning_rate=0.001)
optimizer_nf = dict(learning_rate=0.005)

builder.make_ae_model(DenseAutoencoder, ae_config)
builder.make_ae_optimizer(tfk.optimizers.Adam, optimizer_ae)
builder.make_nf_model(MAF, nf_config)
builder.make_nf_optimizer(tfk.optimizers.Adam, optimizer_nf)
builder.compile_ae()
builder.compile_nf()
pae = builder.pae

In [None]:
# Per-event sample weights for the autoencoder stage (GMM-based weights).
weights = w_gmm

# `x_train` / `x_valid` are row indices into dataset["x_train"]: the first
# selects the training rows, the second the validation rows.
ae_train ={
    'batch_size':100,
    'epochs':120,
    'sample_weight':weights[x_train],
    # (inputs, targets, sample weights) -- inputs double as targets.
    'validation_data':(dataset["x_train"][x_valid],dataset["x_train"][x_valid], weights[x_valid]),
    'callbacks':tfk.callbacks.ReduceLROnPlateau(
        factor=0.2,
        patience=10,
        verbose=1
    ),
    "verbose":0
}

nf_train ={
    'batch_size':100,
    'epochs':80,
    'validation_data':(dataset["x_train"][x_valid],dataset["x_train"][x_valid]),
    'callbacks':tfk.callbacks.ReduceLROnPlateau(
        factor=0.2,
        patience=5,
        verbose=1
    ),
    "verbose":0
}

# Create the checkpoint directory *before* training: otherwise a missing
# directory only surfaces after all epochs have run, when the weights are
# about to be written, and the trained model is lost.
os.makedirs("./save_testing", exist_ok=True)

# Pick the least-loaded GPU that is below 50% load and 50% memory use.
device_id = GPUtil.getFirstAvailable(order = 'load', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=True)
if not device_id:
    raise RuntimeError("No GPU Available")
with tf.device(f"/device:GPU:{device_id[0]}"):
    pae.fit(dataset["x_train"][x_train],None,ae_train,nf_train)

pae.ae.save_weights("./save_testing/ae.h5")
pae.nf.save_weights("./save_testing/nf.h5")

In [None]:
# Training history of the fitted model.
fig = loss_plot(pae.history)
fig.write_image(run_dir / "loss_plot.pdf")

# Compare the encodings of the training sample with an equal number of
# samples drawn from the flow.
z_true = pae.ae.encode(dataset['x_train'])
n_latent_samples = dataset['x_train'].shape[0]
z_sample = pae.nf.sample(n_latent_samples)

fig = latent_space_plot(z_true, z_sample)
fig.write_image(run_dir / "latent_space.pdf")

In [None]:
# NOTE(review): compute_implicit_sigma presumably populates pae.sigma_square,
# which is read a few lines below -- confirm against the PAE implementation.
pae.compute_implicit_sigma(dataset['x_train'][x_valid])

# Scores on the test sample. The anomaly score is negated here, and events
# at or above the chosen percentile of the negated score are selected.
ascore = -pae.anomaly_score(dataset['x_test'])
mse = pae.reco_error(dataset['x_test'])
x = dataset['x_test']
mses = np.dot(np.square(pae.ae(x) - x), pae.sigma_square**(-1))
lp = np.exp(np.array(pae.log_prob_encoding(x)))

# Percentile of the score distribution at which the cut is placed.
prc = 90

x_min = np.percentile(ascore, 1)
x_max = np.percentile(ascore, 99)
x_prc = np.percentile(ascore, prc)
# Boolean mask of the events that pass the cut.
i_prc = ascore >= x_prc

score_histogram = go.Histogram(
    x=ascore, name='Test dataset',
    marker_color='plum', nbinsx=200,
)

fig = go.Figure(layout_xaxis_range=[-30, 30])
fig.add_trace(score_histogram)
# Vertical marker at the cut value.
fig.add_vline(
    x=x_prc, y1=5100, line_width=2, line_color='firebrick',
    annotation_text=f"{prc}th percentile",
    annotation_position="top right",
)

layout_options = {
    'xaxis_title': 'Anomaly Score',
    'margin': {'l': 80, 'b': 40, 't': 40, 'r': 40},
    'width': 600,
    'height': 300,
    'paper_bgcolor': 'rgba(0,0,0,1)',
    'font': dict(size=18),
}
fig.update_layout(**layout_options)
fig.write_image(run_dir / "ascore.pdf")

In [None]:
def adj(x, prc_min=1, prc_max=99):
    """Keep only the entries of ``x`` that lie between two of its percentiles.

    Args:
        x: NumPy array of values.
        prc_min: Lower percentile (0-100); values below it are dropped.
        prc_max: Upper percentile (0-100); values above it are dropped.

    Returns:
        The entries of ``x`` within the closed interval spanned by the two
        percentile values, in their original order.
    """
    lower, upper = np.percentile(x, [prc_min, prc_max])
    inside = (x >= lower) & (x <= upper)
    return x[inside]

mjj = dataset['mjj_test']


def binarize(label):
    """Map the label 'sig' to 1 and every other label to 0."""
    if label == 'sig':
        return 1
    return 0


# Boolean masks selecting the signal and background events of the test sample.
labels = np.array([binarize(label) for label in dataset['labels_test']])
sig_label = labels == 1
bkg_label = labels == 0

# --- Spectrum before any cut -------------------------------------------------
fig = go.Figure()
# The background masses are trimmed to their 1st-99th percentile range by adj().
fig.add_trace(
    go.Histogram(
        x=adj(mjj[bkg_label].ravel()), name="SM - QCD",
        marker_color='steelblue', nbinsx=150,
    )
)
# At most the first 2000 signal events are drawn.
fig.add_trace(
    go.Histogram(
        x=mjj[sig_label][:2000].ravel(), name="BSM - Signal",
        marker_color='darkorange',
    )
)
# Signal-to-background ratio in percent.
sb = 100*sum(sig_label)/sum(bkg_label)
fig.update_layout(
    xaxis_title=r'$m_{jj}$',
    title_text='Dijet mass spectrum',
    barmode='stack',
    legend=dict(x=0.78, y=1, traceorder='normal', font=dict(size=15)),
    paper_bgcolor='rgba(0,0,0,1)',
    width=800,
    height=500,
)

fig.write_image(run_dir / "pre_cut.pdf")

# --- Spectrum of the events passing the anomaly-score cut (mask i_prc) --------
fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=mjj[i_prc&bkg_label].ravel(), name="Full test bkg",
        marker_color='steelblue', nbinsx=100,
    )
)
fig.add_trace(
    go.Histogram(
        x=mjj[i_prc&sig_label].ravel(), name="Full test sig",
        marker_color='darkorange',
    )
)
sb = 100*sum(i_prc&sig_label)/sum(i_prc&bkg_label)
fig.update_layout(
    xaxis_title='$m_{jj}$',
    title_text=f'Dijet mass spectrum after cut S/B={sb:.2f}%',
    width=600,
    barmode='stack'
)
fig.write_image(run_dir / "post_cut.pdf")

In [None]:
from scipy.spatial.distance import jensenshannon

def mass_sculpting(mjj, score, bins=60, max_prc=99):
    """Measure how much a percentile cut on ``score`` distorts the mass spectrum.

    For every integer percentile ``p`` in ``1..max_prc`` the events whose score
    is at or above the ``p``-th percentile of ``score`` are kept, their mass
    histogram is built on the bin edges of the full sample, and it is compared
    with the full-sample histogram.

    Note that ``scipy.spatial.distance.jensenshannon`` returns the
    Jensen-Shannon *distance* (the square root of the divergence).

    Args:
        mjj: Array-like of masses, one row per event.
        score: Array-like of scores aligned row by row with ``mjj``.
        bins: ``bins`` argument used for the full-sample histogram; the
            resulting edges are reused for every cut.
        max_prc: Largest percentile cut evaluated (inclusive).

    Returns:
        Dict mapping each percentile ``p`` to the Jensen-Shannon distance
        between the full and the cut mass histograms. Empty when
        ``max_prc < 1``.
    """
    # Accept lists and other array-likes, not only ndarrays.
    mjj = np.asarray(mjj)
    score = np.asarray(score)

    percentiles = list(range(1, max_prc + 1))
    if not percentiles:
        return {}

    n_full, edges = np.histogram(mjj, bins=bins, density=True)
    # One vectorised call instead of a separate percentile pass per cut.
    cuts = np.percentile(score, percentiles)

    js_div = {}
    for prc, cut in zip(percentiles, cuts):
        # Row indices of the events that survive this cut.
        kept_rows = np.where(score >= cut)[0]
        n_prc, _ = np.histogram(mjj[kept_rows], bins=edges, density=True)
        js_div[prc] = jensenshannon(n_full, n_prc)

    return js_div

def nmse(x, pae):
    """Squared reconstruction error weighted by the inverse of ``pae.sigma_square``.

    Args:
        x: Input rows handed to the autoencoder.
        pae: Object exposing a callable ``ae`` (the reconstruction) and a
            ``sigma_square`` vector with one entry per feature.

    Returns:
        For each row of ``x``, the dot product of the element-wise squared
        residual ``(pae.ae(x) - x)**2`` with ``pae.sigma_square**(-1)``.
    """
    residual = pae.ae(x) - x
    squared_residual = np.square(residual)
    inverse_variance = pae.sigma_square**(-1)
    return np.dot(squared_residual, inverse_variance)

pio.templates.default = "plotly_dark"
mjj = dataset['mjj_train']

# mass_sculpting() keeps the events with the HIGHEST score at each percentile,
# so every score below must grow with "anomalousness".
#
# The PAE score is negated to match how it is used for the test-set cut and
# for the ROC curve elsewhere in this notebook (both use -pae.anomaly_score).
# Previously it was passed un-negated here, so the PAE curve was computed on
# the opposite tail from the other three scores.
# NOTE(review): confirm the sign convention of pae.anomaly_score.
score = -pae.anomaly_score(dataset['x_train'])
js_div_pae = mass_sculpting(mjj, score)

score = nmse(dataset['x_train'], pae)
js_div_nmse = mass_sculpting(mjj, score)

score = pae.reco_error(dataset['x_train'])
js_div_mse = mass_sculpting(mjj, score)

score = -pae.log_prob_encoding(dataset['x_train'])
js_div_lpz = mass_sculpting(mjj, score)

# All four dictionaries share the same percentile keys.
percentile_cuts = list(js_div_pae.keys())

# (result dict, legend label, line colour) in drawing order.
sculpting_curves = (
    (js_div_lpz, r"$-\log p_z$", "chocolate"),
    (js_div_mse, r"$\text{MSE}$", "steelblue"),
    (js_div_nmse, r"$\text{MSE} \cdot \sigma^{\circ-2}$", "cornflowerblue"),
    (js_div_pae, r"$\text{PAE}$", "plum"),
)

fig = go.Figure()
for js_by_cut, legend_label, curve_color in sculpting_curves:
    fig.add_trace(
        go.Scatter(x=percentile_cuts, y=list(js_by_cut.values()), mode='lines',
            name=legend_label, line=dict(color=curve_color, width=3))
    )


fig.update_layout(
    title_text = "Mass sculpting",
    xaxis_title = "Percentile Cut",
    yaxis_title = "Jensen–Shannon",
    margin={'l': 80, 'b': 40, 't': 40, 'r': 0},
    width=600, height=500,
    paper_bgcolor='rgba(0,0,0,1)',
    legend = dict(x=0, y=1,
        traceorder='normal',
        font=dict(size=15))
)

fig.write_image(run_dir / "JS.pdf")


In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.express as px

def make_trace(labels, score, c, n=""):
    """Build one ROC trace (signal efficiency vs. background rejection).

    Args:
        labels: Binary truth labels handed to ``roc_curve``.
        score: Scores handed to ``roc_curve``.
        c: Line colour of the trace.
        n: Prefix placed before the AUC value in the legend entry; it is
            also printed next to the AUC.

    Returns:
        A ``go.Scatter`` line trace with the true-positive rate on x and
        ``1 - fpr`` on y, whose name ends with the AUC to two decimals.
    """
    fpr, tpr, _ = roc_curve(labels, score)
    bkg_rejection = 1 - fpr
    area = auc(bkg_rejection, tpr)
    print(n, area)
    legend_entry = n + f"AUC:{area:.2f}"
    return go.Scatter(
        x=tpr, y=bkg_rejection, mode='lines',
        name=legend_entry, line=dict(color=c, width=2),
    )


def binarize(label):
    """Map the label 'sig' to 1 and every other label to 0."""
    if label == 'sig':
        return 1
    return 0
# 1 for signal events, 0 for everything else.
labels = np.array(list(map(binarize, dataset['labels_test'])))

# ROC of the PAE score (`ascore` is the negated anomaly score of the test set).
fpr, tpr, _ = roc_curve(labels, ascore)
pae_auc = auc(1-fpr, tpr)

# Each trace now carries a method label: without it all four legend entries
# read only "AUC:x.xx" and the printed AUC values cannot be told apart.
score = pae.reco_error(dataset['x_test'])
roc_mse = make_trace(labels, score, 'steelblue', "MSE ")

score = nmse(dataset['x_test'], pae)
roc_nmse = make_trace(labels, score, 'cornflowerblue', "NMSE ")

score = -pae.log_prob_encoding(dataset['x_test'])
roc_lpz = make_trace(labels, score, 'chocolate', "-log p_z ")

fig = go.Figure()

fig.add_trace(
    go.Scatter(x=tpr, y=1-fpr, mode='lines',
        name=f"PAE AUC:{pae_auc:.2f}", line=dict(color="Plum", width=2))
)

fig.add_trace(roc_mse)
fig.add_trace(roc_nmse)
fig.add_trace(roc_lpz)
fig.update_layout(
    width=500, height=500,
    xaxis_title = "Signal efficiency",
    yaxis_title = "Background Rejection",
    margin={'l': 60, 'b': 60, 't': 40, 'r': 0},
    legend = dict(x=0.1, y=0.05,
        traceorder='normal',
        font=dict(size=15)),
    title_text="ROC curves",
    showlegend=True,
    paper_bgcolor='rgba(0,0,0,1)',
)
# Equal axis scaling keeps the ROC plot square.
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_image(run_dir / "roc.pdf")
