In [None]:
import sys
import numpy as np
import pandas as pd

sys.path.append("../")

from pae.models.autoencoder import DenseAutoencoder
from pae.models.flows import MAF
from pae.models.nn import PaeBuilder

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as tfk

In [None]:
# --- Device configuration and reproducibility -------------------------------
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth flag has to be set identically on every GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Too late: memory growth can only be changed before the GPUs are initialised.
        print(err)

from tensorflow.python.client import device_lib

# Report library versions and the second field of every physical-device record.
devices = tf.config.list_physical_devices()
print("tensorflow", tf.__version__)
print("tensorflow-probability", tfp.__version__)
print("Available devices:", *(kind for _, kind in devices))

# Seed both NumPy and TensorFlow with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

from plotly.offline import download_plotlyjs, init_notebook_mode
# connected=True makes the notebook load plotly.js from the online CDN instead of
# embedding the library in the notebook file.
init_notebook_mode(connected = True)
# Default template for figures created from here on (a later cell switches it to "presentation").
pio.templates.default = "plotly_dark"


In [None]:
from pae.utils import load_json, dump_json

from pae.loaders.LHCO import ScalarLoaderLHCO, DatasetBuilder

# Two loaders built from JSON configs: `x` from scalar_minimal.json and `mjj` from scalar_mjj.json.
x = ScalarLoaderLHCO.from_json("../pae/configs/loader/scalar_minimal.json")
mjj = ScalarLoaderLHCO.from_json("../pae/configs/loader/scalar_mjj.json")
builder = DatasetBuilder(x, mjj)
# NOTE(review): fit_key='bkg' presumably means the preprocessing is fitted on background only — confirm in DatasetBuilder.
builder.data_preparation(sample_sizes ={'sig':100_000, 'bkg': 500_000}, fit_key='bkg')
# Training split requests background only; the test split requests 2500 signal and 250k background.
# Later cells read the keys x_train, x_valid, x_test, mjj_train, labels_train and labels_test from `dataset`.
dataset = builder.make_dataset(train = {'bkg':250_000}, test={'sig':2500, 'bkg': 250_000})

In [None]:
# Keyword arguments that are unpacked into `nf.fit(...)`.
nf_train = {
    'batch_size': 200,
    'epochs': 100,
    # The second tuple element only fills the "target" slot; the validation
    # inputs are reused for it.
    'validation_data': (dataset["x_valid"], dataset["x_valid"]),
    # Keras documents `callbacks` as a *list* of Callback instances, so the
    # single scheduler is wrapped in a list instead of being passed bare.
    'callbacks': [
        tfk.callbacks.ReduceLROnPlateau(
            factor=0.2,
            patience=5,
            verbose=1
        )
    ]
}

# Constructor arguments that are unpacked into `MAF(...)`.
# NOTE(review): `units` is presumably the hidden-layer widths of each flow block — confirm in pae.models.flows.
nf_config = {
    'n_dims': 8,
    'n_layers': 5,
    'units': [32] * 4
}

In [None]:
# Display the shape of the training array as a quick sanity check before fitting.
dataset["x_train"].shape

In [None]:
nf = MAF(**nf_config)
# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
optimizer = tfk.optimizers.Adam(learning_rate=0.005)
# The loss ignores y_true and returns the negated model output, i.e. the model
# output is treated as log p(x) and training minimises -log p(x).
nf.compile(optimizer=optimizer, loss=lambda _, log_p: -log_p)

nf.fit(x=dataset["x_train"],
       # Placeholder targets: the loss above never reads them.
       y=np.zeros(dataset["x_train"].shape),
       **nf_train)


In [None]:
import numpy as np
from plotly.subplots import make_subplots

def optimal_grid(n):
    """Return a near-square ``(rows, cols)`` grid with at least ``n`` cells.

    ``rows`` is ``floor(sqrt(n))`` and ``cols`` is the smallest integer such
    that ``rows * cols >= n``.

    Parameters
    ----------
    n : int
        Number of panels to place. Must be a whole number >= 1.

    Returns
    -------
    tuple of int
        ``(rows, cols)`` as plain Python integers.

    Raises
    ------
    ValueError
        If ``n`` is not a whole number or is smaller than 1.
    """
    import math

    count = int(n)
    if count != n or count < 1:
        raise ValueError(f"n must be a positive whole number, got {n!r}")

    rows = math.floor(math.sqrt(count))
    cols = -(-count // rows)  # integer ceiling division
    return rows, cols

def latent_space_plot(z_true, z_sample,
                      bins: int = 20,
                      titles=None
                      ):
    """Compare reference data and generated samples, one histogram panel per column.

    Parameters
    ----------
    z_true : 2-D array
        Reference data; ``z_true.shape[1]`` sets the number of panels and each
        column is histogrammed with ``bins`` bins.
    z_sample : 2-D array-like
        Samples to compare; column ``i`` is histogrammed on the bin edges
        obtained from column ``i`` of ``z_true``.
    bins : int
        Forwarded to ``np.histogram`` for the reference data.
    titles : sequence of str, optional
        Subplot titles. Defaults to the eight jet-feature labels used so far;
        only the first ``z_true.shape[1]`` entries are used.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with two bar traces (reference, sample) per panel.
    """
    if titles is None:
        titles = [r"$p_T1$", "$m_1$", r"$\tau^{(3)}_1/\tau^{(2)}_1$",
                  r"$\tau^{(2)}_1/\tau^{(1)}_1$",
                  r"$p_T2$", "$m_2$", r"$\tau^{(3)}_2/\tau^{(2)}_2$",
                  r"$\tau^{(2)}_2/\tau^{(1)}_2$"]

    n_dims = z_true.shape[1]
    rows, cols = optimal_grid(n_dims)
    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=list(titles)[:n_dims])

    for i in range(n_dims):
        density_true, edges = np.histogram(z_true[:, i], bins=bins, density=True)
        density_sample, _ = np.histogram(z_sample[:, i], bins=edges, density=True)
        # go.Bar centres every bar on its x value, so the bars must sit on the
        # bin centres; using the left edges shifted the histogram by half a bin.
        centers = 0.5 * (edges[:-1] + edges[1:])

        grid_row, grid_col = divmod(i, cols)
        fig.add_trace(go.Bar(x=centers, y=density_true,
                name='True Distribution', marker_color='steelblue',
                showlegend=(i == 0)),  # one legend entry per series is enough
            row=grid_row + 1, col=grid_col + 1
            )
        fig.add_trace(go.Bar(x=centers, y=density_sample,
                name='Sample from NF', marker_color='coral',
                showlegend=(i == 0)),
            row=grid_row + 1, col=grid_col + 1
            )

    fig.update_layout(
            bargap=0,
            height=500,
            width=900,
            title_text='True Distribution vs NF Sample',
            paper_bgcolor='rgba(0,0,0,0)',
            legend=dict(x=0.78, y=1.27,
                traceorder='normal',
                font=dict(size=15)),
    )
    return fig

In [None]:
# Switch the default template for figures created from here on.
pio.templates.default = "presentation"

z_true = dataset['x_train']
# Draw as many samples from the trained flow as there are training rows.
z_sample = nf.sample(dataset['x_train'].shape[0])

fig = latent_space_plot(z_true, z_sample)
fig.show('svg')
# NOTE(review): static export presumably needs plotly's image-export backend installed — confirm for the target environment.
fig.write_image("nf.pdf")

In [None]:
from scipy.spatial.distance import jensenshannon
pio.templates.default = "presentation"
# NOTE: this rebinds `mjj`, which held the mjj loader in the data-loading cell.
mjj = dataset['mjj_train']
max_prc = 99
# Anomaly score = -log p(x), the same convention as the ROC and dijet-mass
# cells (`ascore = -nf(...)`). The un-negated log-likelihood was used here
# before, so `score >= percentile` kept the most background-like events
# instead of the most anomalous ones.
score = -nf(dataset['x_train']).numpy()
# Reference mass spectrum; its bin edges are reused for every cut below.
n_full, b = np.histogram(mjj, bins=60, density=True)
js_div = {}
for prc in range(1, max_prc+1):
    x_prc = np.percentile(score, prc)
    i_prc = np.where(score >= x_prc)[0]
    n_prc, _ = np.histogram(mjj[i_prc], bins=b, density=True)
    # scipy's `jensenshannon` returns the Jensen–Shannon *distance*, i.e. the
    # square root of the divergence; the values are stored unmodified.
    js_div[prc] = jensenshannon(n_full, n_prc)

fig = go.Figure()
# fig.add_shape(
#     type='line', line=dict(dash='dash', color="tomato", width=1),
#     x0=90, x1=90, y0=0, y1=0.04, (
# )

# NOTE(review): the trace is named "PAE" although the scores come from `nf` — confirm the intended legend label.
fig.add_trace(
    go.Scatter(x=list(js_div.keys()), y=list(js_div.values()), mode='lines',
        name="PAE", line=dict(color="plum", width=3))
)


# fig.add_trace(go.Scatter(
#     x=[86],
#     y=[0.003],
#     mode="text",
#     text=["97th percentile"],
#     textposition="top center",
#     showlegend=False
# ))

fig.update_layout(
    title_text = "Mass sculpting",
    xaxis_title = "Percentile Cut",
    # The plotted quantity is the distance (sqrt of the divergence), so label it as such.
    yaxis_title = "Jensen–Shannon Distance",
    margin={'l': 80, 'b': 40, 't': 40, 'r': 0},
    width=750, height=450

)
fig.show('svg')
#fig.write_image("JS-plot.png")


In [None]:
from pae.utils import dump_json

# Persist the percentile -> Jensen–Shannon value mapping held in `js_div`.
dump_json(js_div, "./nf-js.json")

In [None]:
# `nf_roc` is only assigned by the ROC cell further down, so a bare `nf_roc`
# here raised NameError on Restart & Run All. Display it only if it already exists.
globals().get("nf_roc")

In [None]:
from sklearn.metrics import roc_curve, auc
import plotly.express as px

# Anomaly score for the test set: the negated model output (the training loss
# treats that output as log p), used below as the ROC score with 'sig' as the positive class.
ascore = -nf(dataset['x_test']).numpy()

def binarize(label):
    """Encode a class label as an integer: 1 for 'sig', 0 for anything else."""
    if label == 'sig':
        return 1
    return 0
# Integer truth labels for the test set (1 = 'sig', 0 = everything else).
labels = np.array([binarize(tag) for tag in dataset['labels_test']])

fpr, tpr, thresholds = roc_curve(labels, ascore)
# Area under the (background rejection, signal efficiency) curve.
pae_auc = auc(1 - fpr, tpr)
#gan_auc = auc(louis_roc['bkg_rej'], louis_roc['sig1_ef'])

fig = go.Figure(data=[
    # go.Scatter(x=louis_roc['sig1_ef'], y=louis_roc['bkg_rej'], mode='lines',
    #     name=f"GAN-AE (AUC:{gan_auc:.2f})", line=dict(color="LimeGreen", width=2)),
    go.Scatter(
        x=tpr,
        y=1 - fpr,
        mode='lines',
        name=f"PAE (AUC:{pae_auc:.2f})",
        line={'color': "Plum", 'width': 2},
    ),
    # Free-floating text label placed at fixed plot coordinates.
    go.Scatter(
        x=[0.018],
        y=[0.99],
        mode="text",
        text=["97th percentile"],
        textposition="top center",
        showlegend=False,
    ),
])
fig.update_layout(
    title_text="ROC curves",
    xaxis_title="Signal efficiency",
    yaxis_title="Background Rejection",
    width=500,
    height=500,
    margin=dict(l=60, b=60, t=40, r=0),
    legend={
        'x': 0.5,
        'y': 0.9,
        'traceorder': 'normal',
        'font': {'size': 15},
    },
)
#fig.add_vline(x=0.0177, line_width=1, line_dash="dash", line_color="firebrick")
# Lock the y axis to the x axis at a 1:1 scale ratio.
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_image("ROC2jet.png")
fig.show()

# Store the curve as plain lists so it can be written as JSON.
nf_roc = dict(
    eff=tpr.tolist(),
    rej=(1 - fpr).tolist(),
    auc=pae_auc,
)
dump_json(nf_roc, "./nf-roc.json")

In [None]:
# Flattened training dijet masses; indexed with boolean masks further down in this cell.
mjj=dataset['mjj_train'].ravel()
# Negated model output on the training set. This overwrites the test-set `ascore` from the ROC cell.
ascore = -nf(dataset['x_train']).numpy()
def adj(x, prc_min=1, prc_max=99):
    """Keep only the entries of ``x`` lying between two of its own percentiles.

    Both bounds are inclusive and the surviving entries keep their original order.

    Parameters
    ----------
    x : array
        Values to trim.
    prc_min, prc_max : float
        Lower and upper percentile (0-100) that define the kept range.

    Returns
    -------
    array
        The entries of ``x`` with ``percentile(prc_min) <= value <= percentile(prc_max)``.
    """
    lower = np.percentile(x, prc_min)
    upper = np.percentile(x, prc_max)
    inside = np.logical_and(x >= lower, x <= upper)
    return x[inside]

def binarize(label):
    """Return 1 when ``label`` equals 'sig' and 0 otherwise.

    Note: an identical definition already exists in the ROC cell above.
    """
    if label == 'sig':
        return 1
    return 0
# Integer truth labels for the training set and the matching boolean masks.
labels = np.array([binarize(tag) for tag in dataset['labels_train']])
sig_label = labels == 1
bkg_label = labels == 0

# Events whose score is at or above the median score pass the cut.
x_prc = np.percentile(ascore, 50)
i_prc = ascore >= x_prc

fig = go.Figure(data=[
    # Background mass spectrum before any cut.
    go.Histogram(
        x=adj(mjj[bkg_label].ravel()),
        name="QCD background data",
        marker_color='steelblue',
        nbinsx=60,
        opacity=0.5,
        histnorm="probability density",
    ),
    # Background mass spectrum after the score cut.
    go.Histogram(
        x=adj(mjj[np.logical_and(i_prc, bkg_label.ravel())].ravel()),
        name="QCD, after cut @50%",
        nbinsx=60,
        opacity=0.5,
        marker_color='darkorange',
        histnorm="probability density",
    ),
    # go.Histogram(x=mjj[sig_label][:2000].ravel(), name="BSM - Signal",
    #              marker_color='darkorange'),
])
# Signal count as a percentage of the background count.
sb = 100 * sum(sig_label) / sum(bkg_label)
fig.update_layout(
    title_text='Dijet mass spectrum',
    xaxis_title=r'$m_{jj}$',
    yaxis_tickformat='.1e',
    barmode='overlay',
    bargap=0,
    bargroupgap=0,
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=500,
    legend={
        'x': 0.5,
        'y': 1,
        'traceorder': 'normal',
        'font': {'size': 15},
    },
)

fig.show('vscode')
fig.write_image("anomaly.pdf")

In [None]:
prc = 50

# Reference percentiles of the current anomaly score.
x_min = np.percentile(ascore, 1)
x_max = np.percentile(ascore, 99)
x_prc = np.percentile(ascore, prc)
i_prc = ascore >= x_prc

# NOTE(review): the trace is named 'Test dataset', but the most recent assignment of
# `ascore` in the visible cells uses dataset['x_train'] — confirm which set is intended.
fig = go.Figure(
    data=go.Histogram(
        x=adj(ascore, 0.01, 99.9),
        name='Test dataset',
        marker_color='plum',
        nbinsx=120,
    )
)
# Vertical marker at the chosen percentile of the score.
fig.add_vline(
    x=x_prc,
    y1=1,
    line_width=2,
    line_color='firebrick',
    annotation_text=f"{prc}th percentile",
    annotation_position="top right",
)

fig.update_layout(
    title="Anomaly Score",
    xaxis_title=r"$-\log LH$",
    barmode='overlay',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=500,
    legend={
        'x': 0.5,
        'y': 1,
        'traceorder': 'normal',
        'font': {'size': 15},
    },
    #yaxis_tickformat = '.1e'
)
fig.show('vscode')
fig.write_image("anomaly_pdf.pdf")

In [None]:
def mass_sculpting(mjj, score, bins=60, max_prc=99):
    """Measure how a percentile cut on ``score`` distorts the ``mjj`` spectrum.

    For every integer percentile ``p`` in ``1..max_prc`` the events with
    ``score >= percentile(score, p)`` are selected, their ``mjj`` histogram is
    built on the bin edges of the full sample, and it is compared with the
    full-sample histogram.

    Parameters
    ----------
    mjj : array-like
        Mass values, aligned with ``score`` along the first axis.
    score : array-like
        Per-event score; events at or above the percentile threshold are kept.
    bins : int or sequence, optional
        Forwarded to ``np.histogram`` for the full sample (default 60).
    max_prc : int, optional
        Largest percentile evaluated (default 99).

    Returns
    -------
    dict
        Maps each percentile (Python ``int``) to the value returned by
        ``scipy.spatial.distance.jensenshannon``. That function returns the
        Jensen–Shannon *distance*, i.e. the square root of the divergence.
    """
    mjj = np.asarray(mjj)
    score = np.asarray(score)

    percentiles = list(range(1, max_prc + 1))
    # All thresholds in a single call instead of one np.percentile call per loop pass.
    thresholds = np.percentile(score, percentiles)
    n_full, edges = np.histogram(mjj, bins=bins, density=True)

    js_div = {}
    for prc, x_prc in zip(percentiles, thresholds):
        # Row indices of the events that survive the cut.
        i_prc = np.where(score >= x_prc)[0]
        n_prc, _ = np.histogram(mjj[i_prc], bins=edges, density=True)
        js_div[prc] = jensenshannon(n_full, n_prc)

    return js_div

fig = go.Figure()

# NOTE(review): `js_div_pae` is not assigned in any cell visible here, so this raises
# NameError on a fresh kernel unless it is defined elsewhere — presumably it should be
# a percentile -> value mapping like the one `mass_sculpting` returns; confirm its source.
fig.add_trace(
    go.Scatter(x=list(js_div_pae.keys()), y=list(js_div_pae.values()), mode='lines',
        name="PAE", line=dict(color="plum", width=3))
)