In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from models.autoencoder import DenseAutoencoder
from models.flows import MAF
from models.nn import PaeBuilder

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as tfk
print(tf.__version__)

In [None]:
from tensorflow.python.client import device_lib
import os
# Hide every CUDA device from this process so the notebook runs on the CPU.
# NOTE(review): this is set after `import tensorflow` in the first cell; it presumably
# still takes effect because no device has been initialised yet at this point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="-1" 
# List the devices TensorFlow reports for this process.
print(device_lib.list_local_devices())
#tf.config.list_physical_devices()


In [None]:
from loaders.LHCO import LhcoRnDLoader
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

# Feature scaler handed to the loader (MinMaxScaler is imported as an alternative).
scaler = QuantileTransformer(output_distribution='uniform')

# Input HDF5 files, keyed by sample name.
files = dict(
    bkg='../data/RnD_bkg_HLF.h5',
    sig1='../data/RnD_sig1_HLF.h5',
    sig2='../data/RnD_sig2_HLF.h5',
)

# Sample composition requested for the train/validation and test splits.
train_fractions = dict(bkg=1)
test_fractions = dict(bkg=.5, sig1=.5)

loader = LhcoRnDLoader(files, 'all', scaler)
loader.preprocessing('bkg')
train = loader.make_train_val(250_000, train_fractions, val_split=.2)
test = loader.make_test(100_000, test_fractions, replace=False)

# Merge both splits into a single lookup; entries from `test` win on key clashes.
dataset = dict(train)
dataset.update(test)
del train, test

In [None]:
# Display the 'mjj_test' entry (presumably the dijet invariant mass of the test events).
dataset['mjj_test']

In [None]:
from utils.plotting import feature_plots

# Feature plots for the training inputs.
feature_plots(dataset['x_train'])



In [None]:
# Feature plots for the test inputs, for comparison with the training set.
feature_plots(dataset['x_test'])


### Reweighting

In [None]:
from sklearn.mixture import GaussianMixture
# Short alias for the mixture-model class.
GMM = GaussianMixture

# Fit a 350-component Gaussian mixture to the training mjj values, reshaped to a
# single-feature column, and time the fit.
# NOTE(review): no random_state is passed, so the fit and the sample drawn below
# are not reproducible from run to run.
%time gmm = GMM(n_components=350, covariance_type='full').fit(dataset["mjj_train"].reshape(-1,1))
# Closure check: overlay the true mjj histogram with an equally sized GMM sample,
# reusing the bin edges `b` of the first histogram.
plt.figure(figsize=(12,8))
_, b, _ = plt.hist(dataset["mjj_train"], bins=50, label='mjj true', alpha=.5, density=True)
# gmm.sample returns (samples, component labels); only the samples are histogrammed.
sample = gmm.sample(dataset["mjj_train"].shape[0])
plt.hist(sample[0], bins=b, label='mjj GMM', alpha=.5, density=True)
plt.legend()
plt.show()

In [None]:
# score_samples returns the per-event log-density log p(mjj) under the fitted GMM.
# Despite their names, weights2 / weights2_valid therefore hold log-densities; the
# event weights used by the training configuration are 1/exp(log p) = 1/p(mjj).
weights2 = gmm.score_samples(dataset["mjj_train"].reshape(-1,1))
weights2_valid = gmm.score_samples(dataset["mjj_valid"].reshape(-1,1))

# Closure check of the reweighting: histogram mjj with the same inverse-density
# weights that are handed to training as sample_weight. Weighting by
# exp(weights2) = p(mjj) instead would sharpen the spectrum rather than flatten it.
plt.figure(figsize=(12,8))
_, b, _ = plt.hist(dataset["mjj_train"], bins=50, label='mjj weighted', alpha=.5, weights=1/np.exp(weights2))
plt.legend()
plt.show()

In [None]:
# Inverse GMM density (the per-event weight) as a function of mjj.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(dataset["mjj_train"], 1/np.exp(weights2))
plt.show()

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as tfk
import numpy as np
import sys

tfd = tfp.distributions
tfb = tfp.bijectors
tfkl = tfk.layers

sys.path.append('../')

from models.autoencoder import DenseAutoencoder
from models.flows import MAF

In [None]:

# Assemble the PAE through the builder: a dense autoencoder plus a MAF flow whose
# dimensionality matches the autoencoder's encoding.
builder = PaeBuilder()

ae_config = {
    'input_dim': 47,
    'encoding_dim': 10,
    'units_list': [30, 20, 15],
    'weight_reg': tfk.regularizers.l1(1e-6),
    'output_activation': tf.nn.sigmoid
}
nf_config = {
    # Tied to the AE encoding size so the two cannot drift apart.
    'n_dims': ae_config['encoding_dim'],
    'n_layers': 5,
    'units': [32] * 4
}
optimizer_ae = {
    'lr': 0.05
}
optimizer_nf = {
    'lr': 0.005
}

builder.make_ae_model(DenseAutoencoder, ae_config)
# The AE optimizer config was previously passed to make_nf_optimizer and then
# overwritten two lines later, so the autoencoder never received lr=0.05.
# NOTE(review): assumes PaeBuilder exposes make_ae_optimizer, mirroring
# make_nf_optimizer — confirm against models/nn.py.
builder.make_ae_optimizer(tfk.optimizers.Adam, optimizer_ae)
builder.make_nf_model(MAF, nf_config)
builder.make_nf_optimizer(tfk.optimizers.Adam, optimizer_nf)
builder.compile_ae()
builder.compile_nf()
pae = builder.pae

In [None]:
# Training options for the autoencoder stage. Events are weighted by the inverse
# GMM density 1/exp(log p(mjj)); the validation tuple carries matching weights as
# its third element.
ae_train ={
    'batch_size':200,
    'epochs':180,
    'sample_weight':1/np.exp(weights2),
    'validation_data':(dataset["x_valid"],dataset["x_valid"],1/np.exp(weights2_valid)),
    # Multiply the learning rate by 0.2 after 10 epochs without improvement.
    'callbacks':tfk.callbacks.ReduceLROnPlateau(
        factor=0.2,
        patience=10,
        verbose=1
    )
}

# Training options for the normalizing-flow stage (no sample weights).
# NOTE(review): validation_data holds the raw features (x_valid, x_valid); whether
# pae.fit encodes them before validating the flow is not visible here — confirm.
nf_train ={
    'batch_size':200,
    'epochs':100,
    'validation_data':(dataset["x_valid"],dataset["x_valid"]),
    # Multiply the learning rate by 0.2 after 5 epochs without improvement.
    'callbacks':tfk.callbacks.ReduceLROnPlateau(
        factor=0.2,
        patience=5,
        verbose=1
    )
}
# Pin training to the CPU and time the full fit.
with tf.device("/device:CPU:0"):
    %time pae.fit(dataset["x_train"],ae_train,nf_train)

In [None]:
from utils.plotting import loss_plot, latent_space_plot, mjj_cut_plot, \
                           sculpting_plot, roc_plot

# Plot the training history stored on the PAE object.
loss_plot(pae.history)


In [None]:
# Encode the training set with the autoencoder and draw the same number of points
# from the flow, then pass both to the latent-space plot (saved to disk).
z_true = pae.ae.encode(dataset['x_train'])
z_sample = pae.nf.sample(dataset['x_train'].shape[0])

latent_space_plot(z_true, z_sample, save_path='plots/latent_space.png')

In [None]:
def optimal_grid(n):
    """Return a near-square ``(rows, cols)`` grid with at least ``n`` cells.

    ``rows`` is ``floor(sqrt(n))`` and ``cols`` is the smallest integer for
    which ``rows * cols >= n``.

    Parameters
    ----------
    n : int
        Number of panels to place; must be non-negative.

    Returns
    -------
    tuple of int
        ``(rows, cols)``; ``(0, 0)`` when there is nothing to place.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    rows = int(np.floor(np.sqrt(n)))
    if rows == 0:
        # Nothing to place; also avoids the division by zero below.
        return 0, 0
    # Ceiling division: one extra column whenever rows does not divide n.
    cols = int(-(-n // rows))
    return rows, cols



In [None]:
# Reconstruction error of the training set.
mse = pae.reco_error(dataset['x_train'])
# Computed on the validation split before the anomaly score is evaluated.
pae.compute_implicit_sigma(dataset['x_valid'])
# Sign-flipped anomaly score; labelled 'NLL' in the plots below.
ascore = -pae.anomaly_score(dataset['x_train'])

# NOTE(review): on a fresh top-to-bottom run, mjj_cut_plot here is the version
# imported from utils.plotting; the notebook-local definition comes in a later cell.
mjj_cut_plot(mse, dataset['mjj_train'], prc=80, score_name='MSE')#, save_path='./plots/mse_cut.png')
mjj_cut_plot(ascore, dataset['mjj_train'], prc=80, score_name='NLL')


In [None]:
def sculpting_plot(ano_scores, mjj, 
                   max_prc: int = 90, 
                   bins: int = 60,
                   save_path: str = None):
    """Quantify how much a cut on each anomaly score distorts the mjj spectrum.

    For every score and every percentile cut from 1 to ``max_prc``, the events
    with a score at or above that percentile are selected and the histogram of
    their mjj values is compared with the histogram of all events using
    ``scipy.spatial.distance.jensenshannon``. That function returns the
    Jensen-Shannon *distance* (the square root of the divergence), which is
    what is stored and plotted.

    Note: defining this function in the notebook shadows any ``sculpting_plot``
    imported earlier.

    Parameters
    ----------
    ano_scores : dict
        Maps a label to an array of per-event scores aligned with ``mjj``.
    mjj : array-like
        Per-event mjj values.
    max_prc : int
        Highest percentile cut to evaluate (inclusive).
    bins : int
        Number of mjj histogram bins.
    save_path : str, optional
        If given, the figure is written to this path.

    Returns
    -------
    dict
        ``{label: {percentile: JS distance}}``.
    """
    from scipy.spatial.distance import jensenshannon

    # The reference spectrum and bin edges do not depend on the score: compute once.
    n_full, edges = np.histogram(mjj, bins=bins, density=True)

    sculpting_plots = {}
    for label, score in ano_scores.items():
        js_div = {}
        for prc in range(1, max_prc+1):
            x_prc = np.percentile(score, prc)
            i_prc = np.where(score >= x_prc)[0]
            n_prc, _ = np.histogram(mjj[i_prc], bins=edges, density=True)
            js_div[prc] = jensenshannon(n_full, n_prc)
        sculpting_plots[label] = js_div

    plt.figure(figsize=(10,6))
    for label, js_div in sculpting_plots.items():
        # Materialise the dict views so plotting does not rely on view handling.
        plt.plot(list(js_div.keys()), list(js_div.values()), label=label)
    # jensenshannon yields the distance, not the divergence; label accordingly.
    plt.ylabel('JS distance')
    plt.xlabel('Percentile cut')
    plt.title('Mass sculpting')
    plt.legend()
    # Lay out last so the title and legend are taken into account.
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    return sculpting_plots

In [None]:
# Scores to compare, keyed by the label used in the legend.
ano_scores = {
    'MSE': mse,
    'NLL': ascore
}

# Scan percentile cuts 1..99 on the training set and save the resulting figure.
jss = sculpting_plot(ano_scores, dataset['mjj_train'], max_prc=99, save_path='./plots/mass_sculpting.png')

In [None]:
# Raw sculpting values for the NLL score, one per percentile cut.
print(list(jss['NLL'].values()))

In [None]:
# Change of the MSE sculpting curve between consecutive percentile cuts.
mse_curve = jss['MSE']
dy = np.diff(list(mse_curve.values()))
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(list(mse_curve.keys())[1:], dy)
plt.show()

In [None]:
def binarize(label):
    """Map a sample label to a binary target: 1 for ``'sig1'``, 0 for anything else."""
    if label == 'sig1':
        return 1
    return 0

# Score the test split with both metrics.
mse_test = pae.reco_error(dataset['x_test'])
ascore_test = -pae.anomaly_score(dataset['x_test'])
# Binary truth labels: 1 for 'sig1', 0 otherwise.
# NOTE(review): assumes dataset['labels'] is aligned with x_test — confirm against the loader.
targets = np.array(list(map(binarize, dataset['labels'])))

scores = {
    'NLL':ascore_test,
    'MSE':mse_test
}
roc_plot(targets,scores,save_path='./plots/roc_50prc.png')

In [None]:
def mjj_cut_plot(ano_score, mjj, 
                 prc: int = 75, 
                 bins: int = 60, 
                 x_min_prc: float = 0.5,
                 x_max_prc: float = 99.5,
                 score_name: str = 'anomaly score', 
                 save_path: str = None):
    """Plot the score distribution with a percentile cut and its effect on mjj.

    Left panel: histogram of ``ano_score`` with a vertical line at the
    ``prc``-th percentile; the x-axis is limited to the
    [``x_min_prc``, ``x_max_prc``] percentile range. Right panel: mjj histogram
    of all events overlaid with that of the events whose score is at or above
    the cut, using the same bin edges.

    Note: defining this function in the notebook shadows any ``mjj_cut_plot``
    imported earlier.

    Parameters
    ----------
    ano_score : array-like
        Per-event scores aligned with ``mjj``.
    mjj : array-like
        Per-event mjj values.
    prc : int
        Percentile of ``ano_score`` used as the selection threshold.
    bins : int
        Number of histogram bins in both panels.
    x_min_prc, x_max_prc : float
        Percentiles of ``ano_score`` that bound the left panel's x-axis.
    score_name : str
        Name of the score used in axis and legend labels.
    save_path : str, optional
        If given, the figure is written to this path.

    Returns
    -------
    tuple of ndarray
        ``(mjj.ravel(), mjj[selected].ravel())``: all mjj values and those
        passing the cut.
    """
    x_min = np.percentile(ano_score, x_min_prc)
    x_max = np.percentile(ano_score, x_max_prc)
    x_prc = np.percentile(ano_score, prc)
    i_prc = np.where(ano_score >= x_prc)[0]
    # Plain (non-f) string for the superscript so its braces are kept literally.
    prc_label = f'{prc}' + '$^{th}$'

    plt.figure(figsize=(16,6))
    plt.subplot(1,2,1)
    plt.hist(ano_score, bins=bins, density=True, alpha=.8, 
             label='Test dataset')
    plt.xlim(x_min,x_max)
    plt.axvline(x_prc, color='red', label=prc_label + ' percentile')
    plt.legend()
    plt.xlabel(f'{score_name}')

    plt.subplot(1,2,2)
    # Reuse the bin edges of the full spectrum for the selected events.
    _, edges, _ = plt.hist(mjj, bins=bins, density=True, alpha=.5, 
                           label='Full test dataset')
    plt.hist(mjj[i_prc], bins=edges, density=True, alpha=.5, 
             label=prc_label + f' {score_name} percentile+')
    plt.xlabel('$m_{jj}$')
    plt.legend()
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    return mjj.ravel(), mjj[i_prc].ravel()

In [None]:
# Build a second, low-signal test set and score it.
# NOTE(review): the fractions sum to 0.9991 rather than 1, and the 'cut_01prc'
# file name suggests a 0.1 % signal fraction (.001) may have been intended —
# confirm how make_test treats the fractions.
test_fractions = {
    'bkg':.999,
    'sig1':.0001
}
test2 = loader.make_test(500_000, test_fractions, replace=False)

ascore_test = -pae.anomaly_score(test2['x_test'])
mse_test = pae.reco_error(test2['x_test'])
# prc=0 puts the threshold at the minimum score, so every event passes and both
# returned arrays hold the full mjj spectrum. The figure gets its own file name:
# the prc=99 call that follows writes './plots/cut_01prc.png' and would otherwise
# silently overwrite it.
bkg, data = mjj_cut_plot(ascore_test, test2['mjj_test'], prc=0, score_name='NLL', bins=100, save_path='./plots/cut_01prc_prc0.png')


In [None]:
# Same plot with the threshold at the 99th percentile of the NLL score.
# Returned arrays: bkg2 = all mjj values, data2 = mjj of the events passing the cut.
bkg2, data2 = mjj_cut_plot(ascore_test, test2['mjj_test'], prc=99, score_name='NLL', bins=100, save_path='./plots/cut_01prc.png')

In [None]:
# Constant per-event weights that rescale one sample to the size of another.
bkg_per_data = bkg.shape[0]/data.shape[0]
weights = np.repeat(1/bkg_per_data, bkg.shape[0])
# One weight per event in `data`, each equal to len(data2)/len(data).
data_per_data2 = data.shape[0]/data2.shape[0]
weights_k = np.repeat(1/data_per_data2, data.shape[0])

In [None]:
import pyBumpHunter as BH

# Configure the bump hunt on the mjj window 3000-4500 with a fixed seed.
# weights_k has one entry per event in `data`, the array passed as the second
# argument of BumpScan below.
hunter = BH.BumpHunter(rang=(3000,4500),
                       width_min=2,
                       width_max=5,
                       width_step=1,
                       scan_step=1,
                       Npe=10000,
                       Nworker=1,
                       seed=666,
                       weights=weights_k
                    )

# Disabled alternative: split the events at the 50th percentile of the score instead.
# x_prc_50 = np.percentile(ascore_test, 50)

# high_prc = np.where(ascore_test >= x_prc_50)
# low_prc = np.where(ascore_test <= x_prc_50)

# data, bkg = test2['excl_test'][high_prc][:,0], test2['excl_test'][low_prc][:,0]

# Scan the 99th-percentile selection (data2) against the full spectrum (data).
# NOTE(review): presumably BumpScan takes (data, reference background) in that order —
# confirm against the installed pyBumpHunter version.
%time hunter.BumpScan(data2,data)

In [None]:
# Plot the scan result for the same (data2, data) pair that was scanned.
hunter.PlotBump(data2,data)#, filename='./plots/bump_01prc.png')


In [None]:
# Print the scan result for the same (data2, data) pair.
hunter.PrintBumpTrue(data2,data)

In [None]:
# ROC of the NLL score on the low-signal test set.
# NOTE(review): assumes test2['labels'] is aligned with test2['x_test'] — confirm against the loader.
targets = np.array(list(map(binarize, test2['labels'])))

scores = {
    'NLL':ascore_test
}
roc_plot(targets,scores,save_path='./plots/roc_1prc.png')