In [None]:
# Change the working directory three levels up so that the relative paths used
# below ("ml_hep_sim/...", "ml_pipeline/...") resolve.
# NOTE(review): presumably this lands on the repository root - confirm.
%cd ../../../

In [None]:
# Directory prefix (relative to the working directory) that every
# plt.savefig(saved + "...") call in this notebook writes into.
saved = "ml_hep_sim/notebooks/article_notebooks/saved/"

In [None]:
from ml_hep_sim.notebooks.article_notebooks.test_runs import *
from ml_hep_sim.pipeline.pipes import *
from ml_hep_sim.pipeline.blocks import *

from ml_hep_sim.plotting.style import style_setup, set_size
from ml_hep_sim.stats.stat_plots import two_sample_plot

from ml_hep_sim.data_utils.higgs.process_higgs_dataset import LATEX_COLNAMES, LOG_BIN_RANGES

import matplotlib 
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy

# Apply the project's plotting defaults (helpers from ml_hep_sim.plotting.style).
# NOTE(review): presumably these set the default figure size and Matplotlib style - confirm in that module.
set_size()
style_setup(seaborn_pallete=True)

In [None]:
# Number of flow blocks per trained model: 4, 6, ..., 30 (x axis of the loss plot).
num_flows = np.arange(start=4, stop=32, step=2)

In [None]:
# Build the Glow pipelines with the train / generate / test stages switched off.
# Later cells index each entry as pipe[0] (its .pipes unpack into a config builder
# and a model trainer) and pipe[1] (its first block exposes .metrics).
pipelines = run_glow_pipeline(
    train=False,
    gen=False,
    test=False,
    skip_gen_test=False,
)

In [None]:
# Display the config held by the first block of the first pipeline (sanity check).
pipelines[0][0].pipes[0].config

In [None]:
# Evaluate every trained model: one small test pipeline (load model, build
# dataset, run the coupling-model test) is composed and fitted per entry.
N = 20  # batch size (i.e. 1024 * N data points)

results = []

# The loop variable keeps the name `pipe` because a later cell reads it after the loop.
for pipe in tqdm(pipelines):
    config_block, _, _, trainer_block = pipe[0].pipes

    model_loader = ModelLoaderBlock()(config_block, trainer_block)
    dataset_builder = DatasetBuilderBlock()(config_block)
    loss_tester = CouplingModelTestingBlock(N, loss_cutoff=20)(dataset_builder, model_loader, config_block)

    fitted_pipe = Pipeline()
    fitted_pipe.compose(model_loader, dataset_builder, loss_tester)
    fitted_pipe.fit()

    results.append(fitted_pipe)

In [None]:
# Pull the first two entries (mean and std) out of the last block's results
# for every fitted test pipeline; the third entry is ignored here.
m_lst = []  # mean
s_lst = []  # std

for fitted_pipe in results:
    mean_loss, std_loss, _ = fitted_pipe.pipes[-1].results
    m_lst.append(mean_loss)
    s_lst.append(std_loss)

In [None]:
# Validation loss (mean with std error bars) as a function of the number of flow blocks.
set_size(s=20)
plt.scatter(num_flows, m_lst, s=140)
plt.errorbar(num_flows, m_lst, yerr=s_lst, capsize=4, ls="none", lw=3, capthick=4)
plt.xlim([0, 34])
plt.xlabel("Number of flow blocks", fontsize=29)
plt.ylabel("Validation loss", fontsize=29)

# tight_layout() has to run before savefig(); otherwise the PDF is written with
# the un-adjusted layout and only the inline figure is tightened.
plt.tight_layout()
plt.savefig(saved + "loss_vs_num_flows_glow.pdf")

In [None]:
# Inspect the "timestamp" entry of the first metrics item.
# NOTE(review): `pipe` is the variable left bound by the earlier loop over
# `pipelines` (i.e. its last element); `t` is reassigned inside the next cell's loop.
t = pipe[1].pipes[0].metrics[0]["timestamp"]

In [None]:
# Per-model training curves: validation loss, step index and elapsed time
# (last timestamp minus first timestamp).
val_losses, steps, times = [], [], []

for trained in pipelines:
    metrics = trained[1].pipes[0].metrics
    timestamps = metrics[0]["timestamp"].to_numpy()

    val_losses.append(metrics[-1]["val_loss"])
    steps.append(metrics[-2]["step"])
    times.append(timestamps[-1] - timestamps[0])

In [None]:
# Validation-loss curves for four of the trained models
# (entries 0, 3, 6 and the last one of `pipelines`).
set_size(s=20)

for model_idx in (0, 3, 6, -1):
    plt.plot(steps[model_idx], val_losses[model_idx], lw=4)

plt.legend(["4 blocks", "10 blocks", "16 blocks", "30 blocks"], fontsize=22)
plt.xlim([-800, 2.5*10**4])
plt.xlabel("Steps (early stopping = 15 epochs)", loc="left", fontsize=29)
plt.ylabel("Validation loss", fontsize=29)

plt.tight_layout()
plt.savefig(saved + "val_loss_vs_steps_glow.pdf")

In [None]:
# Repeat the classifier-based scaling test `r` times for the model at
# pipelines[3]: generate N samples, run the classifier on reference and
# generated data, and run ScalingTestBlock on the two classifier outputs.
device = "cuda"

r = 15 # repeats
s = 30 # scaling plot points
N = 10**5

res_lst = []    # per repeat: results of the ScalingTestBlock
class_res = []  # per repeat: classifier results on the reference data

for _ in range(r):
    x_ConfigBuilderBlock, _, _, x_ModelTrainerBlock = pipelines[3][0].pipes
    x_ConfigBuilderBlock.config["datasets"]["data_params"]["subset_n"] = [250000, 100000, N]

    x1 = ModelLoaderBlock(device=device)(x_ConfigBuilderBlock, x_ModelTrainerBlock)

    x2 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x1)
    x3 = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=False)(x1, x2)

    x4 = DatasetBuilderBlock()(x_ConfigBuilderBlock)
    x5 = ReferenceDataLoaderBlock(rescale_reference="logit_normal", device=device)(x4)

    class_run_name = "Higgs_resnet_classifier_train_pipeline"
    class_train_pipeline = Pipeline(pipeline_name=class_run_name, pipeline_path="ml_pipeline/")
    class_train_pipeline.load()

    x6 = ModelLoaderBlock(device=device)(class_train_pipeline.pipes[0], class_train_pipeline.pipes[-1])
    x7 = ClassifierRunnerBlock(save_data=False, device=device)(x5, x6)  # reference data
    x8 = ClassifierRunnerBlock(save_data=False, device=device)(x3, x6)  # generated data

    x9 = ScalingTestBlock(1000, N, s)(x7, x8)

    scaling_pipe = Pipeline()
    scaling_pipe.compose(x1, x2, x3, x4, x5, x6, x7, x8, x9)
    scaling_pipe.fit()

    # Read the classifier results only after fit(), like every other `.results`
    # access in this notebook; the original appended x7.results before the
    # pipeline had been fitted.
    class_res.append(x7.results)

    res = scaling_pipe.pipes[-1].results

    res_lst.append(res)

In [None]:
# (repeat, scaling point) matrices of the test statistics and their critical
# values; only the first element of each result column is kept.
chi2_m, ks_m = np.zeros((r, s)), np.zeros((r, s))
chi2_m_crit, ks_m_crit = np.zeros((r, s)), np.zeros((r, s))

for rep, point in np.ndindex(r, s):
    chi2_df, ks_df = res_lst[rep][point]

    chi2_m[rep, point] = chi2_df["chi2"].to_numpy()[0]
    chi2_m_crit[rep, point] = chi2_df["crit"].to_numpy()[0]
    ks_m[rep, point] = ks_df["ks"].to_numpy()[0]
    ks_m_crit[rep, point] = ks_df["crit"].to_numpy()[0]

In [None]:
# x axis for the scaling plots, taken from the ScalingTestBlock (`x9`) left
# bound by the last iteration of the repeat loop above.
N_range = x9.N_range

In [None]:
# Chi-square score versus number of generated events: mean over repeats with
# std error bars, mean critical value and a weighted straight-line fit.
set_size(s=20)

chi2_mean = chi2_m.mean(axis=0)
chi2_std = chi2_m.std(axis=0)

result_points = plt.scatter(N_range, chi2_mean, s=60, label="result")
(critical_line,) = plt.plot(N_range, chi2_m_crit.mean(axis=0), ls='--', c='C1', lw=3, label="critical")

plt.errorbar(N_range, chi2_mean, yerr=chi2_std, capsize=4, ls="none", lw=2, capthick=2)

def func(x, k, n):
    """Straight line ``k * x + n`` used as the fit model."""
    return k * x + n

popt, pcov = curve_fit(func, N_range, chi2_mean, sigma=chi2_std)
(fit_line,) = plt.plot(
    N_range, func(N_range, *popt), ls='--', c="C2", lw=3,
    label=f"fit $kx+n$, $k=${popt[0]:.2e}",
)

plt.xlim([8000, 1.05*10**5])
plt.xlabel("$N$ generated", loc="center", fontsize=29)
# Raw strings: "\c" is not a valid Python escape sequence.
plt.ylabel(r"$\chi^2$ score", fontsize=29)
# Explicit handles: a labels-only legend pairs labels with artists by an
# implicit order that depends on the Matplotlib version, which can mislabel
# the scatter points, the critical line and the fit.
plt.legend(handles=[critical_line, fit_line, result_points], fontsize=22)
plt.title(r"Glow $\chi^2$ test")

plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp_" file name - confirm it is intended.
plt.savefig(saved + "realnvp_chi2_scaling.pdf")

In [None]:
# KS score versus number of generated events (first scaling point skipped):
# mean over repeats with std error bars, mean critical value and a weighted
# straight-line fit.
set_size(s=20)

n_points = N_range[1:]
ks_mean = ks_m.mean(axis=0)[1:]
ks_std = ks_m.std(axis=0)[1:]

result_points = plt.scatter(n_points, ks_mean, s=80, label="result")
(critical_line,) = plt.plot(n_points, ks_m_crit.mean(axis=0)[1:], ls='--', c="C1", lw=3, label="critical")
plt.errorbar(n_points, ks_mean, yerr=ks_std, capsize=4, ls="none", lw=2, capthick=2)

def func(x, k, n):
    """Straight line ``k * x + n`` used as the fit model."""
    return k * x + n

popt, pcov = curve_fit(func, n_points, ks_mean, sigma=ks_std)
(fit_line,) = plt.plot(
    n_points, func(n_points, *popt), ls='--', c="C2", lw=3,
    label=f"fit $kx+n$, $k=${popt[0]:.2e}",
)

plt.xlim([7000, 1.1*10**5])
plt.xlabel("$N$ generated", loc="center", fontsize=29)
plt.ylabel("KS score", fontsize=29)
# Explicit handles: a labels-only legend pairs labels with artists by an
# implicit order that depends on the Matplotlib version, which can mislabel
# the scatter points, the critical line and the fit.
plt.legend(handles=[critical_line, fit_line, result_points], fontsize=22)
plt.title("Glow KS test")
plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp_" file name - confirm it is intended.
plt.savefig(saved + "realnvp_ks_scaling.pdf")

In [None]:
# Classifier output on reference (MC) and generated data from the last repeat
# of the scaling loop (x7 ran on the reference loader, x8 on the generated data).
a = x7.results  # MC / reference
b = x8.results  # generated

plt.hist(b, histtype="step", range=(-0.5, 1.5), bins=50, lw=2, label="gen")
plt.hist(a, histtype="step", range=(-0.5, 1.5), bins=50, lw=2, label="MC")
plt.xlabel("Classifier output")
# Raw string: "\>" and "\s" are not valid Python escape sequences.
plt.ylabel(r"$N,\>$ $\sum_i=10^5$")
plt.legend()
plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp_" file name - confirm it is intended.
plt.savefig(saved + "realnvp_class_out.pdf")

In [None]:
# Scaling test on the data themselves (no classifier): generated samples are
# compared with the reference data by ScalingTestBlock.
N = 10 ** 5
device = "cpu"

x_ConfigBuilderBlock, _, _, x_ModelTrainerBlock = pipelines[3][0].pipes
data_params = x_ConfigBuilderBlock.config["datasets"]["data_params"]
data_params["subset_n"] = [250000, 100000, N]

flow_loader = ModelLoaderBlock()(x_ConfigBuilderBlock, x_ModelTrainerBlock)

generator = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(flow_loader)
verifier = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=True)(flow_loader, generator)

dataset_builder = DatasetBuilderBlock()(x_ConfigBuilderBlock)
reference_loader = ReferenceDataLoaderBlock()(dataset_builder)

# Kept under the name `x6`: a later cell reads x6.N_range.
x6 = ScalingTestBlock(10000, N, 30)(reference_loader, verifier)

scaling_pipe_full = Pipeline()
scaling_pipe_full.compose(flow_loader, generator, verifier, dataset_builder, reference_loader, x6)
scaling_pipe_full.fit()

In [None]:
# Disabled variant of the previous cell that additionally inserts a
# CutDataBlock between the data blocks and the scaling test. Entirely
# commented out; kept for reference only.
# N = 2 * 10**5
# N_scale = 10**5
# device = "cpu"

# x_ConfigBuilderBlock, _, _, x_ModelTrainerBlock = pipelines[3][0].pipes

# x_ConfigBuilderBlock.config["datasets"]["data_params"]["subset_n"] = [250000, 100000, N]

# x1 = ModelLoaderBlock()(x_ConfigBuilderBlock, x_ModelTrainerBlock)

# x2 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x1)
# x3 = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=True)(x1, x2)

# x4 = DatasetBuilderBlock()(x_ConfigBuilderBlock)
# x5 = ReferenceDataLoaderBlock()(x4)

# x51 = CutDataBlock(0.5, N)(x3, x5)

# x6 = ScalingTestBlock(10000, N_scale, 30)(x51)

# scaling_pipe_full = Pipeline()
# scaling_pipe_full.compose(x1, x2, x3, x4, x5, x51, x6)
# scaling_pipe_full.fit()

# ref = scaling_pipe_full.pipes[-3].reference_data.numpy()
# gen = scaling_pipe_full.pipes[2].generated_data.numpy()

# plt.hist(scaling_pipe_full.pipes[-1].reference_data[:, 0], bins=30, histtype="step")
# plt.hist(scaling_pipe_full.pipes[-1].generated_data[:, 0], bins=30, histtype="step")
# plt.yscale("log")

In [None]:
# Reference and generated samples as NumPy arrays.
# NOTE(review): assumes `.pipes` keeps the compose() order, so pipes[-2] is the
# ReferenceDataLoaderBlock and pipes[2] the GeneratedDataVerifierBlock - confirm.
ref = scaling_pipe_full.pipes[-2].reference_data.numpy()
gen = scaling_pipe_full.pipes[2].generated_data.numpy()

In [None]:
# Rebind N_range to the scaling points of the per-feature scaling test (`x6`);
# used for the x tick labels of the heat maps below.
N_range = x6.N_range

In [None]:
# Quick look at column 0 of the generated (drawn first) and reference samples.
for sample in (gen, ref):
    plt.hist(sample[:, 0], histtype="step", range=(-1, 12), bins=30)
# plt.yscale("log")
plt.tight_layout()

In [None]:
# Results of the last block of `scaling_pipe_full`; iterated below as a
# sequence of (chi2, ks) pairs, one per scaling point.
res = scaling_pipe_full.pipes[-1].results

In [None]:
# One row per feature (18 rows) and one column per scaling point, for the
# statistics and their critical values.
n_scaling_points = len(res)
s_chi2, s_chi2_crit, s_ks, s_ks_crit = (
    np.zeros((18, n_scaling_points)) for _ in range(4)
)

In [None]:
# Copy each scaling point's per-feature statistics into column `col`.
for col, (chi2_df, ks_df) in enumerate(res):
    s_chi2[:, col] = chi2_df["chi2"].to_numpy()
    s_chi2_crit[:, col] = chi2_df["crit"].to_numpy()
    s_ks[:, col] = ks_df["ks"].to_numpy()
    s_ks_crit[:, col] = ks_df["crit"].to_numpy()

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Heat map of log10(chi2 / critical chi2): one row per feature, one column per
# scaling point.
set_size(18)
plt.figure()
ax = plt.gca()

im = ax.imshow(np.log10(s_chi2 / s_chi2_crit))
# im = ax.imshow(s_chi2 / s_chi2_crit)
im.set_clim(-0.4, 0.8)

# Tick positions follow the plotted array instead of hard-coded 18 / 30.
n_features, n_scaling_points = s_chi2.shape

ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(LATEX_COLNAMES)

ax.minorticks_off()

# Label every fifth column, starting from the second, with its N value.
ax.set_xticks(np.arange(n_scaling_points)[1::5])
ax.set_xticklabels(N_range[1::5])

ax.set_xlabel("$N$ generated", loc="center")

# Colour bar in its own axes to the right of the image.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im, cax=cax)
# Raw strings: "\c" is not a valid Python escape sequence.
cax.set_xlabel(r'log $\chi^2/\chi^2_c$', loc="center")
ax.set_title(r"Glow $\chi^2$ test")

plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp" file name - confirm it is intended.
plt.savefig(saved + "imshow_realnvp_chi2.pdf")

In [None]:
# Heat map of KS / critical KS (linear scale): one row per feature, one column
# per scaling point.
set_size(18)
plt.figure()
ax = plt.gca()

# im = ax.imshow(np.log10(s_ks / s_ks_crit))
im = ax.imshow(s_ks / s_ks_crit)
#im.set_clim(-0.4, 0.8)

# Tick positions follow the plotted array instead of hard-coded 18 / 30.
n_features, n_scaling_points = s_ks.shape

ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(LATEX_COLNAMES)

ax.minorticks_off()

# Label every fifth column, starting from the second, with its N value.
ax.set_xticks(np.arange(n_scaling_points)[1::5])
ax.set_xticklabels(N_range[1::5])

ax.set_xlabel("$N$ generated", loc="center")

# Colour bar in its own axes to the right of the image.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im, cax=cax)
# The plotted ratio is not log-scaled (the log10 variant above is commented
# out), so the colour-bar label must not claim "log".
cax.set_xlabel('KS$/$KS$_c$', loc="center")
ax.set_title("Glow KS test")

plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp" file name - confirm it is intended.
plt.savefig(saved + "imshow_realnvp_ks.pdf")

In [None]:
# Load the saved training pipelines of the 10-block Glow model: one with the
# "_sig" suffix and one without, keeping only their lists of blocks.
run_name = "Higgs_Glow"
n = 10

pipeline_path = f"ml_pipeline/{run_name}/"
pipeline_sig_name = f"{run_name}_flow_blocks_{n}_sig_train_pipe"
pipeline_bkg_name = f"{run_name}_flow_blocks_{n}_train_pipe"

sig_train_pipeline = Pipeline(pipeline_name=pipeline_sig_name, pipeline_path=pipeline_path)
pipe_sig = sig_train_pipeline.load().pipes

bkg_train_pipeline = Pipeline(pipeline_name=pipeline_bkg_name, pipeline_path=pipeline_path)
pipe_bkg = bkg_train_pipeline.load().pipes

In [None]:
# Display the blocks of the loaded background training pipeline.
pipe_bkg

In [None]:
# Blocks for the "flow as classifier" test: both loaded models are evaluated
# with CouplingModelTestingBlock(mean=False).
N = 50  # first argument of CouplingModelTestingBlock (a batch count per the earlier loss-test cell)

# Model loaders for the "_sig" pipeline and the pipeline without that suffix.
x1 = ModelLoaderBlock()(pipe_sig[0], pipe_sig[-1], pipe_sig[1])
x2 = ModelLoaderBlock()(pipe_bkg[0], pipe_bkg[-1], pipe_bkg[1])

# Dataset builders from each pipeline's first block.
x3 = DatasetBuilderBlock()(pipe_sig[0])
x4 = DatasetBuilderBlock()(pipe_bkg[0])

# Both testing blocks take the dataset built from pipe_bkg (x4); x3 is not
# passed to either of them.
# NOTE(review): presumably intentional (both models scored on the same data) - confirm.
x5 = CouplingModelTestingBlock(N, mean=False)(x4, x1, pipe_sig[0])
x6 = CouplingModelTestingBlock(N, mean=False)(x4, x2, pipe_bkg[0])

In [None]:
# Compose the six blocks defined above into one pipeline and run it.
# Note: this rebinds `pipe`, which earlier cells used as a loop variable.
pipe = Pipeline()
pipe.compose(x1, x2, x3, x4, x5, x6)
pipe.fit()

In [None]:
# Last element of each testing block's results, flattened to 1-D NumPy arrays.
# NOTE(review): assumes `.pipes` keeps the compose() order, so pipes[-2] is x5
# (model from pipe_sig) and pipes[-1] is x6 (model from pipe_bkg) - confirm.
sig_dist = pipe.pipes[-2].results[-1].flatten().numpy()
bkg_dist = pipe.pipes[-1].results[-1].flatten().numpy()

In [None]:
# NLL distributions from the two models, log-scaled y axis.
plt.hist(sig_dist, histtype="step", bins=40, range=(0, 80), lw=2)
plt.hist(bkg_dist, histtype="step", bins=40, range=(0, 80), lw=2)
plt.yscale("log")
plt.xlabel("NLL")
plt.ylabel("log $N$")
plt.legend(["sig (trained)", "bkg"])

# tight_layout() has to run before savefig(); otherwise the PDF is written with
# the un-adjusted layout.
plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp" file name - confirm it is intended.
plt.savefig(saved + "flow_class_realnvp_log.pdf")

In [None]:
# NLL distributions from the two models, linear y axis and narrower x range.
plt.hist(sig_dist, histtype="step", bins=40, range=(0, 40), lw=2)
plt.hist(bkg_dist, histtype="step", bins=40, range=(0, 40), lw=2)
plt.xlabel("NLL")
plt.ylabel("$N$")
plt.legend(["sig (trained)", "bkg"])

# tight_layout() has to run before savefig(); otherwise the PDF is written with
# the un-adjusted layout.
plt.tight_layout()
# NOTE(review): this Glow figure is written under a "realnvp" file name - confirm it is intended.
plt.savefig(saved + "flow_class_realnvp.pdf")

In [None]:
# Reload the saved 10-block Glow training pipelines (same statements as the
# earlier loading cell), so the following cells start from freshly loaded blocks.
run_name = "Higgs_Glow"
n = 10

pipeline_path = f"ml_pipeline/{run_name}/"
pipeline_sig_name = run_name + f"_flow_blocks_{n}_sig_train_pipe"
pipeline_bkg_name = run_name + f"_flow_blocks_{n}_train_pipe"

# Keep only the list of blocks of each loaded pipeline.
pipe_sig = Pipeline(pipeline_name=pipeline_sig_name, pipeline_path=pipeline_path).load().pipes
pipe_bkg = Pipeline(pipeline_name=pipeline_bkg_name, pipeline_path=pipeline_path).load().pipes

In [None]:
# Blocks for comparing classifier outputs on generated and MC samples, for
# both the signal and the background model.
N = 10**5
device = "cpu"


def _reference_config(data_name):
    """Return a deep copy of the signal pipeline config pointed at `data_name`."""
    cfg = copy.deepcopy(pipe_sig[0].config)
    cfg["datasets"]["data_name"] = data_name
    cfg["datasets"]["data_params"]["subset_n"] = [10 ** 5, 10 ** 5, 10 ** 6]
    return cfg


# Flow models.
x1 = ModelLoaderBlock()(pipe_sig[0], pipe_sig[-1], pipe_sig[1])
x2 = ModelLoaderBlock()(pipe_bkg[0], pipe_bkg[-1], pipe_bkg[1])

# Generated samples from the signal model ...
x3 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x1)
x4 = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=False)(x1, x3)

# ... and from the background model.
x5 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x2)
x6 = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=False)(x2, x5)

# MC reference data: background first, then signal.
config = _reference_config("higgs_bkg")
x71 = DatasetBuilderBlock(config=config)()
x81 = ReferenceDataLoaderBlock(rescale_reference="logit_normal")(x71)

config = _reference_config("higgs_sig")
x72 = DatasetBuilderBlock(config=config)()
x82 = ReferenceDataLoaderBlock(rescale_reference="logit_normal")(x72)

# Trained classifier, run on all four samples.
class_run_name = "Higgs_resnet_classifier_train_pipeline"
class_train_pipeline = Pipeline(pipeline_name=class_run_name, pipeline_path="ml_pipeline/")
class_train_pipeline.load()

x9 = ModelLoaderBlock(device=device)(class_train_pipeline.pipes[0], class_train_pipeline.pipes[-1])

x10 = ClassifierRunnerBlock(save_data=False)(x4, x9)  # sig gen
x11 = ClassifierRunnerBlock(save_data=False)(x6, x9)  # bkg gen
x12 = ClassifierRunnerBlock(save_data=False)(x81, x9)  # MC bkg
x13 = ClassifierRunnerBlock(save_data=False)(x82, x9)  # MC sig

In [None]:
# Compose all blocks from the previous cell (the four classifier runners come
# last) into one pipeline and run it. Rebinds `pipe` again.
pipe = Pipeline()
pipe.compose(x1, x2, x3, x4, x5, x6, x71, x81, x72, x82, x9, x10, x11, x12, x13)
pipe.fit()

In [None]:
# Classifier outputs of the last four blocks.
# NOTE(review): assumes `.pipes` keeps the compose() order, i.e.
# pipes[-4] = x10 (sig gen), pipes[-3] = x11 (bkg gen),
# pipes[-2] = x12 (MC bkg),  pipes[-1] = x13 (MC sig) - confirm.
sig_gen = pipe.pipes[-4].results
bkg_gen = pipe.pipes[-3].results
# MC outputs are truncated to the length of the generated signal output.
sig_mc = pipe.pipes[-1].results[:len(sig_gen)]
bkg_mc = pipe.pipes[-2].results[:len(sig_gen)]

In [None]:
# Classifier output for generated and MC samples, signal and background,
# with the 0.5 decision threshold marked.
set_size(s=20)

for outputs in (sig_gen, bkg_gen, sig_mc, bkg_mc):
    plt.hist(outputs, histtype="step", bins=30, range=(-0.6, 1.2), lw=3)

plt.legend(["sig gen", "bkg gen", "sig MC", "bkg MC"], loc="upper left")
plt.axvline(0.5, c="k", ls='--')
plt.xlabel("Classifier output", fontsize=29)
plt.ylabel("$N$", fontsize=29)
plt.tight_layout()
plt.savefig(saved + "class_gen_mc.pdf")

In [None]:
# Keep only a fraction `sig_per` of the signal outputs (generated and MC).
sig_per = 0.2

# Slice from the pipeline results rather than from the already-truncated
# `sig_gen` / `sig_mc`, so that re-running this cell yields the same arrays
# instead of shrinking them again.
N = len(pipe.pipes[-4].results)
N_sig = int(N * sig_per)

sig_gen = pipe.pipes[-4].results[:N_sig]
sig_mc = pipe.pipes[-1].results[:N_sig]

In [None]:
# Combined signal + background classifier outputs (sub-sampled signal first),
# separately for generated and MC samples.
sig_bkg_gen = np.concatenate([sig_gen, bkg_gen])
sig_bkg_mc = np.concatenate([sig_mc, bkg_mc])

In [None]:
# Classifier output per component and for the combined signal + background
# samples, generated versus MC.
# All six histograms share one range so that their bin widths match; the
# original drew the two combined histograms with range (-0.6, 1.2) and the
# components with (-0.6, 1.5), which made bin counts not comparable.
hist_range = (-0.6, 1.5)

labelled_outputs = [
    (sig_gen, "gen sig"),
    (bkg_gen, "gen bkg"),
    (sig_mc, "mc sig"),
    (bkg_mc, "mc bkg"),
    (sig_bkg_gen, "gen sig+bkg"),
    (sig_bkg_mc, "mc sig+bkg"),
]

for outputs, label in labelled_outputs:
    plt.hist(outputs, histtype="step", bins=30, range=hist_range, lw=2, label=label)

plt.legend(loc="upper left")
plt.axvline(0.5, c="k", ls='--')  # decision threshold
plt.ylabel("$N$")
plt.xlabel("Classifier output")
plt.tight_layout()
plt.savefig(saved + "class_gen_mc_comp.pdf")

In [None]:
# Combined signal + background classifier output only, generated versus MC.
for combined in (sig_bkg_gen, sig_bkg_mc):
    plt.hist(combined, histtype="step", bins=30, range=(-0.6, 1.5), lw=2)

plt.legend(["gen sig+bkg", "mc sig+bkg"], loc="upper left")
plt.axvline(0.5, c="k", ls='--')
plt.xlabel("Classifier output")
plt.ylabel("$N$")
plt.tight_layout()