In [None]:
import numpy as np

In [None]:
# Move the kernel's working directory three levels up so that the relative
# paths used below resolve (assumes the kernel starts in this notebook's own
# folder — TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up another three levels.
%cd ../../../

In [None]:
# Output directory (relative to the current working directory) that every
# plt.savefig call in this notebook prepends to its file name.
saved = "ml_hep_sim/notebooks/article_notebooks/saved/"

In [None]:
from ml_hep_sim.notebooks.article_notebooks.test_runs import *
from ml_hep_sim.pipeline.pipes import *
from ml_hep_sim.pipeline.blocks import *

from ml_hep_sim.plotting.style import style_setup, set_size
from ml_hep_sim.stats.stat_plots import two_sample_plot

from ml_hep_sim.data_utils.higgs.process_higgs_dataset import LATEX_COLNAMES, LOG_BIN_RANGES

import matplotlib 
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy

# Apply the project's plotting defaults (helpers imported from
# ml_hep_sim.plotting.style) before any figure is drawn; their exact effect is
# defined in that module.
set_size()
style_setup(seaborn_pallete=True)

In [None]:
# Spline bin counts to compare; passed to run_spline_pipeline in the next cell.
num_splines = [4, 8, 12, 32]

In [None]:
# Build the spline pipelines with the train/gen/test stages switched off
# (presumably this only reloads previously saved runs — confirm in test_runs).
# The cells below expect one pipeline per entry of num_splines, in that order.
pipelines = run_spline_pipeline(train=False, gen=False, test=False, num_splines=num_splines)

In [None]:
# Per-pipeline training diagnostics: validation-loss curve, matching step axis
# and the span between the first and last logged timestamp.
val_losses, steps, times = [], [], []

for spline_pipeline in pipelines:
    # Run the loader block on this pipeline's blocks and read its metrics.
    loaded_model = ModelLoaderBlock()(*spline_pipeline.pipes)._run()
    logged = loaded_model.metrics

    # The loss is read from the last metrics entry, the steps from the
    # second-to-last one and the timestamps from the first one.
    loss_curve, step_axis = logged[-1]["val_loss"], logged[-2]["step"]
    stamps = logged[0]["timestamp"].to_numpy()

    times.append(stamps[-1] - stamps[0])
    val_losses.append(loss_curve)
    steps.append(step_axis)

In [None]:
set_size(s=20)

# One validation-loss curve per trained model. Each label is taken from the
# matching num_splines entry so the legend cannot drift from the configured
# bin counts (assumes steps/val_losses follow the order of num_splines, as the
# pipelines were built from that list).
for n_bins, step_axis, loss_curve in zip(num_splines, steps, val_losses):
    plt.plot(step_axis, loss_curve, lw=4, label=f"{n_bins} bins")

plt.legend(fontsize=22)
plt.xlim([-800, 2.5*10**4])
plt.xlabel("Steps", loc="center", fontsize=29)
plt.ylabel("Validation loss", fontsize=29)

plt.tight_layout()
plt.savefig(saved + "val_loss_vs_steps_splines.pdf")

In [None]:
# Select the 32-bin spline pipeline used for the scaling studies below.
# num_train is ten log-spaced integer sizes between 1e4 and 1e6, and [-1] keeps
# the last pipeline returned (presumably the one for the largest num_train —
# confirm in test_runs). sig=False presumably selects the background sample.
# NOTE(review): the three positional False flags look like the train/gen/test
# switches used as keywords in the earlier call — confirm the parameter order.
pipeline = run_spline_pipeline(
        False,
        False,
        False,
        sig=False,
        num_splines=[32],
        name_str="",
        num_train=np.logspace(4, 6, 10).astype(int),
    )[-1]

In [None]:
# Display the selected pipeline as the cell output for a quick inspection.
pipeline

In [None]:
# Repeat the classifier-based scaling test r times: each repeat generates N
# fresh samples from the flow model, passes generated and reference data
# through the pre-trained classifier, and runs ScalingTestBlock on both outputs.
device = "cuda"

r = 15 # repeats
s = 30 # scaling plot points
N = 10**5

res_lst = []
class_res = []

# Loop-invariant setup, done once instead of on every repeat.
# NOTE: this edits the config of the shared `pipeline` blocks in place.
x_ConfigBuilderBlock, _, _, x_ModelTrainerBlock = pipeline.pipes
x_ConfigBuilderBlock.config["datasets"]["data_params"]["subset_n"] = [250000, 100000, N]

# The pre-trained classifier pipeline is the same for every repeat, so it is
# loaded from disk a single time.
class_run_name = "Higgs_resnet_classifier_train_pipeline"
class_train_pipeline = Pipeline(pipeline_name=class_run_name, pipeline_path="ml_pipeline/")
class_train_pipeline.load()

for _repeat in range(r):
    # Flow model -> generated samples -> verified generated data.
    x1 = ModelLoaderBlock(device=device)(x_ConfigBuilderBlock, x_ModelTrainerBlock)

    x2 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x1)
    x3 = GeneratedDataVerifierBlock(save_data=False, device=device, rescale_data=False)(x1, x2)

    # Reference data built from the same config.
    x4 = DatasetBuilderBlock()(x_ConfigBuilderBlock)
    x5 = ReferenceDataLoaderBlock(rescale_reference="logit_normal", device=device)(x4)

    # Classifier applied to reference (x7) and generated (x8) data.
    x6 = ModelLoaderBlock(device=device)(class_train_pipeline.pipes[0], class_train_pipeline.pipes[-1])
    x7 = ClassifierRunnerBlock(save_data=False, device=device)(x5, x6)
    x8 = ClassifierRunnerBlock(save_data=False, device=device)(x3, x6)

    x9 = ScalingTestBlock(1000, N, s)(x7, x8)

    scaling_pipe = Pipeline()
    scaling_pipe.compose(x1, x2, x3, x4, x5, x6, x7, x8, x9)
    scaling_pipe.fit()

    # Block outputs are read only after fit(): up to that point the blocks are
    # merely wired together, so x7.results would not yet hold this repeat's
    # classifier output (presumably populated by fit(), like the scaling
    # results read on the next line).
    class_res.append(x7.results)

    res = scaling_pipe.pipes[-1].results

    res_lst.append(res)

In [None]:
# One (repeats x scaling points) matrix per test statistic and critical value.
chi2_m, ks_m, chi2_m_crit, ks_m_crit = (np.zeros((r, s)) for _ in range(4))

# Walk the grid row by row: ri indexes the repeat, si the scaling point.
for ri, si in np.ndindex(r, s):
    chi2, ks = res_lst[ri][si]

    # Only the first entry of each column goes into the summary matrices.
    for target, table, column in (
        (chi2_m, chi2, "chi2"),
        (ks_m, ks, "ks"),
        (chi2_m_crit, chi2, "crit"),
        (ks_m_crit, ks, "crit"),
    ):
        target[ri, si] = table[column].to_numpy()[0]

In [None]:
# Sample sizes of the scaling test, read from the ScalingTestBlock left over
# from the last repeat of the loop above; used as the x axis of the next plots.
N_range = x9.N_range

In [None]:
set_size(20)

def func(x, k, n):
    """Straight line ``k * x + n`` fitted to the mean score as a function of N."""
    return k * x + n

# Aggregate over the repeat axis (rows are repeats, columns are sample sizes).
chi2_mean = chi2_m.mean(axis=0)
chi2_std = chi2_m.std(axis=0)

# Artists are created in the same order as before so colours are unchanged;
# the handles are kept so the legend can reference them explicitly.
points = plt.scatter(N_range, chi2_mean, s=60)
(crit_line,) = plt.plot(N_range, chi2_m_crit.mean(axis=0), ls='--', c='C1', lw=3)

plt.errorbar(N_range, chi2_mean, yerr=chi2_std, capsize=4, ls="none", lw=2, capthick=2)

# Linear fit of the mean score, weighted by the spread over repeats.
popt, pcov = curve_fit(func, N_range, chi2_mean, sigma=chi2_std)
(fit_line,) = plt.plot(N_range, func(N_range, *popt), ls='--', c="C2", lw=3)

plt.xlim([-2000, 1.05*10**5])
plt.xlabel("$N$ generated", loc="center", fontsize=29)
# Raw strings: "\c" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"$\chi^2$ score", fontsize=29)
# Explicit handles: a labels-only legend pairs labels with artists by their
# implicit order, which depends on the matplotlib version when scatter points
# and lines are mixed.
plt.legend(
    [crit_line, fit_line, points],
    ["critical", f"fit $kx+n$, $k=${popt[0]:.2e}", "result"],
    fontsize=22,
)
plt.title(r"RQS $\chi^2$ test")

plt.tight_layout()
plt.savefig(saved + "rqs_chi2_scaling.pdf")

In [None]:
set_size(20)

def func(x, k, n):
    """Straight line ``k * x + n`` fitted to the mean score as a function of N."""
    return k * x + n

# The first scaling point is left out of this figure, so slice everything once.
N_shown = N_range[1:]
ks_mean = ks_m.mean(axis=0)[1:]
ks_std = ks_m.std(axis=0)[1:]

# Artists are created in the same order as before so colours are unchanged;
# the handles are kept so the legend can reference them explicitly.
points = plt.scatter(N_shown, ks_mean, s=80)
(crit_line,) = plt.plot(N_shown, ks_m_crit.mean(axis=0)[1:], ls='--', c="C1", lw=3)
plt.errorbar(N_shown, ks_mean, yerr=ks_std, capsize=4, ls="none", lw=2, capthick=2)

# Linear fit of the mean score, weighted by the spread over repeats.
popt, pcov = curve_fit(func, N_shown, ks_mean, sigma=ks_std)
(fit_line,) = plt.plot(N_shown, func(N_shown, *popt), ls='--', c="C2", lw=3)

plt.xlim([1000, 1.1*10**5])
plt.ylim([0, 3.4*10**(-2)])
plt.xlabel("$N$ generated", loc="center", fontsize=29)
plt.ylabel("KS score", fontsize=29)
# Explicit handles: a labels-only legend pairs labels with artists by their
# implicit order, which depends on the matplotlib version when scatter points
# and lines are mixed.
plt.legend(
    [crit_line, fit_line, points],
    ["critical", f"fit $kx+n$, $k=${popt[0]:.2e}", "result"],
    fontsize=22,
    loc="upper right",
)
plt.title("RQS KS test")
plt.tight_layout()
plt.savefig(saved + "rqs_ks_scaling.pdf")

In [None]:
# Second scaling study: ScalingTestBlock compares the reference data (x5) with
# the generated data (x3) directly, without a classifier in between.
N = 10 ** 5
device = "cuda"

# NOTE(review): these are the same block objects used by the repeated-run cell
# above, which edits config["datasets"]["data_params"]["subset_n"] in place —
# so the data setup here depends on whether that cell has already been run.
x_ConfigBuilderBlock, _, _, x_ModelTrainerBlock = pipeline.pipes

x1 = ModelLoaderBlock(device=device)(x_ConfigBuilderBlock, x_ModelTrainerBlock)

# Generated samples; unlike the repeated-run cell, rescale_data is left at its
# default here (NOTE(review): confirm this difference is intended).
x2 = DataGeneratorBlock(N, model_type="flow", chunks=10, device=device)(x1)
x3 = GeneratedDataVerifierBlock(save_data=False, device=device)(x1, x2)

# Reference data, likewise loaded with the default rescale_reference.
x4 = DatasetBuilderBlock()(x_ConfigBuilderBlock)
x5 = ReferenceDataLoaderBlock(device=device)(x4)

# 30 scaling points (third argument, as with `s` in the earlier cell);
# presumably spanning sample sizes from 10000 up to N — confirm in
# ScalingTestBlock.
x6 = ScalingTestBlock(10000, N, 30)(x5, x3)

scaling_pipe_full = Pipeline()
scaling_pipe_full.compose(x1, x2, x3, x4, x5, x6)
scaling_pipe_full.fit()

In [None]:
# Sample sizes of the second scaling test; this rebinds N_range, replacing the
# value taken from the classifier-based study.
N_range = x6.N_range

In [None]:
# Results of the last block of the fitted pipeline (the ScalingTestBlock); the
# cells below iterate over it and unpack each entry as a (chi2, ks) pair.
res = scaling_pipe_full.pipes[-1].results

In [None]:
# Four independent zero-filled grids with 18 rows and one column per entry of
# res: statistic and critical value for both the chi2 and the KS test.
s_chi2, s_chi2_crit, s_ks, s_ks_crit = (np.zeros((18, len(res))) for _ in range(4))

In [None]:
# Column i of every grid receives the values of the i-th entry of res.
# The (chi2, ks) pair is unpacked in the loop header so that the notebook-level
# repeat count `r` is not overwritten, which would break re-running the
# earlier cells that use it.
for i, (chi2, ks) in enumerate(res):
    s_chi2[:, i] = chi2["chi2"].to_numpy()
    s_ks[:, i] = ks["ks"].to_numpy()
    s_chi2_crit[:, i] = chi2["crit"].to_numpy()
    s_ks_crit[:, i] = ks["crit"].to_numpy()

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

set_size(18)

plt.figure()
ax = plt.gca()

# Heat map of log10(statistic / critical value); rows are labelled with the
# feature names, columns with the sample sizes.
im = ax.imshow(np.log10(s_chi2 / s_chi2_crit))
im.set_clim(-0.4, 0.8)

# One tick per grid row, taken from the grid itself instead of a literal count.
ax.set_yticks(np.arange(s_chi2.shape[0]))
ax.set_yticklabels(LATEX_COLNAMES)

ax.minorticks_off()

# Tick positions and labels are sliced from the same sequence with the same
# step, so their counts always agree.
ax.set_xticks(np.arange(len(N_range))[1::5])
ax.set_xticklabels(N_range[1::5])

ax.set_xlabel("$N$ generated", loc="center")

# Colour bar in its own axes to the right of the image.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im, cax=cax)
# Raw strings: "\c" is not a valid escape sequence in a normal string literal.
cax.set_xlabel(r'log $\chi^2/\chi^2_c$', loc="center")
ax.set_title(r"RQS $\chi^2$ test")

plt.tight_layout()
plt.savefig(saved + "imshow_rqs_chi2.pdf")

In [None]:
set_size(18)

plt.figure()
ax = plt.gca()

# Heat map of log10(statistic / critical value); rows are labelled with the
# feature names, columns with the sample sizes.
im = ax.imshow(np.log10(s_ks / s_ks_crit))
im.set_clim(-0.4, 0.8)

# One tick per grid row, taken from the grid itself instead of a literal count.
ax.set_yticks(np.arange(s_ks.shape[0]))
ax.set_yticklabels(LATEX_COLNAMES)

ax.minorticks_off()

# Tick positions and labels are sliced from the same sequence with the same
# step, so their counts always agree.
ax.set_xticks(np.arange(len(N_range))[1::5])
ax.set_xticklabels(N_range[1::5])

ax.set_xlabel("$N$ generated", loc="center")

# Colour bar in its own axes to the right of the image; make_axes_locatable is
# imported in the preceding heat-map cell.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im, cax=cax)
cax.set_xlabel('log KS$/$KS$_c$', loc="center")
ax.set_title("RQS KS test")

plt.tight_layout()
plt.savefig(saved + "imshow_rqs_ks.pdf")