In [35]:
import os

import pandas as pd
from tqdm import tqdm

from diquark import DATA_KEYS, PATH_DICT, CROSS_SECTION_DICT
from diquark.helpers import create_data_dict, get_col
from diquark.load import read_jet_delphes, read_met_delphes
from diquark.features import (
    jet_multiplicity,
    leading_jet_arr,
    calculate_delta_r,
    combined_invariant_mass,
    three_jet_invariant_mass,
    calculate_missing_transverse_energy,
    missing_ET,
)
from diquark.plotting import make_histogram, make_histogram_with_double_gaussian_fit


if os.getcwd().split("/")[-1] == "notebooks":
    os.chdir("..")

In [37]:
datasets = {key: read_jet_delphes(PATH_DICT[key]) for key in tqdm(DATA_KEYS)}
datasets2 = {key2: read_met_delphes(PATH_DICT[key2]) for key2 in tqdm(DATA_KEYS)}


  0%|          | 0/2 [00:43<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00,  6.67it/s]
100%|██████████| 2/2 [00:00<00:00,  6.92it/s]


In [38]:
jet_multiplicities = {key: jet_multiplicity(ds) for key, ds in tqdm(datasets.items())}

100%|██████████| 2/2 [00:00<00:00, 9532.51it/s]


In [39]:
jet_pts = {key: leading_jet_arr(data, key="Jet/Jet.PT") for key, data in tqdm(datasets.items())}
jet_etas = {key: leading_jet_arr(data, key="Jet/Jet.Eta") for key, data in tqdm(datasets.items())}
jet_phis = {key: leading_jet_arr(data, key="Jet/Jet.Phi") for key, data in tqdm(datasets.items())}

100%|██████████| 2/2 [00:00<00:00, 434.42it/s]
100%|██████████| 2/2 [00:00<00:00, 466.50it/s]
100%|██████████| 2/2 [00:00<00:00, 477.68it/s]


In [41]:
miss_MET = {key2: missing_ET(data2, key2="MissingET/MissingET.MET") for key2, data2 in tqdm(datasets2.items())}


100%|██████████| 2/2 [00:00<00:00, 642.07it/s]


In [46]:
missing_energy = {} #key: calculate_missing_transverse_energy(px_miss, py_miss) for key, arr in tqdm(datasets.items())
for key2, data2 in tqdm(datasets2.items()):
    px_miss = missing_ET(data2, 4, key2="MissingET/MissingET.MET")
    py_miss = missing_ET(data2, 4, key2="MissingET/MissingET.MET")
    
    missing_energy[key2] = calculate_missing_transverse_energy(px_miss, py_miss)


100%|██████████| 2/2 [00:00<00:00, 356.45it/s]


In [48]:
combined_masses = {key: combined_invariant_mass(arr) for key, arr in tqdm(datasets.items())}

100%|██████████| 2/2 [00:00<00:00, 64.50it/s]


In [49]:
delta_rs = {}
for key, data in tqdm(datasets.items()):
    etas = leading_jet_arr(data, 6, key="Jet/Jet.Eta")
    phis = leading_jet_arr(data, 6, key="Jet/Jet.Phi")
    pts = leading_jet_arr(data, 6, key="Jet/Jet.PT")

    delta_rs[key] = calculate_delta_r(etas, phis, pts)

100%|██████████| 2/2 [00:00<00:00, 47.37it/s]


In [50]:
m3j_s = {}
for key, data in tqdm(datasets.items()):
    m3j_s[key] = three_jet_invariant_mass(data)

100%|██████████| 2/2 [00:01<00:00,  1.02it/s]


In [51]:
fig = make_histogram(combined_masses, 20, clip_top_prc=100)
bin_width = fig.data[0].x[1] - fig.data[0].x[0]
fig.update_layout(
    title="6-jet Mass",
    xaxis_title="Invariant Mass [GeV]",
    yaxis_title_text="count x sigma",
    barmode="stack",
    bargap=0,
    width=1600 * (2 / 3),
    height=900 * (2 / 3),
    # ignore first bin
    xaxis_range=[fig.data[0].x[1] - bin_width / 2, fig.data[0].x[-1] + bin_width / 2],
    yaxis_type="log",
)

fig.show()
print(
    [fig.data[0].x[1] - bin_width, fig.data[0].x[-1]],
)

  0%|          | 0/2 [07:13<?, ?it/s]


[0.0, 9417.129991671465]


In [52]:
ds = create_data_dict(
    **{
        "multiplicity": jet_multiplicities,
        "delta_R": delta_rs,
        "m3j": m3j_s,
        "inv_mass": combined_masses,
        "pt": jet_pts,
        "eta": jet_etas,
        "phi": jet_phis,
        "mET": missing_energy,
    }
)

In [53]:
df = pd.DataFrame(ds)
df["target"] = df["Truth"].apply(lambda x: 1 if "SIG" in x else 0)

In [54]:
df.head()

Unnamed: 0,Truth,multiplicity,delta_R_1,delta_R_2,delta_R_3,delta_R_4,delta_R_5,delta_R_6,delta_R_7,delta_R_8,...,phi_2,phi_3,phi_4,phi_5,phi_6,mET_1,mET_2,mET_3,mET_4,target
0,BKG:semilepto_test,7,0.0,1.616195,2.158732,2.158483,2.304399,0.735744,0.0,0.0,...,2.587464,2.895854,-0.808096,-1.254239,0.474848,1627.102595,0.0,0.0,0.0,0
1,BKG:semilepto_test,7,0.0,2.794346,3.630165,2.483253,1.819955,2.97105,0.0,0.0,...,-2.978855,2.746671,2.002219,-1.144244,2.155058,846.806205,0.0,0.0,0.0,0
2,BKG:semilepto_test,6,0.0,3.095241,2.633242,2.163251,2.645165,2.844667,0.0,0.0,...,-0.625289,-1.476933,-1.581123,-2.121839,-0.053876,233.95649,0.0,0.0,0.0,0
3,BKG:semilepto_test,5,0.0,1.476971,1.050706,3.225524,2.661287,0.0,0.0,0.0,...,-2.573591,-3.015218,-1.501568,-0.330397,0.0,2224.845368,0.0,0.0,0.0,0
4,BKG:semilepto_test,5,0.0,3.084862,1.971243,3.002193,2.314375,0.0,0.0,0.0,...,-2.473694,2.331247,-2.901099,2.750091,0.0,2110.220183,0.0,0.0,0.0,0


In [55]:
df.to_parquet("data/full_sample.parquet", index=False)

# Data Visualization

In [56]:
fig = make_histogram(jet_pts, 20, col=0, clip_top_prc=100)
bin_width = fig.data[0].x[1] - fig.data[0].x[0]
fig.update_layout(
    title="6-jet Mass",
    xaxis_title="Invariant Mass [GeV]",
    yaxis_title_text="count x sigma",
    barmode="stack",
    bargap=0,
    width=1600 * (2 / 3),
    height=900 * (2 / 3),
    # ignore first bin
    xaxis_range=[fig.data[0].x[1] - bin_width / 2, fig.data[0].x[-1] + bin_width / 2],
    yaxis_type="log",
)

fig.show()

In [57]:
suu_mass = {"SIG:suu": combined_masses["SIG:suu"]}
fig = make_histogram_with_double_gaussian_fit(suu_mass, 20, clip_top_prc=100, cross=None)
bin_width = fig.data[0].x[1] - fig.data[0].x[0]
fig.update_layout(
    title="6-jet Mass",
    xaxis_title="Invariant Mass [GeV]",
    yaxis_title_text="probability density",
    barmode="stack",
    bargap=0,
    width=1300 * (2 / 3),
    height=1300 * (2 / 3),
    # ignore first bin
    xaxis_range=[fig.data[0].x[1] - bin_width / 2, fig.data[0].x[-1] + bin_width / 2],
    yaxis_type="log",
)
fig.update_legends(
    title_text="",
    itemsizing="constant",
    yanchor="top",
    y=0.1,
    xanchor="left",
    x=0.01,
    font=dict(size=16),
)
fig.show()
# fig.write_image("suu_mass.pdf")

RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800.