In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import glob
import itertools

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from tqdm import tqdm
import xgboost as xgb #BDT
import pickle
import mplhep as hep

import concurrent.futures as futures

from utils.analysis import Optimization, PandasAnalysis

TAG = "pnetfix"

  from pandas import MultiIndex, Int64Index


In [3]:
babies = sorted(glob.glob(f"../analysis/studies/vbsvvhjets/output_{TAG}/Run2/*.root"))
sig_babies = [baby for baby in babies if "VBS" in baby]
bkg_babies = [baby for baby in babies if "VBS" not in baby and "data" not in baby]
data_babies = [baby for baby in babies if "data" in baby]
print("Signal:")
print("\n".join(sig_babies))
print("Background:")
print("\n".join(bkg_babies))
print("Data:")
print("\n".join(data_babies))

Signal:
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/VBSVVH.root
Background:
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/Bosons.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/QCD.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/SingleTop.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/TT1L.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/TTH.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/TTHad.root
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/TTW.root
Data:
../analysis/studies/vbsvvhjets/output_pnetfix/Run2/data.root


In [23]:
vbsvvh = Optimization(
    sig_root_files=sig_babies,
    bkg_root_files=bkg_babies,
    data_root_files=data_babies,
    ttree_name="tree",
    weight_columns=[
        "xsec_sf", 
        "pu_sf",
        "prefire_sf"
    ]
)

vbsvvh.df["objsel"] = True
vbsvvh.df["presel"] = vbsvvh.df.eval(
    "objsel and hbbfatjet_xbb > 0.5 and ld_vqqfatjet_xwqq > 0.3 and tr_vqqfatjet_xwqq > 0.3"
)
vbsvvh.make_selection("presel")

bkg_count = vbsvvh.bkg_count()
qcd_count = vbsvvh.sample_count("QCD")
data_count = vbsvvh.data_count()

vbsvvh.df.loc[vbsvvh.df.name == "QCD", "event_weight"] *= (data_count - (bkg_count - qcd_count))/(qcd_count)


# vbsvvh.df["bdt_presel"] = vbsvvh.df.eval(
#     "M_jj > 500 and abs(deta_jj) > 3"
#     + "and hbbfatjet_xbb > 0.5"
#     + "and ld_vqqfatjet_xwqq > 0.5 and tr_vqqfatjet_xwqq > 0.5"
#     + "and ST > 1300"
#     + "and hbbfatjet_mass < 150"
#     + "and ld_vqqfatjet_mass < 120 and tr_vqqfatjet_mass < 120"
# )
# vbsvvh.df["bdt_presel"] = vbsvvh.df.eval(
#     "ST > 1300"
#     + " and hbbfatjet_xbb > 0.5"
#     + " and hbbfatjet_mass < 150"
#     + " and ld_vqqfatjet_xwqq > 0.5 and tr_vqqfatjet_xwqq > 0.5"
#     + " and ld_vqqfatjet_mass < 120 and tr_vqqfatjet_mass < 120"
# )
# vbsvvh.make_selection("bdt_presel")

bdt_name = "bdt_mediumPresel"
with open(f"../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/{bdt_name}_features.txt", "r") as txt_file:
    features = txt_file.read().splitlines()
bst = pickle.load(open(f"../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/{bdt_name}.pkl", "rb"))
vbsvvh.df["bdt"] = bst.predict(xgb.DMatrix(vbsvvh.df[features]))

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.41s/it]
Loading data babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.47s/it]
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [24]:
scans = {
    "bdt": np.linspace(0.8, 0.9, 6),
    "hbbfatjet_xbb": np.linspace(0.5, 0.9, 6),
    "ld_vqqfatjet_xwqq": np.linspace(0.5, 0.9, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.5, 0.9, 6),
    "abs_deta_jj": [3, 3.5, 4],
    "M_jj": [400, 500, 600]
}

cuts = {}
for var, wps in scans.items():
    cuts[var] = []
    for wp in wps:
        cuts[var].append(f"{var} > {wp}")

selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]

fom = lambda S, B: S/np.sqrt(B) if B > 0 else S/np.sqrt(0.0001)
            
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    results.append((sel, sig, bkg, fom(sig, bkg)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11664/11664 [09:52<00:00, 19.70it/s]


In [25]:
results_df = pd.DataFrame(
    {
        "selection": [r[0] for r in results],
        "sig": [r[1] for r in results],
        "bkg": [r[2] for r in results],
        "fom": [r[3] for r in results]
    }
)

In [26]:
results_df[(results_df.sig > 5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:15]

Unnamed: 0,selection,sig,bkg,fom
9962,bdt > 0.9 and hbbfatjet_xbb > 0.5 and ld_vqqfa...,5.045029,0.807172,5.615398
8666,bdt > 0.88 and hbbfatjet_xbb > 0.66 and ld_vqq...,5.005103,0.975559,5.067411
5650,bdt > 0.8400000000000001 and hbbfatjet_xbb > 0...,5.01508,1.019257,4.967479
5102,bdt > 0.8400000000000001 and hbbfatjet_xbb > 0...,5.068073,1.061146,4.919889
8342,bdt > 0.88 and hbbfatjet_xbb > 0.58 and ld_vqq...,5.093697,1.118699,4.815889
10241,bdt > 0.9 and hbbfatjet_xbb > 0.58 and ld_vqqf...,5.027383,1.103192,4.786483
3706,bdt > 0.8200000000000001 and hbbfatjet_xbb > 0...,5.073891,1.125716,4.782189
3752,bdt > 0.8200000000000001 and hbbfatjet_xbb > 0...,5.027917,1.115374,4.760777
5649,bdt > 0.8400000000000001 and hbbfatjet_xbb > 0...,5.07852,1.153125,4.729323
1763,bdt > 0.8 and hbbfatjet_xbb > 0.9 and ld_vqqfa...,5.026069,1.149155,4.688557


In [27]:
bf_SR = results_df[(results_df.sig > 5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:15].selection.values[0]
bf_SR

'bdt > 0.9 and hbbfatjet_xbb > 0.5 and ld_vqqfatjet_xwqq > 0.8200000000000001 and tr_vqqfatjet_xwqq > 0.66 and abs_deta_jj > 4 and M_jj > 600'

In [29]:
bf_rounded_SR = "bdt > 0.9 and hbbfatjet_xbb > 0.5 and ld_vqqfatjet_xwqq > 0.82 and tr_vqqfatjet_xwqq > 0.66 and abs_deta_jj > 5 and M_jj > 600"
sig_count, bkg_count = vbsvvh.get_event_counts(bf_rounded_SR)
sig_error, bkg_error = vbsvvh.get_event_errors(bf_rounded_SR)
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")

Signal:     4.660689082644167 +- 0.06067747633923873
Background: 0.5678633281235802 +- 0.21793656600851102


In [49]:
bf_SR = results_df[(results_df.sig > 5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:15].selection.values[1]
bf_SR

'bdt > 0.74 and hbbfatjet_xbb > 0.58 and ld_vqqfatjet_xwqq > 0.62 and tr_vqqfatjet_xwqq > 0.66'

In [50]:
bf_rounded_SR = "bdt > 0.75 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.6 and tr_vqqfatjet_xwqq > 0.65"
sig_count, bkg_count = vbsvvh.get_event_counts(bf_rounded_SR)
sig_error, bkg_error = vbsvvh.get_event_errors(bf_rounded_SR)
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")

Signal:     5.0135211002373286 +- 0.06309293035494924
Background: 1.1027211115484759 +- 0.3359880041887074


In [20]:
# vbsvvh.df["SR"] = vbsvvh.df.eval(bf_rounded_SR)

# vbsvvh.make_selection("SR")
# update_cutflows("SR")

# cutflows.reorder(["QCD", "TTHad", "TT1L", "TTW", "TTH", "SingleTop", "Bosons", "TotalBkg", "VBSVVH"])
# cutflows.write_csv("test.csv", cutflows.terminal_cut_names[-1])