In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import glob
import itertools

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from tqdm import tqdm
import xgboost as xgb #BDT
import pickle
import mplhep as hep

import concurrent.futures as futures

from utils.analysis import Optimization, PandasAnalysis

# Tag of the analysis output to study; it is interpolated into the glob pattern
# for the baby files as ``output_{TAG}``.
TAG = "abcdnet"

  from pandas import MultiIndex, Int64Index


In [2]:
# Collect every ROOT baby written for this tag (sorted for a stable order).
babies = sorted(glob.glob(f"../analysis/studies/vbsvvhjets/output_{TAG}/Run2/*.root"))

# Classify on the file name only. Matching against the full path would put
# every file in the wrong category as soon as TAG or a parent directory
# happened to contain "VBS" or "data".
baby_names = {baby: baby.replace("\\", "/").rsplit("/", 1)[-1] for baby in babies}

# Signal: file name contains "VBS".
sig_babies = [baby for baby in babies if "VBS" in baby_names[baby]]
# Background: file name contains neither "VBS" nor "data".
bkg_babies = [
    baby for baby in babies
    if "VBS" not in baby_names[baby] and "data" not in baby_names[baby]
]
# Data: file name contains "data".
data_babies = [baby for baby in babies if "data" in baby_names[baby]]

print("Signal:")
print("\n".join(sig_babies))
print("Background:")
print("\n".join(bkg_babies))
print("Data:")
print("\n".join(data_babies))

Signal:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/VBSVVH.root
Background:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/Bosons.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/QCD.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/SingleTop.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TT1L.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTH.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTHad.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTW.root
Data:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/data.root


In [3]:
# Load the signal, background and data babies into a single Optimization object.
# NOTE(review): presumably "event_weight" is derived from the listed scale-factor
# columns -- confirm in utils.analysis.
vbsvvh = Optimization(
    sig_root_files=sig_babies,
    bkg_root_files=bkg_babies,
    data_root_files=data_babies,
    ttree_name="tree",
    weight_columns=["xsec_sf", "pu_sf", "prefire_sf"],
)

# Trivial "object selection" flag: every loaded event passes.
vbsvvh.df["obj_sel"] = True
# vbsvvh.df["qcdnorm_CR"] = vbsvvh.df.eval(
#     "obj_sel and hbbfatjet_xbb > 0.1 and ld_vqqfatjet_xwqq > 0.5 and tr_vqqfatjet_xwqq > 0.5"
# )
# vbsvvh.make_selection("qcdnorm_CR")

# Yields used to normalise QCD to data.
bkg_count = vbsvvh.bkg_count()
qcd_count = vbsvvh.sample_count("QCD")
data_count = vbsvvh.data_count()

# Scale the QCD event weights so that QCD fills the gap between the data yield
# and the yield of all non-QCD backgrounds.
non_qcd_count = bkg_count - qcd_count
is_qcd = vbsvvh.df.name == "QCD"
vbsvvh.df.loc[is_qcd, "event_weight"] *= (data_count - non_qcd_count)/(qcd_count)

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.68s/it]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.56it/s]
Loading data babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.43it/s]


In [13]:
# vbsvvh.make_selection("abs(deta_jj) > 4")

# Working points to scan for each discriminating variable.
scans = {
#     "abcdnet_score": np.linspace(0.89, 0.99, 11),
    "abcdnet_score": np.linspace(0.87, 0.97, 11),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 6),
    "ld_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
    "abs_deta_jj": [3, 3.5, 4],
}

# One "<variable> > <working point>" cut string per working point.
cuts = {
    var: [f"{var} > {wp}" for wp in wps]
    for var, wps in scans.items()
}

# Every combination taking exactly one cut per variable, AND-ed together.
selections = list(map(" and ".join, itertools.product(*cuts.values())))

def fom(S, B):
    """Figure of merit S/sqrt(B), with B floored at 0.0001.

    The floor keeps the result finite when the background yield is zero,
    negative or vanishingly small.
    """
    floored_bkg = max(B, 0.0001)
    return S/np.sqrt(floored_bkg)
            
# Evaluate every candidate selection; each entry of `results` is
# (selection string, signal yield, background yield, figure of merit).
results = []
for selection in tqdm(selections):
    n_sig, n_bkg = vbsvvh.get_event_counts(selection=selection)
    results.append((selection, n_sig, n_bkg, fom(n_sig, n_bkg)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7128/7128 [06:31<00:00, 18.20it/s]


In [4]:
# Working points to scan for each discriminating variable.
scans = {
    "abcdnet_score1": np.linspace(0.79, 0.99, 6),
    "abcdnet_score2": np.linspace(0.79, 0.99, 6),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 6),
    "ld_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
}

# One "<variable> > <working point>" cut string per working point.
cuts = {
    var: [f"{var} > {wp}" for wp in wps]
    for var, wps in scans.items()
}

# Every combination taking exactly one cut per variable, AND-ed together.
selections = list(map(" and ".join, itertools.product(*cuts.values())))

def fom(S, B):
    """Figure of merit S/sqrt(B), with B floored at 0.0001.

    Parameters
    ----------
    S : float
        Signal yield.
    B : float
        Background yield; may be zero or negative.

    Returns
    -------
    float
        ``S / sqrt(max(B, 0.0001))``.

    The floor applies to every B at or below 0.0001, not only to B <= 0.
    Without it a tiny positive background (e.g. 1e-6) would score *higher*
    than a background of exactly zero, making the figure of merit
    non-monotonic in B and letting fluctuation-dominated selections win the
    ranking.
    """
    bkg_floor = 0.0001
    # Explicit comparison rather than max(): a NaN background also falls back
    # to the floor, as it did with the previous `B > 0` test.
    floored_bkg = B if B > bkg_floor else bkg_floor
    return S/np.sqrt(floored_bkg)
            
# Evaluate every candidate selection; each entry of `results` is
# (selection string, signal yield, background yield, figure of merit).
results = []
for selection in tqdm(selections):
    n_sig, n_bkg = vbsvvh.get_event_counts(selection=selection)
    results.append((selection, n_sig, n_bkg, fom(n_sig, n_bkg)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7776/7776 [06:07<00:00, 21.16it/s]


In [5]:
# Tabulate the scan: one row per selection, columns in the order of the
# (selection, sig, bkg, fom) tuples stored in `results`.
results_df = pd.DataFrame(results, columns=["selection", "sig", "bkg", "fom"])

In [16]:
# Top 15 selections by figure of merit among those keeping sig > 4.5 with a
# strictly positive background yield.
passes = (results_df["sig"] > 4.5) & (results_df["bkg"] > 0)
results_df.loc[passes].sort_values("fom", ascending=False).head(15)

Unnamed: 0,selection,sig,bkg,fom
2762,abcdnet_score1 > 0.87 and abcdnet_score2 > 0.7...,4.556552,1.519526,3.696428
2906,abcdnet_score1 > 0.87 and abcdnet_score2 > 0.8...,4.530361,1.520735,3.67372
2726,abcdnet_score1 > 0.87 and abcdnet_score2 > 0.7...,4.630198,1.701724,3.5494
1718,abcdnet_score1 > 0.8300000000000001 and abcdne...,4.501235,1.608317,3.549326
1503,abcdnet_score1 > 0.8300000000000001 and abcdne...,4.508406,1.650642,3.509106
2691,abcdnet_score1 > 0.87 and abcdnet_score2 > 0.7...,4.518579,1.659645,3.507473
2686,abcdnet_score1 > 0.87 and abcdnet_score2 > 0.7...,4.506239,1.67441,3.482438
566,abcdnet_score1 > 0.79 and abcdnet_score2 > 0.8...,4.51057,1.714965,3.444323
141,abcdnet_score1 > 0.79 and abcdnet_score2 > 0.7...,4.526477,1.76124,3.410759
632,abcdnet_score1 > 0.79 and abcdnet_score2 > 0.8...,4.547727,1.78871,3.400356


In [17]:
# Best-fit signal region: the highest-fom selection among those keeping
# sig > 4.5 with a strictly positive background yield.
passes = (results_df["sig"] > 4.5) & (results_df["bkg"] > 0)
ranked = results_df.loc[passes].sort_values("fom", ascending=False)
bf_SR = ranked["selection"].iloc[0]
bf_SR

'abcdnet_score1 > 0.87 and abcdnet_score2 > 0.79 and hbbfatjet_xbb > 0.74 and ld_vqqfatjet_xwqq > 0.74 and tr_vqqfatjet_xwqq > 0.62'

In [25]:
# Hand-rounded signal region; report its yields with their errors.
bf_rounded_SR = "abcdnet_score > 0.98 and hbbfatjet_xbb > 0.75 and ld_vqqfatjet_xwqq > 0.70 and tr_vqqfatjet_xwqq > 0.75 and abs_deta_jj > 3.5"
# bf_rounded_SR = "bdt > 0.8 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.7 and tr_vqqfatjet_xwqq > 0.7"
yields = vbsvvh.get_event_counts(bf_rounded_SR)
errors = vbsvvh.get_event_errors(bf_rounded_SR)
# Both calls return a (signal, background) pair.
sig_count, bkg_count = yields
sig_error, bkg_error = errors
print(
    f"Signal:     {sig_count} +- {sig_error}",
    f"Background: {bkg_count} +- {bkg_error}",
    sep="\n",
)

Signal:     3.003478457899269 +- 0.04881406858149049
Background: 0.7923911260720186 +- 0.3329195450133814


In [24]:
# Hand-rounded signal region; report its yields with their errors.
bf_rounded_SR = "abcdnet_score1 > 0.87 and abcdnet_score2 > 0.8 and hbbfatjet_xbb > 0.74 and ld_vqqfatjet_xwqq > 0.74 and tr_vqqfatjet_xwqq > 0.62"
# bf_rounded_SR = "bdt > 0.8 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.7 and tr_vqqfatjet_xwqq > 0.7"
yields = vbsvvh.get_event_counts(bf_rounded_SR)
errors = vbsvvh.get_event_errors(bf_rounded_SR)
# Both calls return a (signal, background) pair.
sig_count, bkg_count = yields
sig_error, bkg_error = errors
print(
    f"Signal:     {sig_count} +- {sig_error}",
    f"Background: {bkg_count} +- {bkg_error}",
    sep="\n",
)

Signal:     4.520915432929149 +- 0.05940794069761585
Background: 1.3806090251954204 +- 0.38890145375643526


In [49]:
# Second-ranked selection (position 1) by figure of merit among those keeping
# sig > 5 with a strictly positive background yield.
passes = (results_df["sig"] > 5) & (results_df["bkg"] > 0)
ranked = results_df.loc[passes].sort_values("fom", ascending=False)
bf_SR = ranked["selection"].iloc[1]
bf_SR

'bdt > 0.74 and hbbfatjet_xbb > 0.58 and ld_vqqfatjet_xwqq > 0.62 and tr_vqqfatjet_xwqq > 0.66'

In [50]:
# Hand-rounded signal region; report its yields with their errors.
bf_rounded_SR = "bdt > 0.75 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.6 and tr_vqqfatjet_xwqq > 0.65"
yields = vbsvvh.get_event_counts(bf_rounded_SR)
errors = vbsvvh.get_event_errors(bf_rounded_SR)
# Both calls return a (signal, background) pair.
sig_count, bkg_count = yields
sig_error, bkg_error = errors
print(
    f"Signal:     {sig_count} +- {sig_error}",
    f"Background: {bkg_count} +- {bkg_error}",
    sep="\n",
)

Signal:     5.0135211002373286 +- 0.06309293035494924
Background: 1.1027211115484759 +- 0.3359880041887074


In [20]:
# vbsvvh.df["SR"] = vbsvvh.df.eval(bf_rounded_SR)

# vbsvvh.make_selection("SR")
# update_cutflows("SR")

# cutflows.reorder(["QCD", "TTHad", "TT1L", "TTW", "TTH", "SingleTop", "Bosons", "TotalBkg", "VBSVVH"])
# cutflows.write_csv("test.csv", cutflows.terminal_cut_names[-1])