In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import itertools

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from tqdm import tqdm
import xgboost as xgb #BDT
import pickle
import mplhep as hep

import concurrent.futures as futures

from utils.analysis import Optimization, PandasAnalysis

  from pandas import MultiIndex, Int64Index


In [2]:
babies = glob.glob("../analysis/studies/vbsvvhjets/output_new/Run2/*.root")
babies = [baby for baby in babies if "data" not in baby]
babies

['../analysis/studies/vbsvvhjets/output_new/Run2/TTW.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TTHad.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/SingleTop.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/QCD.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TTH.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/Bosons.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TT1L.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/VBSVVH.root']

In [29]:
vbsvvh = Optimization(
    sig_root_files=babies[-1:],
    bkg_root_files=babies[:-1],
    ttree_name="tree",
    weight_columns=[
        "xsec_sf", 
        "pu_sf",
        "prefire_sf"
    ]
)

vbsvvh.df["bdt_presel"] = vbsvvh.df.eval(
    "ST > 1300"
    + "and hbbfatjet_score > 0.5"
    + "and ld_vqqfatjet_score > 0.5 and tr_vqqfatjet_score > 0.5"
    + "and ld_vqqfatjet_mass < 120 and tr_vqqfatjet_mass < 120"
    + "and hbbfatjet_mass < 150"
)

with open("../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/bdt_features.txt", "r") as txt_file:
    features = txt_file.read().splitlines()
bst = pickle.load(open("../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/bdt.pkl", "rb"))
vbsvvh.df["bdt"] = bst.predict(xgb.DMatrix(vbsvvh.df[features]))

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.84it/s]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.05s/it]
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [30]:
from utils.cutflow import CutflowCollection, Cutflow, Cut

cutflows = CutflowCollection(
    cutflows={name: Cutflow() for name in vbsvvh.df.name.unique()}
)

In [31]:
def update_cutflows(selection, cut_name=None):
    global vbsvvh
    global cutflows
    selection_str = PandasAnalysis.get_selection_str(selection)
    for name, cutflow in cutflows.items():
        # Get previous cut
        if len(cutflow) > 0:
            prev_cut = cutflow[cutflow.terminal_cut_names[-1]]
        else:
            prev_cut = Cut("Dummy")
        # Create new cut
        _df = vbsvvh.df[(vbsvvh.df.name == name)]
        n_pass_raw = len(_df)
        n_pass_wgt = _df.event_weight.sum()
        new_cut = Cut(
            cut_name or selection_str,
            n_pass=n_pass_raw,
            n_pass_weighted=n_pass_wgt,
            n_fail=(prev_cut.n_pass - n_pass_raw),
            n_fail_weighted=(prev_cut.n_pass_weighted - n_pass_wgt),
        )
        # Insert new cut
        if len(cutflow) > 0:
            cutflow.insert(prev_cut.name, new_cut)
        else:
            cutflow.set_root_cut(new_cut)
            
    if "TotalBkg" in cutflows.names:
        cutflows.pop("TotalBkg")

    cutflows["TotalBkg"] = cutflows.sum()

    for name in vbsvvh.df[vbsvvh.df.is_data | vbsvvh.df.is_signal].name.unique():
        cutflows["TotalBkg"] -= cutflows[name]

In [32]:
vbsvvh.make_selection("bdt_presel")
vbsvvh.set_split_column(ratio=0.6, name="split")

update_cutflows("bdt_presel")

In [33]:
vbsvvh.make_selection("M_jj > 600 and abs(deta_jj) > 4")

update_cutflows("M_jj > 600 and abs(deta_jj) > 4")

In [34]:
vbsvvh.get_event_counts()

(8.48347621199817, 207.62503816957337)

In [35]:
np.linspace(0.75, 0.95, 11)

array([0.75, 0.77, 0.79, 0.81, 0.83, 0.85, 0.87, 0.89, 0.91, 0.93, 0.95])

In [36]:
scans = {
    "bdt": np.linspace(0.75, 0.95, 11),
    "hbbfatjet_score": np.linspace(0.5, 0.9, 11),
    "ld_vqqfatjet_score": np.linspace(0.5, 0.9, 11),
    "tr_vqqfatjet_score": np.linspace(0.5, 0.9, 11)
}

cuts = {}
for var, wps in scans.items():
    cuts[var] = []
    for wp in wps:
        cuts[var].append(f"{var} > {wp}")

selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]

fom = lambda S, B: S/np.sqrt(B) if B > 0 else S/np.sqrt(0.0001)
            
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    results.append((sel, sig, bkg, fom(sig, bkg)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14641/14641 [05:03<00:00, 48.30it/s]


In [37]:
results_df = pd.DataFrame(
    {
        "selection": [r[0] for r in results],
        "sig": [r[1] for r in results],
        "bkg": [r[2] for r in results],
        "fom": [r[3] for r in results]
    }
)

In [40]:
results_df[(results_df.sig > 4.5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:15]

Unnamed: 0,selection,sig,bkg,fom
3910,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.526214,0.704336,5.393184
3899,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.593207,0.739712,5.340535
3580,bdt > 0.79 and hbbfatjet_score > 0.78 and ld_v...,4.540158,0.748414,5.248075
3878,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.573396,0.846456,4.970917
3877,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.689965,0.890959,4.968677
3338,bdt > 0.79 and hbbfatjet_score > 0.7 and ld_vq...,4.624939,0.877442,4.937384
5109,bdt > 0.8099999999999999 and hbbfatjet_score >...,4.54364,0.847193,4.936427
5208,bdt > 0.8099999999999999 and hbbfatjet_score >...,4.517215,0.838593,4.932817
3889,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.533466,0.84623,4.928175
3888,bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vq...,4.647162,0.890733,4.923955


In [45]:
bf_SR = results_df[(results_df.sig > 4.5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:15].selection.values[0]
bf_SR

'bdt > 0.79 and hbbfatjet_score > 0.9 and ld_vqqfatjet_score > 0.62 and tr_vqqfatjet_score > 0.7'

In [46]:
bf_rounded_SR = "bdt > 0.8 and hbbfatjet_score > 0.9 and ld_vqqfatjet_score > 0.6 and tr_vqqfatjet_score > 0.7"
vbsvvh.get_event_counts(bf_rounded_SR)

(4.486506345639647, 0.6467198500841318)

In [48]:
vbsvvh.df["SR"] = vbsvvh.df.eval(bf_rounded_SR)

In [49]:
vbsvvh.make_selection("SR")

update_cutflows("SR")

In [50]:
cutflows.reorder(["QCD", "TTHad", "TT1L", "TTW", "TTH", "SingleTop", "Bosons", "TotalBkg", "VBSVVH"])
cutflows.write_csv("test.csv", cutflows.terminal_cut_names[-1])