In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import itertools

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from tqdm import tqdm
import xgboost as xgb #BDT
import pickle
import mplhep as hep

import concurrent.futures as futures

from utils.analysis import Optimization

  from pandas import MultiIndex, Int64Index


In [2]:
babies = glob.glob("../analysis/studies/vbsvvhjets/output_new/Run2/*.root")
babies = [baby for baby in babies if "data" not in baby]
babies

['../analysis/studies/vbsvvhjets/output_new/Run2/TTW.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TTHad.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/SingleTop.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/QCD.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TTH.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/Bosons.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/TT1L.root',
 '../analysis/studies/vbsvvhjets/output_new/Run2/VBSVVH.root']

In [3]:
vbsvvh = Optimization(
    sig_root_files=babies[-1:],
    bkg_root_files=babies[:-1],
    ttree_name="tree",
    weight_columns=[
        "xsec_sf", 
        "pu_sf",
        "prefire_sf"
    ]
)

with open("../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/bdt_features.txt", "r") as txt_file:
    features = txt_file.read().splitlines()
bst = pickle.load(open("../analysis/studies/vbsvvhjets/vbsvvhjets_bdt/bdt.pkl", "rb"))
vbsvvh.df["bdt"] = bst.predict(xgb.DMatrix(vbsvvh.df[features]))

Loading sig babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.84it/s]
Loading bkg babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.06s/it]
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [4]:
vbsvvh.make_selection(
    "ST > 1300"
    + "and hbbfatjet_score > 0.5"
    + "and ld_vqqfatjet_score > 0.5 and tr_vqqfatjet_score > 0.5"
    + "and ld_vqqfatjet_mass < 120 and tr_vqqfatjet_mass < 120"
    + "and hbbfatjet_mass < 150"
)
vbsvvh.set_split_column(ratio=0.6, name="split")

In [5]:
vbsvvh.make_selection("M_jj > 500 and abs(deta_jj) > 3")

In [6]:
vbsvvh.get_event_counts()

(8.805810928567691, 285.5180614380611)

In [7]:
scans = {
    "bdt": np.linspace(0.79, 0.99, 11),
    "hbbfatjet_score": np.linspace(0.75, 0.95, 11),
    "ld_vqqfatjet_score": np.linspace(0.75, 0.95, 11),
    "tr_vqqfatjet_score": np.linspace(0.75, 0.95, 11)
}

cuts = {}
for var, wps in scans.items():
    cuts[var] = []
    for wp in wps:
        cuts[var].append(f"{var} > {wp}")

selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]

fom = lambda S, B: S/np.sqrt(B) if B > 0 else S/np.sqrt(0.0001)
            
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    results.append((sel, sig, bkg, fom(sig, bkg)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14641/14641 [04:59<00:00, 48.96it/s]


In [8]:
results_df = pd.DataFrame(
    {
        "selection": [r[0] for r in results],
        "sig": [r[1] for r in results],
        "bkg": [r[2] for r in results],
        "fom": [r[3] for r in results]
    }
)

In [9]:
results_df[(results_df.sig > 4.5) & (results_df.bkg > 0)].sort_values("fom", ascending=False)[:10]

Unnamed: 0,selection,sig,bkg,fom
5929,bdt > 0.87 and hbbfatjet_score > 0.85 and ld_v...,4.644066,0.543949,6.296791
6050,bdt > 0.87 and hbbfatjet_score > 0.87 and ld_v...,4.626166,0.543949,6.272522
6171,bdt > 0.87 and hbbfatjet_score > 0.89 and ld_v...,4.606611,0.543949,6.246007
5940,bdt > 0.87 and hbbfatjet_score > 0.85 and ld_v...,4.584837,0.539702,6.240892
6292,bdt > 0.87 and hbbfatjet_score > 0.90999999999...,4.58612,0.54341,6.221306
6061,bdt > 0.87 and hbbfatjet_score > 0.87 and ld_v...,4.566937,0.539702,6.216527
6182,bdt > 0.87 and hbbfatjet_score > 0.89 and ld_v...,4.547382,0.539702,6.189908
5930,bdt > 0.87 and hbbfatjet_score > 0.85 and ld_v...,4.550129,0.542756,6.176199
6413,bdt > 0.87 and hbbfatjet_score > 0.92999999999...,4.537658,0.541087,6.168764
6303,bdt > 0.87 and hbbfatjet_score > 0.90999999999...,4.526891,0.539164,6.165095
