In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import glob
import itertools

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from tqdm import tqdm
import xgboost as xgb #BDT
import pickle
import mplhep as hep

import concurrent.futures as futures

from utils.analysis import Optimization, PandasAnalysis

# Tag interpolated into the baby directory name (output_<TAG>) that the ROOT
# files are globbed from in the next cell.
TAG = "abcdnet"

  from pandas import MultiIndex, Int64Index


In [2]:
# Gather every Run2 baby ROOT file written for this TAG, then sort the paths
# into signal / background / data purely by substring match on the path.
babies = sorted(glob.glob(f"../analysis/studies/vbsvvhjets/output_{TAG}/Run2/*.root"))

sig_babies, bkg_babies, data_babies = [], [], []
for baby in babies:
    # The three tests are independent on purpose: a path is placed in every
    # category whose condition it satisfies.
    if "VBSVVH" in baby:
        sig_babies.append(baby)
    if "VBS" not in baby and "data" not in baby:
        bkg_babies.append(baby)
    if "data" in baby:
        data_babies.append(baby)

# Echo the classification so a misfiled sample is easy to spot.
for header, paths in (
    ("Signal:", sig_babies),
    ("Background:", bkg_babies),
    ("Data:", data_babies),
):
    print(header)
    print("\n".join(paths))

Signal:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/VBSVVH.root
Background:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/Bosons.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/QCD.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/SingleTop.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TT1L.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTH.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTHad.root
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/TTW.root
Data:
../analysis/studies/vbsvvhjets/output_abcdnet/Run2/data.root


In [3]:
# Build the Optimization object from the three lists of baby files found
# above. Every argument is forwarded by keyword, so ordering is irrelevant.
# NOTE(review): the weight columns are presumably combined into the per-event
# weight inside utils.analysis.Optimization -- confirm there.
optimization_inputs = dict(
    ttree_name="tree",
    sig_root_files=sig_babies,
    bkg_root_files=bkg_babies,
    data_root_files=data_babies,
    weight_columns=["xsec_sf", "pu_sf", "prefire_sf", "qcdnorm_sf"],
)
vbsvvh = Optimization(**optimization_inputs)

# Flag column set to True for every row of the loaded frame.
vbsvvh.df["obj_sel"] = True
# vbsvvh.df["qcdnorm_CR"] = vbsvvh.df.eval(
#     "obj_sel and hbbfatjet_xbb > 0.1 and ld_vqqfatjet_xwqq > 0.5 and tr_vqqfatjet_xwqq > 0.5"
# )
# vbsvvh.make_selection("qcdnorm_CR")

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.05s/it]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.60it/s]
Loading data babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.58it/s]


In [4]:
# vbsvvh.make_selection("abs(deta_jj) > 4")

# Thresholds to scan, keyed by the variable they are applied to.
scans = {
#     "abcdnet_score": np.linspace(0.85, 0.95, 5),
#     "hbbfatjet_xbb": np.linspace(0.5, 0.8, 4),
#     "ld_vqqfatjet_xwqq": np.linspace(0.3, 0.6, 4),
#     "tr_vqqfatjet_xwqq": np.linspace(0.3, 0.6, 4),
#     "vbs_dnn_score": np.linspace(0.87, 0.97, 6)

#     "hbbfatjet_xbb": np.linspace(0.5, 0.8, 4),
    "ld_vqqfatjet_xwqq": np.linspace(0.3, 0.6, 4),
    "tr_vqqfatjet_xwqq": np.linspace(0.3, 0.6, 4),
    "abcdnet_score": np.linspace(0.69, 0.99, 31),
    "vbs_dnn_score": np.linspace(0.79, 0.99, 21)
}

# One "<variable> > <threshold>" string per working point, grouped by variable.
cuts = {var: [f"{var} > {wp}" for wp in wps] for var, wps in scans.items()}

# A selection picks exactly one cut per variable; enumerate every combination.
selections = list(map(" and ".join, itertools.product(*cuts.values())))


def fom(S, B):
    """Return S/sqrt(B), with B floored at 1e-4 so the ratio stays finite."""
    return S/np.sqrt(max(B, 0.0001))


# Evaluate each selection on MC and record (selection, sig, bkg, fom).
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    record = (sel, sig, bkg, fom(sig, bkg))
    results.append(record)

  0%|                                                                                                                                                                                                                                | 0/10416 [00:00<?, ?it/s]

KeyboardInterrupt



In [None]:
# vbsvvh.make_selection("abs(deta_jj) > 4")

# True: predict the signal-region (A) background from data with the ABCD
# method. False: use the MC background count of the selection directly.
DO_ABCD = True

# Number of threads evaluating selections concurrently.
N_WORKERS = 16

# A selection is dropped when the MC signal/background ratio in any of the
# control regions B, C or D exceeds this value.
MAX_SIG_CONTAMINATION = 0.01

scans = {
    "abcdnet_score": np.linspace(0.70, 0.90, 6),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 4),
    "ld_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
    "vbs_bdt_score": np.linspace(0.75, 0.95, 6)

#     "abcdnet_score": np.linspace(0.79, 0.99, 21),
#     "vbs_bdt_score": np.linspace(0.79, 0.99, 21)
}

# One "<variable> > <threshold>" string per working point, grouped by variable.
cuts = {var: [f"{var} > {wp}" for wp in wps] for var, wps in scans.items()}

# A selection picks exactly one cut per variable; enumerate every combination.
selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]


def fom(S, B):
    """Return S/sqrt(B), with B floored at 1e-4 so the ratio stays finite."""
    return S/np.sqrt(max(B, 0.0001))


# Each job carries its index so results can be put back in scan order.
jobs = list(enumerate(selections))


def run_job(args):
    """Evaluate one selection.

    Parameters
    ----------
    args : tuple
        ``(sel_i, sel)``: position of the selection in ``selections`` and the
        selection string itself.

    Returns
    -------
    tuple
        ``(sel_i, record)`` where ``record`` is ``(sel, sig, bkg, fom)`` or
        ``None`` when the selection is rejected (a contaminated control
        region, or no data in region D to normalise the ABCD prediction).
    """
    sel_i, sel = args
    sig, bkg = vbsvvh.get_event_counts(selection=sel)

    if not DO_ABCD:
        return sel_i, (sel, sig, bkg, fom(sig, bkg))

    # The ABCD arms are the alphabetically first and last cuts of the
    # selection; with the scans above these are the abcdnet_score and
    # vbs_bdt_score cuts. Renaming a scan variable can silently change
    # which cuts act as arms.
    sel_cuts = sorted(sel.split(" and "))
    x_arm, y_arm, sr_like = sel_cuts[0], sel_cuts[-1], sel_cuts[1:-1]

    regions = [
        " and ".join([x_arm, f"(not {y_arm})", *sr_like]),           # B
        " and ".join([f"(not {x_arm})", y_arm, *sr_like]),           # C
        " and ".join([f"(not {x_arm})", f"(not {y_arm})", *sr_like]) # D
    ]

    # MC signal contamination in B, C, D (stops at the first failing region).
    for region in regions:
        reg_sig, reg_bkg = vbsvvh.get_event_counts(selection=region)
        if reg_bkg > 0:
            contaminated = reg_sig/reg_bkg > MAX_SIG_CONTAMINATION
        else:
            # No (or net-negative) MC background: the ratio is undefined, so
            # any signal at all counts as contamination.
            contaminated = reg_sig > 0
        if contaminated:
            return sel_i, None

    # Data yields in B, C, D give the prediction A = B*C/D.
    B_data, C_data, D_data = [vbsvvh.data_count(selection=region) for region in regions]
    if not D_data > 0:
        # Empty region D: the prediction is undefined, skip the selection.
        return sel_i, None

    A_pred = B_data*C_data/D_data
    return sel_i, (sel, sig, A_pred, fom(sig, A_pred))


# Execute jobs (concurrent.futures is imported as `futures` in the import cell).
# NOTE(review): assumes vbsvvh.get_event_counts / data_count may be called
# from several threads at once -- confirm in utils.analysis.
records = [None]*len(selections)
submitted_futures = {}
print(f"N workers: {N_WORKERS}")
with tqdm(total=len(selections), desc="Executing jobs") as pbar:
    with futures.ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
        submitted_futures = {
            executor.submit(run_job, job): job for job in jobs
        }
        for future in futures.as_completed(submitted_futures):
            # result() re-raises anything the worker raised, so a failing
            # job stops the scan instead of leaving a silent hole in it.
            sel_i, record = future.result()
            records[sel_i] = record
            # Update progress bar
            pbar.update(1)

# Keep accepted selections only, in scan order, so every entry is a
# (selection, sig, bkg, fom) tuple as the results_df cell expects.
results = [record for record in records if record is not None]

N workers: 16


Executing jobs:   9%|█████████████████▍                                                                                                                                                                                     | 454/5184 [02:01<28:31,  2.76it/s]

In [4]:
# vbsvvh.make_selection("abs(deta_jj) > 4")

# True: predict the signal-region (A) background from data with the ABCD
# method. False: use the MC background count of the selection directly.
DO_ABCD = True

# A selection is dropped when the MC signal/background ratio in any of the
# control regions B, C or D exceeds this value.
MAX_SIG_CONTAMINATION = 0.01

scans = {
    "abcdnet_score": np.linspace(0.70, 0.90, 6),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 4),
    "ld_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
    "vbs_bdt_score": np.linspace(0.75, 0.95, 6)

#     "abcdnet_score": np.linspace(0.79, 0.99, 21),
#     "vbs_bdt_score": np.linspace(0.79, 0.99, 21)
}

# One "<variable> > <threshold>" string per working point, grouped by variable.
cuts = {var: [f"{var} > {wp}" for wp in wps] for var, wps in scans.items()}

# A selection picks exactly one cut per variable; enumerate every combination.
selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]


def fom(S, B):
    """Return S/sqrt(B), with B floored at 1e-4 so the ratio stays finite."""
    return S/np.sqrt(max(B, 0.0001))


results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)

    if not DO_ABCD:
        results.append((sel, sig, bkg, fom(sig, bkg)))
        continue

    # The ABCD arms are the alphabetically first and last cuts of the
    # selection; with the scans above these are the abcdnet_score and
    # vbs_bdt_score cuts. A separate name is used so the `cuts` dict built
    # above is not overwritten by the loop.
    sel_cuts = sorted(sel.split(" and "))
    x_arm, y_arm, sr_like = sel_cuts[0], sel_cuts[-1], sel_cuts[1:-1]

    regions = [
        " and ".join([x_arm, f"(not {y_arm})", *sr_like]),           # B
        " and ".join([f"(not {x_arm})", y_arm, *sr_like]),           # C
        " and ".join([f"(not {x_arm})", f"(not {y_arm})", *sr_like]) # D
    ]

    # MC signal contamination in B, C, D (stops at the first failing region).
    contaminated = False
    for region in regions:
        reg_sig, reg_bkg = vbsvvh.get_event_counts(selection=region)
        if reg_bkg > 0:
            contaminated = reg_sig/reg_bkg > MAX_SIG_CONTAMINATION
        else:
            # No (or net-negative) MC background: the ratio is undefined, so
            # any signal at all counts as contamination.
            contaminated = reg_sig > 0
        if contaminated:
            break
    if contaminated:
        continue

    # Data yields in B, C, D give the prediction A = B*C/D.
    B_data, C_data, D_data = [vbsvvh.data_count(selection=region) for region in regions]
    if not D_data > 0:
        # Empty region D: the prediction is undefined, skip the selection.
        continue

    A_pred = B_data*C_data/D_data

    results.append((sel, sig, A_pred, fom(sig, A_pred)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5184/5184 [10:01<00:00,  8.62it/s]


In [8]:
# vbsvvh.make_selection("abs(deta_jj) > 4")

# Thresholds to scan, keyed by the variable they are applied to.
scans = {
#     "abcdnet_score": np.linspace(0.89, 0.99, 11),
    "abcdnet_score": np.linspace(0.87, 0.97, 11),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 6),
    "ld_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.5, 0.8, 6),
    "abs_deta_jj": [2.5, 3, 3.5, 4]
}

# Expand every working point into its "<variable> > <threshold>" cut string.
cuts = {}
for var in scans:
    cuts[var] = [f"{var} > {wp}" for wp in scans[var]]

# Cartesian product over the per-variable cut lists: one cut from each.
selections = []
for combo in itertools.product(*cuts.values()):
    selections.append(" and ".join(combo))


def fom(S, B):
    """Return S/sqrt(B), with B floored at 1e-4 so the ratio stays finite."""
    floored_B = max(B, 0.0001)
    return S/np.sqrt(floored_B)


# MC-only scan: store (selection, sig, bkg, fom) for each combination.
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    results.append((sel, sig, bkg, fom(sig, bkg)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9504/9504 [07:52<00:00, 20.11it/s]


In [5]:
# Tabulate the scan: position i of every result tuple becomes column i.
result_columns = ("selection", "sig", "bkg", "fom")
results_df = pd.DataFrame(
    {
        column: [r[position] for r in results]
        for position, column in enumerate(result_columns)
    }
)

In [11]:
# Keep selections with non-zero signal and background, ranked by figure of merit.
results_filter = (results_df.sig > 0) & (results_df.bkg > 0)
top_selections = results_df[results_filter].sort_values("fom", ascending=False).head(15)

# Best-ranked selection: print it with its MC counts and their errors.
bf_SR = top_selections.selection.values[0]
print(bf_SR)
(sig_count, bkg_count), (sig_error, bkg_error) = (
    vbsvvh.get_event_counts(bf_SR),
    vbsvvh.get_event_errors(bf_SR),
)
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")
top_selections

abcdnet_score > 0.9 and hbbfatjet_xbb > 0.8 and ld_vqqfatjet_xwqq > 0.3 and tr_vqqfatjet_xwqq > 0.3 and vbs_bdt_score > 0.75
Signal:     6.078819808761819 +- 0.07485565330045911
Background: 5.514516902298751 +- 1.3381826553958571


Unnamed: 0,selection,sig,bkg,fom
759,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.8 an...,6.07882,3.993161,3.042011
495,abcdnet_score > 0.78 and hbbfatjet_xbb > 0.5 a...,6.522921,4.690208,3.011938
103,abcdnet_score > 0.7 and hbbfatjet_xbb > 0.6 an...,6.200063,4.349029,2.973034
757,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.7000...,6.153886,4.338124,2.954598
751,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.6 an...,6.109824,4.304635,2.944832
758,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.7000...,6.176284,4.448684,2.928273
620,abcdnet_score > 0.8200000000000001 and hbbfatj...,6.588499,5.063736,2.927864
75,abcdnet_score > 0.7 and hbbfatjet_xbb > 0.5 an...,6.556661,5.024263,2.925139
756,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.7000...,6.190061,4.553431,2.900852
744,abcdnet_score > 0.9 and hbbfatjet_xbb > 0.5 an...,6.204713,4.579076,2.899565


In [9]:
# Best selection among those with more than 4 signal and more than 1 background events.
yield_floor = (results_df.sig > 4) & (results_df.bkg > 1)
ranked_by_fom = results_df[yield_floor].sort_values("fom", ascending=False)
bf_SR = ranked_by_fom.head(15).selection.values[0]
bf_SR

'abcdnet_score > 0.95 and hbbfatjet_xbb > 0.62 and ld_vqqfatjet_xwqq > 0.74 and tr_vqqfatjet_xwqq > 0.68 and abs_deta_jj > 4'

In [13]:
# Hand-rounded version of the best-fit signal region; report its MC yields.
bf_rounded_SR = "abcdnet_score > 0.96 and hbbfatjet_xbb > 0.60 and ld_vqqfatjet_xwqq > 0.8 and tr_vqqfatjet_xwqq > 0.75 and abs_deta_jj > 4"
# bf_rounded_SR = "bdt > 0.8 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.7 and tr_vqqfatjet_xwqq > 0.7"
(sig_count, bkg_count), (sig_error, bkg_error) = (
    vbsvvh.get_event_counts(bf_rounded_SR),
    vbsvvh.get_event_errors(bf_rounded_SR),
)
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")

Signal:     4.247743597157761 +- 0.057704025354949035
Background: 1.7958060430360603 +- 0.5918300065717386


In [5]:
# Thresholds to scan, keyed by the variable they are applied to.
scans = {
    "abcdnet_score1": np.linspace(0.79, 0.99, 11),
    "abcdnet_score2": np.linspace(0.79, 0.99, 11),
    "hbbfatjet_xbb": np.linspace(0.5, 0.8, 6),
    "ld_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
    "tr_vqqfatjet_xwqq": np.linspace(0.3, 0.8, 6),
}

# One "<variable> > <threshold>" string per working point, grouped by variable.
cuts = {var: [f"{var} > {wp}" for wp in wps] for var, wps in scans.items()}

# A selection picks exactly one cut per variable; enumerate every combination.
selections = [" and ".join(combo) for combo in itertools.product(*cuts.values())]


def fom(S, B):
    """Return S/sqrt(B), with B floored at 1e-4.

    Same definition as in the other scan cells. Flooring B (rather than only
    special-casing B <= 0) keeps a tiny positive background from producing an
    arbitrarily large figure of merit that would dominate the ranking.
    """
    return S/np.sqrt(max(B, 0.0001))


# MC-only scan: store (selection, sig, bkg, fom) for each combination.
results = []
for sel in tqdm(selections):
    sig, bkg = vbsvvh.get_event_counts(selection=sel)
    results.append((sel, sig, bkg, fom(sig, bkg)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26136/26136 [20:11<00:00, 21.57it/s]


In [7]:
# Collect the scan tuples column by column: (selection, sig, bkg, fom).
scan_table = {}
for position, column in enumerate(["selection", "sig", "bkg", "fom"]):
    scan_table[column] = [r[position] for r in results]

results_df = pd.DataFrame(scan_table)

In [13]:
# Require some signal and more than 2 background events, then rank by figure of merit.
results_filter = (results_df.sig > 0) & (results_df.bkg > 2)
top_selections = results_df[results_filter].sort_values("fom", ascending=False).head(15)

# Best-ranked selection: print it with its MC counts and their errors.
bf_SR = top_selections.selection.values[0]
print(bf_SR)
mc_counts = vbsvvh.get_event_counts(bf_SR)
mc_errors = vbsvvh.get_event_errors(bf_SR)
sig_count, bkg_count = mc_counts
sig_error, bkg_error = mc_errors
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")
top_selections

abcdnet_score1 > 0.89 and abcdnet_score2 > 0.81 and hbbfatjet_xbb > 0.68 and ld_vqqfatjet_xwqq > 0.7 and tr_vqqfatjet_xwqq > 0.6000000000000001
Signal:     4.777688036521768 +- 0.06109130257000033
Background: 2.058895289432435 +- 0.8450428092127928


Unnamed: 0,selection,sig,bkg,fom
12231,abcdnet_score1 > 0.89 and abcdnet_score2 > 0.8...,4.777688,2.058895,3.329666
12626,abcdnet_score1 > 0.89 and abcdnet_score2 > 0.8...,4.699063,2.00835,3.315825
19099,abcdnet_score1 > 0.95 and abcdnet_score2 > 0.7...,4.783616,2.087674,3.310739
19170,abcdnet_score1 > 0.95 and abcdnet_score2 > 0.7...,4.756259,2.086931,3.292392
12267,abcdnet_score1 > 0.89 and abcdnet_score2 > 0.8...,4.695315,2.058129,3.272868
7875,abcdnet_score1 > 0.85 and abcdnet_score2 > 0.8...,4.731718,2.095367,3.268803
12662,abcdnet_score1 > 0.89 and abcdnet_score2 > 0.8...,4.630048,2.008122,3.267311
5535,abcdnet_score1 > 0.8300000000000001 and abcdne...,4.72321,2.096261,3.26223
19135,abcdnet_score1 > 0.95 and abcdnet_score2 > 0.7...,4.711358,2.087446,3.260907
19171,abcdnet_score1 > 0.95 and abcdnet_score2 > 0.7...,4.634298,2.035047,3.248604


In [6]:
# Hand-picked two-score signal region; report its MC yields and their errors.
bf_rounded_SR = "abcdnet_score1 > 0.87 and abcdnet_score2 > 0.8 and hbbfatjet_xbb > 0.74 and ld_vqqfatjet_xwqq > 0.74 and tr_vqqfatjet_xwqq > 0.62"
# bf_rounded_SR = "bdt > 0.8 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.7 and tr_vqqfatjet_xwqq > 0.7"
mc_counts = vbsvvh.get_event_counts(bf_rounded_SR)
mc_errors = vbsvvh.get_event_errors(bf_rounded_SR)
sig_count, bkg_count = mc_counts
sig_error, bkg_error = mc_errors
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")

Signal:     4.723143500952743 +- 0.060780618232793564
Background: 2.178862577875975 +- 0.8750272583707693


In [49]:
# Second-ranked (index 1) selection among those with more than 5 signal
# events and a non-zero background.
yield_floor = (results_df.sig > 5) & (results_df.bkg > 0)
ranked_by_fom = results_df[yield_floor].sort_values("fom", ascending=False)
bf_SR = ranked_by_fom.head(15).selection.values[1]
bf_SR

'bdt > 0.74 and hbbfatjet_xbb > 0.58 and ld_vqqfatjet_xwqq > 0.62 and tr_vqqfatjet_xwqq > 0.66'

In [50]:
# Rounded BDT-based signal region; report its MC yields and their errors.
bf_rounded_SR = "bdt > 0.75 and hbbfatjet_xbb > 0.6 and ld_vqqfatjet_xwqq > 0.6 and tr_vqqfatjet_xwqq > 0.65"
(sig_count, bkg_count), (sig_error, bkg_error) = (
    vbsvvh.get_event_counts(bf_rounded_SR),
    vbsvvh.get_event_errors(bf_rounded_SR),
)
print(f"Signal:     {sig_count} +- {sig_error}")
print(f"Background: {bkg_count} +- {bkg_error}")

Signal:     5.0135211002373286 +- 0.06309293035494924
Background: 1.1027211115484759 +- 0.3359880041887074


In [20]:
# vbsvvh.df["SR"] = vbsvvh.df.eval(bf_rounded_SR)

# vbsvvh.make_selection("SR")
# update_cutflows("SR")

# cutflows.reorder(["QCD", "TTHad", "TT1L", "TTW", "TTH", "SingleTop", "Bosons", "TotalBkg", "VBSVVH"])
# cutflows.write_csv("test.csv", cutflows.terminal_cut_names[-1])