In [1]:
import uproot
import glob
import pandas as pd
import numpy as np
import itertools

from utils.analysis import PandasAnalysis
from utils.cutflow import Cutflow

# Production tag; interpolated into the baby-file directory name ("output_{TAG}").
TAG = "pku"

In [2]:
# Notebook-level accumulators: one dict (table row) per stored systematic.
SIG_SYSTS = []
BKG_SYSTS = []

def store_syst(new_syst, sig=True):
    """
    Record a systematic as a row of the signal or background summary list.

    new_syst must provide a `name` attribute and a `get_systs_str()` method whose
    returned mapping is merged into the row next to the "Systematic" key.
    The row goes to SIG_SYSTS when sig is true, otherwise to BKG_SYSTS.
    If the chosen list already holds a row with the same name, nothing happens.
    """
    table = SIG_SYSTS if sig else BKG_SYSTS

    already_stored = any(row["Systematic"] == new_syst.name for row in table)
    if already_stored:
        return

    row = {"Systematic": new_syst.name}
    table.append(row)
    row.update(new_syst.get_systs_str())

In [3]:
class Systematic:
    """
    Named collection of fractional uncertainties, kept per signal region.

    Each signal region maps to a list of values (fractions, e.g. 0.025 for 2.5%).
    A region may hold one value or several; the string form shows a single
    percentage or a "min - max" range accordingly.
    """

    def __init__(self, name, signal_regions):
        """
        name: label used for this systematic.
        signal_regions: iterable of region labels; the order defines the
            positional mapping used by add_systs().
        """
        self.name = name
        # Materialise the labels so views such as dict.keys() (not indexable,
        # tied to the source dict) behave like the plain lists used elsewhere.
        self.signal_regions = list(signal_regions)
        self.systs = {signal_region: [] for signal_region in self.signal_regions}

    def add_syst(self, syst, signal_region):
        """Append a single value to the given signal region."""
        self.systs[signal_region].append(syst)

    def add_systs(self, systs, signal_region=None):
        """
        Append several values at once.

        With signal_region given, every value in systs is appended to that region.
        Without it, systs is matched positionally against self.signal_regions
        (one value per region).

        Raises ValueError if the positional form is used with a number of values
        different from the number of signal regions; nothing is modified then.
        """
        systs = list(systs)
        if signal_region is not None:
            self.systs[signal_region] += systs
            return

        # Check before mutating: a mismatch would otherwise leave some regions
        # updated and others not.
        if len(systs) != len(self.signal_regions):
            raise ValueError(
                f"expected {len(self.signal_regions)} values (one per signal region), "
                f"got {len(systs)}"
            )
        for region, syst in zip(self.signal_regions, systs):
            self.systs[region].append(syst)

    def get_systs(self):
        """Return the underlying {signal region: [values]} dict (not a copy)."""
        return self.systs

    def get_systs_str(self, signal_region=None):
        """
        Format the stored values as percentages with one decimal place.

        With signal_region given, returns a string: "N/A" when the region has no
        values, "x.x%" for one value, "min% - max%" for several.
        Without it, returns {signal region: string} for every region.
        """
        if signal_region is None:
            return {SR: self.get_systs_str(signal_region=SR) for SR in self.signal_regions}

        systs = self.systs[signal_region]
        if not systs:
            # min()/max() of an empty list would raise; show the gap explicitly.
            return "N/A"
        if len(systs) == 1:
            return f"{systs[0]:0.1%}"
        return f"{min(systs):0.1%} - {max(systs):0.1%}"

# Scale factors

In [4]:
# Every ROOT file produced for this tag, minus the files whose name contains "data".
babies = glob.glob(f"../analysis/studies/vbswh/output_{TAG}/Run2/*.root")
# Test the file name only: matching against the whole path would drop every
# file if TAG (or any directory component) happened to contain "data".
babies = [baby for baby in babies if "data" not in baby.split("/")[-1]]
babies

['../analysis/studies/vbswh/output_pku/Run2/TTbar2L.root',
 '../analysis/studies/vbswh/output_pku/Run2/TTX.root',
 '../analysis/studies/vbswh/output_pku/Run2/WJets.root',
 '../analysis/studies/vbswh/output_pku/Run2/VH.root',
 '../analysis/studies/vbswh/output_pku/Run2/EWKWLep.root',
 '../analysis/studies/vbswh/output_pku/Run2/SingleTop.root',
 '../analysis/studies/vbswh/output_pku/Run2/TTbar1L.root',
 '../analysis/studies/vbswh/output_pku/Run2/Bosons.root',
 '../analysis/studies/vbswh/output_pku/Run2/VBSWH_mkW.root']

In [5]:
# Load signal (VBSWH_mkW) and background babies into one analysis object.
vbswh = PandasAnalysis(
    sig_root_files=[baby for baby in babies if "VBSWH_mkW" in baby],
    bkg_root_files=[baby for baby in babies if "VBSWH_mkW" not in baby],
    ttree_name="tree",
    # NOTE(review): "pu_sf" is not listed here, yet the pileup-reweighting cells
    # divide event_weight by pu_sf -- confirm whether event_weight includes it.
    weight_columns=["xsec_sf", "lep_sf", "btag_sf"]
)
vbswh.df["presel_noDetaJJ"] = vbswh.df.eval(
    "passes_bveto and M_jj > 500 and hbbjet_score > 0.3"
)
vbswh.df["presel"] = vbswh.df.eval(
    "passes_bveto and M_jj > 500 and abs(deta_jj) > 3 and hbbjet_score > 0.3"
)

# Every signal-region flag shares one selection; only the ST branch (ST, ST_up,
# ST_dn) and its threshold differ, so build the six columns from one template.
SR_SELECTION = (
    "presel_noDetaJJ and M_jj > 600 and {st_branch} > {st_min} and hbbjet_score > 0.9 "
    "and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
SR_ST_MIN = {"SR2": 1500, "SR1": 900}
for st_suffix in ("", "_up", "_dn"):
    for region, st_min in SR_ST_MIN.items():
        vbswh.df[f"{region}{st_suffix}"] = vbswh.df.eval(
            SR_SELECTION.format(st_branch=f"ST{st_suffix}", st_min=st_min)
        )

SIGNAL_REGIONS = ["SR1", "SR2"]

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.45it/s]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:23<00:00,  2.92s/it]


In [6]:
def get_systs(sample_name, signal_regions, sf, sf_up, sf_dn):
    """
    Relative yield change per signal region when one scale factor is varied.

    For every selection in signal_regions, the sample's summed event_weight is
    compared with the sums obtained after dividing out column `sf` and
    multiplying by column `sf_up` (resp. `sf_dn`). The larger of the two
    absolute relative shifts is kept.

    Returns a list with one value per entry of signal_regions, in that order.
    Reads the notebook-level `vbswh` analysis object.

    NOTE(review): a selection with zero nominal yield, or a zero entry in the
    `sf` column, means dividing by zero here -- confirm this cannot occur.
    """
    def largest_shift(region):
        events = vbswh.sample_df(name=sample_name, selection=region)

        nominal = events.event_weight.sum()
        varied_up = np.sum(events.event_weight/events[sf]*events[sf_up])
        varied_dn = np.sum(events.event_weight/events[sf]*events[sf_dn])

        shift_up = abs((varied_up - nominal)/nominal)
        shift_dn = abs((nominal - varied_dn)/nominal)
        return max(shift_up, shift_dn)

    return [largest_shift(region) for region in signal_regions]

In [7]:
def get_systs_nonSF(sample_name, signal_regions, signal_regions_up, signal_regions_dn):
    """
    Relative yield change per signal region when the selection itself is varied.

    signal_regions, signal_regions_up and signal_regions_dn are parallel
    sequences of selection names: entry i of the up/down sequences is the varied
    counterpart of entry i of signal_regions. For each triple, the summed
    event_weight of the sample under the varied selections is compared with the
    nominal one and the larger absolute relative shift is kept.

    Returns a list with one value per entry of signal_regions, in that order.
    Reads the notebook-level `vbswh` analysis object.

    NOTE(review): a zero nominal yield means dividing by zero here -- confirm
    this cannot occur.
    """
    def weighted_yield(selection):
        return vbswh.sample_df(name=sample_name, selection=selection).event_weight.sum()

    shifts = []
    for index, nominal_sel in enumerate(signal_regions):
        up_sel = signal_regions_up[index]
        dn_sel = signal_regions_dn[index]

        nominal = weighted_yield(nominal_sel)
        varied_up = weighted_yield(up_sel)
        varied_dn = weighted_yield(dn_sel)

        shift_up = abs((varied_up - nominal)/nominal)
        shift_dn = abs((nominal - varied_dn)/nominal)
        shifts.append(max(shift_up, shift_dn))

    return shifts

### Lepton scale factors

In [8]:
# Background lepton-SF systematic: every background sample contributes one
# value per signal region, so each region ends up with a list of values.
bkg_lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name == "VBSWH_mkW":
        continue  # the signal sample is treated in its own cell
    sample_systs = get_systs(sample_name, SIGNAL_REGIONS, "lep_sf", "lep_sf_up", "lep_sf_dn")
    bkg_lep_sf_systs.add_systs(sample_systs)

store_syst(bkg_lep_sf_systs, sig=False)

In [9]:
# Signal lepton-SF systematic: one value per signal region from the VBSWH_mkW sample.
lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)
sig_lep_sf_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "lep_sf", "lep_sf_up", "lep_sf_dn")
lep_sf_systs.add_systs(sig_lep_sf_values)
store_syst(lep_sf_systs)

### DeepJet b-tagging scale factors

In [10]:
# Background b-tagging SF systematic: one value per background sample and signal region.
bkg_btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        # Argument order follows get_systs(sample_name, signal_regions, sf, sf_up, sf_dn):
        # the up column goes first. (The result is the max of both shifts, so the
        # numbers do not depend on the order, but the call should match the signature.)
        bkg_btag_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "btag_sf", "btag_sf_up", "btag_sf_dn")
        )

store_syst(bkg_btag_sf_systs, sig=False)

In [11]:
# Signal b-tagging SF systematic: one value per signal region from the VBSWH_mkW sample.
btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)
# Up column first, then down, matching get_systs(..., sf, sf_up, sf_dn).
btag_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "btag_sf", "btag_sf_up", "btag_sf_dn")
)
store_syst(btag_sf_systs)

### Pileup reweighting

In [12]:
# Background pileup-reweighting systematic: one value per background sample and signal region.
# NOTE(review): get_systs divides event_weight by the "pu_sf" column, but "pu_sf"
# is not among the weight_columns passed to PandasAnalysis -- confirm that
# event_weight actually contains the pileup weight.
bkg_pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        # Up column first, then down, matching get_systs(..., sf, sf_up, sf_dn).
        bkg_pu_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "pu_sf", "pu_sf_up", "pu_sf_dn")
        )

store_syst(bkg_pu_sf_systs, sig=False)

In [13]:
# Signal pileup-reweighting systematic: one value per signal region from the VBSWH_mkW sample.
# NOTE(review): "pu_sf" is not among the weight_columns passed to PandasAnalysis;
# confirm that event_weight contains it before trusting these numbers.
pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)
# Up column first, then down, matching get_systs(..., sf, sf_up, sf_dn).
pu_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "pu_sf", "pu_sf_up", "pu_sf_dn")
)
store_syst(pu_sf_systs)

### MET uncertainty

In [14]:
# Background MET systematic: compares yields in each signal region with the
# yields in its "<region>_up" / "<region>_dn" columns (the same selections
# evaluated with ST_up / ST_dn).
bkg_met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        # Up selections first, then down, matching
        # get_systs_nonSF(..., signal_regions_up, signal_regions_dn).
        bkg_met_unc_systs.add_systs(
            get_systs_nonSF(
                sample_name,
                SIGNAL_REGIONS,
                [f"{SR}_up" for SR in SIGNAL_REGIONS],
                [f"{SR}_dn" for SR in SIGNAL_REGIONS]
            )
        )

store_syst(bkg_met_unc_systs, sig=False)

In [15]:
# Signal MET systematic: nominal signal regions vs. their "<region>_up" / "<region>_dn" columns.
met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)
# Up selections first, then down, matching
# get_systs_nonSF(..., signal_regions_up, signal_regions_dn).
met_unc_systs.add_systs(
    get_systs_nonSF(
        "VBSWH_mkW",
        SIGNAL_REGIONS,
        [f"{SR}_up" for SR in SIGNAL_REGIONS],
        [f"{SR}_dn" for SR in SIGNAL_REGIONS]
    )
)
store_syst(met_unc_systs)

### Luminosity

In [16]:
# Flat luminosity uncertainty: the same fraction in every signal region,
# stored for both the signal and the background tables.
LUMI_UNC = 0.025
lumi_systs = Systematic("Luminosity", SIGNAL_REGIONS)
# One entry per signal region, so the list stays valid if SIGNAL_REGIONS changes.
lumi_systs.add_systs([LUMI_UNC] * len(SIGNAL_REGIONS))
store_syst(lumi_systs)
store_syst(lumi_systs, sig=False)

In [17]:
# Signal systematics stored so far: one row per entry of SIG_SYSTS.
pd.DataFrame(SIG_SYSTS)

Unnamed: 0,Systematic,SR1,SR2
0,Lepton scale factors,1.5%,1.9%
1,DeepJet b-tagging scale factors,0.3%,0.3%
2,Pileup reweighting,1.5%,1.8%
3,MET unc.,0.1%,0.4%
4,Luminosity,2.5%,2.5%


# Jet energy

In [18]:
def get_jet_energy_systs(nominal_cflow, up_cflow, dn_cflow, signal_regions, name):
    """
    Build a Systematic from nominal, up-varied and down-varied cutflow files.

    nominal_cflow, up_cflow, dn_cflow: paths passed to Cutflow.from_file.
    signal_regions: dict mapping a signal-region label to the name of the cut
        in the cutflows whose n_pass is used for that region.
    name: name given to the returned Systematic.

    For each region the stored value is the larger absolute relative change of
    n_pass between the nominal cutflow and the up / down cutflows.

    Returns a Systematic holding exactly one value per signal region.
    """
    nominal_cutflow = Cutflow.from_file(nominal_cflow)
    up_cutflow = Cutflow.from_file(up_cflow)
    dn_cutflow = Cutflow.from_file(dn_cflow)

    # Relative shifts, computed with the Cutflow arithmetic operators.
    syst_up_cutflow = (up_cutflow - nominal_cutflow)/nominal_cutflow
    syst_dn_cutflow = (nominal_cutflow - dn_cutflow)/nominal_cutflow

    # Pass a real list rather than a dict_keys view of the caller's mapping.
    systs = Systematic(name, list(signal_regions))
    for SR, cut_name in signal_regions.items():
        # Compare magnitudes: the shifts are signed, so without abs() a large
        # negative variation would lose against a small positive one (or a
        # negative number would be stored as the uncertainty).
        systs.add_syst(
            max(
                abs(syst_up_cutflow[cut_name].n_pass),
                abs(syst_dn_cutflow[cut_name].n_pass)
            ),
            signal_region=SR
        )

    return systs

### Jet energy corrections

In [19]:
# Background jet-energy-scale systematic: one value per background sample and
# signal region, read from the nominal / jec_up / jec_dn cutflow files.
jec_systs = Systematic("Jet energy scale", ["SR1", "SR2"])

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        # Paths follow TAG, like the baby files, so nominal and varied cutflows
        # always come from the same production as the rest of the notebook.
        systs = get_jet_energy_systs(
            f"../analysis/studies/vbswh/output_{TAG}/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_{TAG}_jec_up/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_{TAG}_jec_dn/Run2/{sample_name}_cutflow.cflow",
            {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
            "Jet energy scale"
        )
        for SR, values in systs.get_systs().items():
            jec_systs.add_systs(values, signal_region=SR)

store_syst(jec_systs, sig=False)

In [20]:
# Signal jet-energy-scale systematic from the VBSWH_mkW cutflows.
# Paths follow TAG, like the baby files, instead of a hard-coded production name.
jec_systs = get_jet_energy_systs(
    f"../analysis/studies/vbswh/output_{TAG}/Run2/VBSWH_mkW_cutflow.cflow",
    f"../analysis/studies/vbswh/output_{TAG}_jec_up/Run2/VBSWH_mkW_cutflow.cflow",
    f"../analysis/studies/vbswh/output_{TAG}_jec_dn/Run2/VBSWH_mkW_cutflow.cflow",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy scale"
)
store_syst(jec_systs)

### Jet energy resolution

In [21]:
# Background jet-energy-resolution systematic: one value per background sample
# and signal region, read from the nominal / jer_up / jer_dn cutflow files.
jer_systs = Systematic("Jet energy resolution", ["SR1", "SR2"])

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        # Paths follow TAG, like the baby files, so nominal and varied cutflows
        # always come from the same production as the rest of the notebook.
        systs = get_jet_energy_systs(
            f"../analysis/studies/vbswh/output_{TAG}/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_{TAG}_jer_up/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_{TAG}_jer_dn/Run2/{sample_name}_cutflow.cflow",
            {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
            "Jet energy resolution"
        )
        for SR, values in systs.get_systs().items():
            jer_systs.add_systs(values, signal_region=SR)

store_syst(jer_systs, sig=False)

In [22]:
# Signal jet-energy-resolution systematic from the VBSWH_mkW cutflows.
# Paths follow TAG, like the baby files, instead of a hard-coded production name.
jer_systs = get_jet_energy_systs(
    f"../analysis/studies/vbswh/output_{TAG}/Run2/VBSWH_mkW_cutflow.cflow",
    f"../analysis/studies/vbswh/output_{TAG}_jer_up/Run2/VBSWH_mkW_cutflow.cflow",
    f"../analysis/studies/vbswh/output_{TAG}_jer_dn/Run2/VBSWH_mkW_cutflow.cflow",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy resolution"
)
store_syst(jer_systs)

In [23]:
# Full signal table: one row per entry of SIG_SYSTS, now including the jet-energy rows.
pd.DataFrame(SIG_SYSTS)

Unnamed: 0,Systematic,SR1,SR2
0,Lepton scale factors,1.5%,1.9%
1,DeepJet b-tagging scale factors,0.3%,0.3%
2,Pileup reweighting,1.5%,1.8%
3,MET unc.,0.1%,0.4%
4,Luminosity,2.5%,2.5%
5,Jet energy scale,5.0%,6.5%
6,Jet energy resolution,0.1%,0.4%


In [24]:
# Full background table: one row per entry of BKG_SYSTS; cells holding several
# values per signal region are shown as a "min - max" range.
pd.DataFrame(BKG_SYSTS)

Unnamed: 0,Systematic,SR1,SR2
0,Lepton scale factors,0.9% - 6.5%,0.8% - 3.3%
1,DeepJet b-tagging scale factors,0.0% - 3.2%,0.2% - 8.3%
2,Pileup reweighting,0.3% - 10.6%,2.7% - 7.0%
3,MET unc.,0.0% - 1.4%,0.0% - 3.7%
4,Luminosity,2.5%,2.5%
5,Jet energy scale,9.7% - 24.4%,0.0% - 23.5%
6,Jet energy resolution,1.5% - 12.2%,0.0% - 23.3%
