In [1]:
import uproot
import glob
import pandas as pd
import numpy as np
import itertools

from utils.analysis import PandasAnalysis
from utils.cutflow import Cutflow

TAG = "pku"

In [2]:
babies = glob.glob(f"../analysis/studies/vbswh/output_{TAG}/Run2/*.root")
babies = [baby for baby in babies if "data" not in baby]
babies

['../analysis/studies/vbswh/output_pku/Run2/TTX.root',
 '../analysis/studies/vbswh/output_pku/Run2/TTbar2L.root',
 '../analysis/studies/vbswh/output_pku/Run2/VH.root',
 '../analysis/studies/vbswh/output_pku/Run2/TTbar1L.root',
 '../analysis/studies/vbswh/output_pku/Run2/WJets.root',
 '../analysis/studies/vbswh/output_pku/Run2/SingleTop.root',
 '../analysis/studies/vbswh/output_pku/Run2/EWKWLep.root',
 '../analysis/studies/vbswh/output_pku/Run2/Bosons.root',
 '../analysis/studies/vbswh/output_pku/Run2/VBSWH_mkW.root']

In [3]:
vbswh = PandasAnalysis(
    sig_root_files=[baby for baby in babies if "VBSWH_mkW" in baby],
    bkg_root_files=[baby for baby in babies if "VBSWH_mkW" not in baby],
    ttree_name="tree",
    weight_columns=[
        "xsec_sf", "lep_id_sf", "elec_reco_sf", "muon_iso_sf", 
        "btag_sf", "pu_sf", "prefire_sf", "trig_sf"
    ]
)
vbswh.df["presel_noDetaJJ"] = vbswh.df.eval(
    "passes_bveto and M_jj > 500 and hbbjet_score > 0.3"
)
vbswh.df["presel"] = vbswh.df.eval(
    "passes_bveto and M_jj > 500 and abs(deta_jj) > 3 and hbbjet_score > 0.3"
)
vbswh.df["SR2"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST > 1500 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
vbswh.df["SR1"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST > 900 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
vbswh.df["SR2_up"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST_up > 1500 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
vbswh.df["SR1_up"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST_up > 900 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
vbswh.df["SR2_dn"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST_dn > 1500 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)
vbswh.df["SR1_dn"] = vbswh.df.eval(
    "presel_noDetaJJ and M_jj > 600 and ST_dn > 900 and hbbjet_score > 0.9 and abs(deta_jj) > 4 and hbbjet_msoftdrop < 150"
)

SIGNAL_REGIONS = ["SR1", "SR2"]

Loading sig babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.68it/s]
Loading bkg babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:35<00:00,  4.39s/it]


# Utilities

In [4]:
class Systematic:
    def __init__(self, name, signal_regions):
        self.name = name
        self.signal_regions = signal_regions
        self.systs = {signal_region: [] for signal_region in signal_regions}

    def add_syst(self, syst, signal_region):
        self.systs[signal_region].append(syst)
        
    def add_systs(self, systs, signal_region=None):
        if signal_region:
            self.systs[signal_region] += systs
        else:
            for syst_i, syst in enumerate(systs):
                self.systs[self.signal_regions[syst_i]].append(syst)
    
    def get_systs(self):
        return self.systs
    
    def get_systs_str(self, signal_region=None):
        if signal_region:
            systs = self.systs[signal_region]
            if len(systs) == 1:
                return f"{systs[0]:0.1%}"
            else:
                return f"{min(systs):0.1%} - {max(systs):0.1%}"
        else:
            return {SR: self.get_systs_str(signal_region=SR) for SR in self.signal_regions}
        
class SystematicsTable:
    def __init__(self, systs=None):
        self.systs = systs or []
        
    def add_row(self, syst):
        self.systs.append(syst)
        
    def to_dataframe(self):
        rows = []
        for syst in self.systs:
            row = {"Systematic": syst.name}
            row.update(syst.get_systs_str())
            rows.append(row)
        return pd.DataFrame(rows)
        
    def to_latex(self, output_tex=None):
        # Convert to Pandas DataFrame
        df = self.to_dataframe()
        
        # Convert to LaTeX
        latex = (df.style
                   .hide(axis="index")
                   .to_latex(column_format="lcc", position="H")
                   .replace("%", "\%"))
        # Insert hlines and centering
        latex = latex.split("\n")
        latex.insert(3, "\\hline")
        latex.insert(2, "\\hline\n\\hline")
        latex.insert(1, "\\begin{center}")
        latex.insert(-3, "\\hline\n\\hline")
        latex.insert(-2, "\\end{center}")
        latex = "\n".join(latex)
        
        return latex
    
    def to_datacard(self, output_dat=None):
        # TODO: add this! should turn list of Systematic objects into a HiggsCombine datacard
        pass

# Scale factors

In [5]:
SIG_SYSTS = SystematicsTable()
BKG_SYSTS = SystematicsTable()

In [6]:
def get_systs(sample_name, signal_regions, sf, sf_up, sf_dn):
    systs = []
    for SR in signal_regions:
        df = vbswh.sample_df(name=sample_name, selection=SR)
        
        count = df.event_weight.sum()
        count_up = np.sum(df.event_weight/df[sf]*df[sf_up])
        count_dn = np.sum(df.event_weight/df[sf]*df[sf_dn])
        
        perc_up = abs((count_up - count)/count)
        perc_dn = abs((count - count_dn)/count)
        
        systs.append(max(perc_up, perc_dn))
        
    return systs

def get_systs_nonSF(sample_name, signal_regions, signal_regions_up, signal_regions_dn):
    systs = []
    for SR_i, SR in enumerate(signal_regions):
        SR_up = signal_regions_up[SR_i]
        SR_dn = signal_regions_dn[SR_i]
        
        df = vbswh.sample_df(name=sample_name, selection=SR)
        df_up = vbswh.sample_df(name=sample_name, selection=SR_up)
        df_dn = vbswh.sample_df(name=sample_name, selection=SR_dn)
        
        count = df.event_weight.sum()
        count_up = df_up.event_weight.sum()
        count_dn = df_dn.event_weight.sum()
        
        up_perc = abs((count_up - count)/count)
        dn_perc = abs((count - count_dn)/count)
        
        systs.append(max(up_perc, dn_perc))
        
    return systs

### L1 Prefiring weight

In [7]:
bkg_prefire_sf_systs = Systematic("L1 pre-fire corrections", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        bkg_prefire_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "prefire_sf", "prefire_sf_up", "prefire_sf_dn")
        )
        
BKG_SYSTS.add_row(bkg_prefire_sf_systs)

In [8]:
prefire_sf_systs = Systematic("L1 pre-fire corrections", SIGNAL_REGIONS)
prefire_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "prefire_sf", "prefire_sf_up", "prefire_sf_dn")
)
SIG_SYSTS.add_row(prefire_sf_systs)

### Lepton scale factors

In [9]:
bkg_lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        bkg_lep_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "lep_id_sf", "lep_id_sf_up", "lep_id_sf_dn")
        )
        
BKG_SYSTS.add_row(bkg_lep_sf_systs)

In [10]:
lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)
lep_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "lep_id_sf", "lep_id_sf_up", "lep_id_sf_dn")
)
lep_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "elec_reco_sf", "elec_reco_sf_up", "elec_reco_sf_dn")
)
lep_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "muon_iso_sf", "muon_iso_sf_up", "muon_iso_sf_dn")
)
SIG_SYSTS.add_row(lep_sf_systs)

### DeepJet b-tagging scale factors

In [11]:
bkg_btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        bkg_btag_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "btag_sf", "btag_sf_dn", "btag_sf_up")
        )

BKG_SYSTS.add_row(bkg_btag_sf_systs)

In [12]:
btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)
btag_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "btag_sf", "btag_sf_dn", "btag_sf_up")
)
SIG_SYSTS.add_row(btag_sf_systs)

### Pileup reweighting

In [13]:
bkg_pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        bkg_pu_sf_systs.add_systs(
            get_systs(sample_name, SIGNAL_REGIONS, "pu_sf", "pu_sf_dn", "pu_sf_up")
        )

BKG_SYSTS.add_row(bkg_pu_sf_systs)

In [14]:
pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)
pu_sf_systs.add_systs(
    get_systs("VBSWH_mkW", SIGNAL_REGIONS, "pu_sf", "pu_sf_dn", "pu_sf_up")
)
SIG_SYSTS.add_row(pu_sf_systs)

### MET uncertainty

In [15]:
bkg_met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        bkg_met_unc_systs.add_systs(
            get_systs_nonSF(sample_name, SIGNAL_REGIONS, ["SR1_dn", "SR2_dn"], ["SR1_up", "SR2_up"])
        )

BKG_SYSTS.add_row(bkg_met_unc_systs)

In [16]:
met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)
met_unc_systs.add_systs(
    get_systs_nonSF("VBSWH_mkW", SIGNAL_REGIONS, ["SR1_dn", "SR2_dn"], ["SR1_up", "SR2_up"])
)
SIG_SYSTS.add_row(met_unc_systs)

### Luminosity

In [17]:
lumi_systs = Systematic("Luminosity", SIGNAL_REGIONS)
lumi_systs.add_systs([0.025, 0.025])
BKG_SYSTS.add_row(lumi_systs)
SIG_SYSTS.add_row(lumi_systs)

In [18]:
BKG_SYSTS.to_dataframe()

Unnamed: 0,Systematic,SR1,SR2
0,L1 pre-fire corrections,0.3% - 1.5%,0.1% - 1.2%
1,Lepton scale factors,0.9% - 7.0%,0.9% - 3.4%
2,DeepJet b-tagging scale factors,0.0% - 3.3%,0.0% - 8.2%
3,Pileup reweighting,1.9% - 14.9%,2.2% - 9.7%
4,MET unc.,0.2% - 27.3%,0.0% - 1.6%
5,Luminosity,2.5%,2.5%


In [19]:
SIG_SYSTS.to_dataframe()

Unnamed: 0,Systematic,SR1,SR2
0,L1 pre-fire corrections,1.0%,1.0%
1,Lepton scale factors,0.0% - 1.4%,0.0% - 1.9%
2,DeepJet b-tagging scale factors,0.3%,0.3%
3,Pileup reweighting,0.3%,0.6%
4,MET unc.,0.1%,0.8%
5,Luminosity,2.5%,2.5%


# Jet energy

In [20]:
def get_jet_energy_systs(nominal_cflow, up_cflow, dn_cflow, signal_regions, name):
    
    nominal_cutflow = Cutflow.from_file(nominal_cflow)
    up_cutflow = Cutflow.from_file(up_cflow)
    dn_cutflow = Cutflow.from_file(dn_cflow)

    syst_up_cutflow = (up_cutflow - nominal_cutflow)/nominal_cutflow
    syst_dn_cutflow = (nominal_cutflow - dn_cutflow)/nominal_cutflow

    systs = Systematic(name, signal_regions.keys())
    for SR, cut_name in signal_regions.items():
        systs.add_syst(
            max(
                syst_up_cutflow[cut_name].n_pass,
                syst_dn_cutflow[cut_name].n_pass
            ),
            signal_region=SR
        )
        
    return systs

### Jet energy corrections

In [21]:
jec_systs = Systematic("Jet energy scale", ["SR1", "SR2"])

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        systs = get_jet_energy_systs(
            f"../analysis/studies/vbswh/output_pku/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_pku_jec_up/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_pku_jec_dn/Run2/{sample_name}_cutflow.cflow",
            {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
            "Jet energy scale"
        )
        for SR, values in systs.get_systs().items():
            jec_systs.add_systs(values, signal_region=SR)
        
BKG_SYSTS.add_row(jec_systs)

In [22]:
jec_systs = get_jet_energy_systs(
    "../analysis/studies/vbswh/output_pku/Run2/VBSWH_mkW_cutflow.cflow",
    "../analysis/studies/vbswh/output_pku_jec_up/Run2/VBSWH_mkW_cutflow.cflow",
    "../analysis/studies/vbswh/output_pku_jec_dn/Run2/VBSWH_mkW_cutflow.cflow",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy scale"
)
SIG_SYSTS.add_row(jec_systs)

### Jet energy resolution

In [23]:
jer_systs = Systematic("Jet energy resolution", ["SR1", "SR2"])

for baby in babies:
    sample_name = baby.split("/")[-1].replace(".root", "")
    if sample_name != "VBSWH_mkW":
        systs = get_jet_energy_systs(
            f"../analysis/studies/vbswh/output_pku/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_pku_jer_up/Run2/{sample_name}_cutflow.cflow",
            f"../analysis/studies/vbswh/output_pku_jer_dn/Run2/{sample_name}_cutflow.cflow",
            {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
            "Jet energy resolution"
        )
        for SR, values in systs.get_systs().items():
            jer_systs.add_systs(values, signal_region=SR)
        
BKG_SYSTS.add_row(jer_systs)

In [24]:
jer_systs = get_jet_energy_systs(
    "../analysis/studies/vbswh/output_pku/Run2/VBSWH_mkW_cutflow.cflow",
    "../analysis/studies/vbswh/output_pku_jer_up/Run2/VBSWH_mkW_cutflow.cflow",
    "../analysis/studies/vbswh/output_pku_jer_dn/Run2/VBSWH_mkW_cutflow.cflow",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy resolution"
)
SIG_SYSTS.add_row(jer_systs)

In [25]:
BKG_SYSTS.to_dataframe()

Unnamed: 0,Systematic,SR1,SR2
0,L1 pre-fire corrections,0.3% - 1.5%,0.1% - 1.2%
1,Lepton scale factors,0.9% - 7.0%,0.9% - 3.4%
2,DeepJet b-tagging scale factors,0.0% - 3.3%,0.0% - 8.2%
3,Pileup reweighting,1.9% - 14.9%,2.2% - 9.7%
4,MET unc.,0.2% - 27.3%,0.0% - 1.6%
5,Luminosity,2.5%,2.5%
6,Jet energy scale,11.9% - 25.3%,0.0% - 27.5%
7,Jet energy resolution,3.0% - 12.8%,0.0% - 20.0%


In [26]:
SIG_SYSTS.to_dataframe()

Unnamed: 0,Systematic,SR1,SR2
0,L1 pre-fire corrections,1.0%,1.0%
1,Lepton scale factors,0.0% - 1.4%,0.0% - 1.9%
2,DeepJet b-tagging scale factors,0.3%,0.3%
3,Pileup reweighting,0.3%,0.6%
4,MET unc.,0.1%,0.8%
5,Luminosity,2.5%,2.5%
6,Jet energy scale,6.1%,7.5%
7,Jet energy resolution,0.1%,0.5%


In [27]:
print(SIG_SYSTS.to_latex())

\begin{table}[H]
\begin{center}
\begin{tabular}{lcc}
\hline
\hline
Systematic & SR1 & SR2 \\
\hline
L1 pre-fire corrections & 1.0\% & 1.0\% \\
Lepton scale factors & 0.0\% - 1.4\% & 0.0\% - 1.9\% \\
DeepJet b-tagging scale factors & 0.3\% & 0.3\% \\
Pileup reweighting & 0.3\% & 0.6\% \\
MET unc. & 0.1\% & 0.8\% \\
Luminosity & 2.5\% & 2.5\% \\
Jet energy scale & 6.1\% & 7.5\% \\
Jet energy resolution & 0.1\% & 0.5\% \\
\hline
\hline
\end{tabular}
\end{center}
\end{table}

