In [1]:
import uproot
import glob
import json
import pandas as pd
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import numpy as np
import itertools

from utils.analysis import PandasAnalysis
from utils.cutflow import Cutflow

TAG = "kscans"

In [2]:
# Discover the "baby" ROOT files for this TAG and split them into data, signal
# and background lists. The lists are sorted so that the sample order (and hence
# every table built by looping over the samples) is the same on every run;
# neither glob.glob() nor set arithmetic guarantees a stable order.
babies = sorted(glob.glob(f"../analysis/studies/vbswh/output_{TAG}/Run2/*.root"))
babies = [baby for baby in babies if "Lambda" not in baby]
data_babies = [baby for baby in babies if "data" in baby]
sig_babies = [baby for baby in babies if "VBSWH_mkW" in baby]
# Everything that is neither data nor signal is treated as background
bkg_babies = [baby for baby in babies if baby not in data_babies and baby not in sig_babies]
print(data_babies)
print(sig_babies)
bkg_babies

['../analysis/studies/vbswh/output_kscans/Run2/data.root']
['../analysis/studies/vbswh/output_kscans/Run2/VBSWH_mkW.root']


['../analysis/studies/vbswh/output_kscans/Run2/TTX.root',
 '../analysis/studies/vbswh/output_kscans/Run2/TTbar1L.root',
 '../analysis/studies/vbswh/output_kscans/Run2/SingleTop.root',
 '../analysis/studies/vbswh/output_kscans/Run2/VBSWH_SM.root',
 '../analysis/studies/vbswh/output_kscans/Run2/EWKWLep.root',
 '../analysis/studies/vbswh/output_kscans/Run2/TTbar2L.root',
 '../analysis/studies/vbswh/output_kscans/Run2/VH.root',
 '../analysis/studies/vbswh/output_kscans/Run2/WJets.root',
 '../analysis/studies/vbswh/output_kscans/Run2/Bosons.root']

In [3]:
# Event-weight columns handed to PandasAnalysis
weight_columns = [
    "xsec_sf", "lep_id_sf", "ewkfix_sf",
    "elec_reco_sf", "muon_iso_sf",
    "btag_sf", "pu_sf", "prefire_sf", "trig_sf", "puid_sf",
    "xbb_sf" # applied only because Xbb > 0.9 applied everywhere for SR1 and SR2
]
vbswh = PandasAnalysis(
    sig_root_files=sig_babies,
    bkg_root_files=[b for b in bkg_babies if "VBSWH_SM" not in b],
    data_root_files=data_babies,
    ttree_name="tree",
    weight_columns=weight_columns,
    reweight_column="reweights"
)

# IMPORTANT: get_systs() divides by a nominal weight column; "unity" is passed as that
# column for variations that are stored as plain multiplicative weights.
vbswh.df["unity"] = 1

# Preselections (with and without the delta-eta(jj) requirement)
presel_base = "passes_bveto and M_jj > 500"
presel_tail = "hbbjet_score > 0.3"
vbswh.df["presel_noDetaJJ"] = vbswh.df.eval(f"{presel_base} and {presel_tail}")
vbswh.df["presel"] = vbswh.df.eval(f"{presel_base} and abs(deta_jj) > 3 and {presel_tail}")

# Every remaining region shares one selection and differs only in the ST branch, the ST
# threshold, and the direction of the delta-eta(jj) and soft-drop-mass cuts.
region_template = (
    "presel_noDetaJJ and M_jj > 600 and {st} > {st_min} and hbbjet_score > 0.9 "
    "and abs(deta_jj) {deta} 4 and hbbjet_msoftdrop {msd} 150"
)
region_definitions = [
    # (column,   ST branch, ST min, deta op, msoftdrop op)
    ("SR2",     "ST",    1500, ">",  "<"),
    ("SR1",     "ST",     900, ">",  "<"),
    ("SR2_up",  "ST_up", 1500, ">",  "<"),
    ("SR1_up",  "ST_up",  900, ">",  "<"),
    ("SR2_dn",  "ST_dn", 1500, ">",  "<"),
    ("SR1_dn",  "ST_dn",  900, ">",  "<"),
    ("regionA", "ST",     900, ">",  ">="),
    ("regionB", "ST",     900, "<=", ">="),
    ("regionC", "ST",     900, "<=", "<"),
]
for region_name, st_branch, st_min, deta_op, msd_op in region_definitions:
    vbswh.df[region_name] = vbswh.df.eval(
        region_template.format(st=st_branch, st_min=st_min, deta=deta_op, msd=msd_op)
    )

SIGNAL_REGIONS = ["SR1", "SR2"]

Loading sig babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.76it/s]
Loading bkg babies: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:38<00:00,  4.78s/it]
Loading data babies: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]


# Utilities

In [4]:
class Systematic:
    """A named systematic uncertainty with a list of fractional values per signal region.

    Attributes
    ----------
    name : label used as the table row name / datacard nuisance name
    signal_regions : list of signal-region names, in positional order
    systs : dict mapping each signal-region name to its list of values
    """

    def __init__(self, name, signal_regions):
        self.name = name
        # Materialize as a list: add_systs() indexes regions by position, which
        # iterables such as dict.keys() views do not support.
        self.signal_regions = list(signal_regions)
        self.systs = {signal_region: [] for signal_region in self.signal_regions}

    def copy(self, name):
        """Return an independent copy of this systematic under a new name."""
        new_systs = Systematic(name, signal_regions=self.signal_regions)
        # Copy the per-region lists so that later additions to either object
        # do not silently show up in the other one.
        new_systs.systs = {SR: list(values) for SR, values in self.systs.items()}
        return new_systs

    def add_syst(self, syst, signal_region):
        """Append a single value to the given signal region."""
        self.systs[signal_region].append(syst)

    def add_systs(self, systs, signal_region=None):
        """Add several values at once.

        With ``signal_region`` given, all values are appended to that region.
        Otherwise the i-th value is appended to the i-th signal region.
        """
        if signal_region is not None:
            self.systs[signal_region].extend(systs)
        else:
            for syst_i, syst in enumerate(systs):
                self.systs[self.signal_regions[syst_i]].append(syst)

    def get_systs(self):
        """Return the dict of per-region value lists (not a copy)."""
        return self.systs

    def get_systs_str(self, signal_region=None):
        """Format the values as percentages.

        Returns a single string ("x%" for one value, "min% - max%" otherwise) when
        ``signal_region`` is given, else a dict of such strings for every region.
        """
        if signal_region is not None:
            systs = self.systs[signal_region]
            if len(systs) == 1:
                return f"{systs[0]:0.1%}"
            else:
                return f"{min(systs):0.1%} - {max(systs):0.1%}"
        else:
            return {SR: self.get_systs_str(signal_region=SR) for SR in self.signal_regions}
        
class SystematicsTable:
    """Ordered collection of systematics that can be exported as a table.

    Each row is an object exposing ``name``, ``get_systs()`` and ``get_systs_str()``
    (see the Systematic class in this notebook).
    """

    def __init__(self, systs=None, samples=None):
        self.systs = systs or []
        self.samples = samples or []

    def add_row(self, syst):
        """Append one systematic as the last row of the table."""
        self.systs.append(syst)

    def to_dataframe(self, columns=None):
        """Return a DataFrame with a "Systematic" column plus one column per signal region.

        ``columns`` optionally restricts the signal-region columns; the caller's
        list is left unmodified.
        """
        rows = []
        for syst in self.systs:
            row = {"Systematic": syst.name}
            row.update(syst.get_systs_str())
            rows.append(row)
        df = pd.DataFrame(rows)
        if columns:
            # Build a new list instead of inserting into the caller's one
            return df[["Systematic", *columns]]
        else:
            return df

    def to_csv(self, columns=None, output_csv=None):
        """Return the table as CSV text, or write it to ``output_csv`` if given."""
        df = self.to_dataframe(columns=columns)
        csv = df.to_csv(index=False)
        if output_csv:
            with open(output_csv, "w") as csv_out:
                csv_out.write(csv)
        else:
            return csv

    def to_latex(self, columns=None, output_tex=None):
        """Return the table as LaTeX text, or write it to ``output_tex`` if given."""
        # Convert to Pandas DataFrame
        df = self.to_dataframe(columns=columns)

        # Convert to LaTeX; one left-aligned label column plus one centered column
        # for every remaining column actually present in the table
        column_format = "l" + "c"*max(len(df.columns) - 1, 0)
        latex = (df.style
                   .hide(axis="index")
                   .to_latex(column_format=column_format, position="H")
                   .replace("%", "\\%"))
        # Insert hlines and centering; the indices are positions in the line list
        # produced by to_latex above
        latex = latex.split("\n")
        latex.insert(3, "\\hline")
        latex.insert(2, "\\hline\n\\hline")
        latex.insert(1, "\\begin{center}")
        latex.insert(-3, "\\hline\n\\hline")
        latex.insert(-2, "\\end{center}")
        latex = "\n".join(latex)

        if output_tex:
            with open(output_tex, "w") as tex_out:
                tex_out.write(latex)
        else:
            return latex

    def to_datacard_json(self, signal_regions=None, output_json=None):
        """Return ``{name: [1 + max(values) per signal region]}`` for every row.

        ``signal_regions`` selects and orders the regions; when omitted, every
        region stored on the row is used. The dict is also dumped to
        ``output_json`` if given.
        """
        datacard_systs = {}
        for syst in self.systs:
            labeled_systs = syst.get_systs()
            regions = signal_regions if signal_regions is not None else list(labeled_systs)
            datacard_systs[syst.name] = [1 + max(labeled_systs[SR]) for SR in regions]

        if output_json:
            with open(output_json, "w") as json_out:
                json.dump(datacard_systs, json_out)

        return datacard_systs
    
def get_year_str(year, doAPV=True):
    """Turn a numeric year code into its string label.

    With ``doAPV`` enabled, -2016 maps to "2016preVFP" and 2016 to "2016postVFP";
    every other input (or any input with ``doAPV`` disabled) becomes the string
    of its absolute value.
    """
    if doAPV:
        apv_labels = {-2016: "2016preVFP", 2016: "2016postVFP"}
        if year in apv_labels:
            return apv_labels[year]
    return str(abs(year))

# Scale factors

In [5]:
# Signal tables: one is rendered as a LaTeX/CSV table, the other feeds the datacard
SIG_SYSTS_TABLE = SystematicsTable(samples=["VBSWH_mkW"])
SIG_SYSTS_LIMIT = SystematicsTable(samples=["VBSWH_mkW"])

# Background sample names are the baby file names without directory and extension.
# NOTE(review): bkg_babies still contains VBSWH_SM, which is filtered out of the
# backgrounds given to PandasAnalysis -- confirm it should be looped over here.
bkg_sample_names = [
    baby_path.rsplit("/", 1)[-1].replace(".root", "") for baby_path in bkg_babies
]
BKG_SYSTS_TABLE = SystematicsTable(samples=bkg_sample_names)

In [6]:
def get_systs(sample_name, signal_regions, sf, *sf_variations, year=None):
    """Relative yield change from swapping a weight for each of its variations.

    Parameters
    ----------
    sample_name : sample passed to ``vbswh.sample_df``
    signal_regions : iterable of selection names, one result per entry
    sf : column holding the nominal weight that is divided out of ``event_weight``
    *sf_variations : columns holding the varied weights that are multiplied in
    year : if given, only events with ``df.year == year`` are varied; events from
        all other years keep their nominal weight

    Returns
    -------
    list with, for every signal region, the largest |nominal - varied|/nominal
    over all variations.

    Relies on the notebook-level ``vbswh`` analysis object.
    NOTE(review): a sample with zero nominal yield in a region gives 0/0 here
    (NaN plus a RuntimeWarning) -- decide how such samples should be treated.
    """
    systs = []
    for SR in signal_regions:
        df = vbswh.sample_df(name=sample_name, selection=SR)
        # Get nominal
        count = df.event_weight.sum()
        if year is not None:
            year_df = df[df.year == year]
            # Yield of the untouched years is the same for every variation
            other_years_count = np.sum(df[df.year != year].event_weight)
        # Get delta = nominal - variation for each variation
        deltas = []
        for sf_var in sf_variations:
            if year is not None:
                count_var = (
                    np.sum(year_df.event_weight/year_df[sf]*year_df[sf_var])
                    + other_years_count
                )
            else:
                count_var = np.sum(df.event_weight/df[sf]*df[sf_var])
            deltas.append(abs((count - count_var)/count))

        systs.append(max(deltas))

    return systs

def get_systs_nonSF(sample_name, signal_regions, signal_regions_up, signal_regions_dn):
    """Relative yield change between a nominal selection and its up/down variants.

    The i-th entries of ``signal_regions``, ``signal_regions_up`` and
    ``signal_regions_dn`` belong together. For each triple the weighted yields of
    ``sample_name`` are compared and the larger of the two absolute relative
    differences is returned, one value per signal region.
    """
    def weighted_yield(selection):
        # Sum of event weights of the sample in the given selection
        return vbswh.sample_df(name=sample_name, selection=selection).event_weight.sum()

    results = []
    for region_index, nominal_selection in enumerate(signal_regions):
        up_selection = signal_regions_up[region_index]
        dn_selection = signal_regions_dn[region_index]

        nominal_yield = weighted_yield(nominal_selection)
        up_yield = weighted_yield(up_selection)
        dn_yield = weighted_yield(dn_selection)

        relative_up = abs((up_yield - nominal_yield)/nominal_yield)
        relative_dn = abs((nominal_yield - dn_yield)/nominal_yield)
        results.append(max(relative_up, relative_dn))

    return results

### PDF uncertainty

In [7]:
# Accumulate, over all NANOGEN signal files, the generator sum of weights and the
# sum of weights for each of the 101 stored PDF entries.
root_files = glob.glob("/ceph/cms/store/user/jguiang/VBSVHSkim/sig_1lep_1ak8_2ak4_pku/VBSWH_mkW*NANOGEN*/merged.root")
if not root_files:
    # Without inputs gen_sum stays 0 and every PDF systematic would silently become NaN
    raise FileNotFoundError("no VBSWH_mkW NANOGEN files found for the PDF normalization")

gen_sum = 0
pdf_sum = np.zeros(101)
for root_file in root_files:
    with uproot.open(root_file) as f:
        gen_sums = f["Runs"]["genEventSumw"].array(library="np")
        pdf_sums = f["Runs"]["LHEPdfSumw"].array(library="np")
        # Runs entries without exactly 101 PDF sums cannot be stacked; their
        # generator sum is added to every PDF entry instead
        missed = np.array([len(s) != 101 for s in pdf_sums], dtype=bool)
        missed_sum = np.sum(gen_sums[missed])
        if (~missed).any():
            reshaped = np.vstack(pdf_sums[~missed])
            pdf_sum += np.dot(gen_sums[~missed], reshaped) + missed_sum
        else:
            # np.vstack raises on an empty sequence
            pdf_sum += missed_sum
        gen_sum += np.sum(gen_sums)

# Per-entry ratio used below to rescale each varied yield
pdf_ratio = pdf_sum/gen_sum
# pdf_ratio = pdf_sum/pdf_sum[0] # pdf_sum[0] should be equal to gen_sum, but isn't right now...

# Same output directory as the babies loaded at the top of the notebook
with uproot.open(f"../analysis/studies/vbswh/output_{TAG}/Run2/VBSWH_mkW.root") as f:
    df = f.get("pdf_tree").arrays(library="pd")

systs = []
for signal_region in SIGNAL_REGIONS:
    SR = df.eval(signal_region)
    sr_df = df[SR] # slice once instead of once per PDF entry
    count = np.sum(sr_df.event_weight*sr_df.lhe_pdf_0)
    deltas = []
    for i in range(1, 101):
        count_var = np.sum(sr_df.event_weight*sr_df[f"lhe_pdf_{i}"])
        deltas.append(count - count_var/pdf_ratio[i])

    # Quadrature sum of the 100 shifts, relative to the nominal yield
    deltas = np.array(deltas)
    systs.append(np.sqrt(np.sum(deltas**2))/count)

pdf_systs = Systematic("PDF variations", SIGNAL_REGIONS)
pdf_systs.add_systs(systs)
SIG_SYSTS_TABLE.add_row(pdf_systs)
SIG_SYSTS_LIMIT.add_row(pdf_systs.copy("pdf_vars"))

### $\alpha_S$ uncertainty

In [8]:
# vbswh.df["alphaS_corr_up"] = vbswh.df.alphaS_up
# vbswh.df["alphaS_corr_dn"] = vbswh.df.alphaS_dn

# vbswh.df.loc[vbswh.df.is_signal, "alphaS_corr_up"] = vbswh.df[vbswh.df.is_signal].alphaS_up/pdf_ratio[101]
# vbswh.df.loc[vbswh.df.is_signal, "alphaS_corr_dn"] = vbswh.df[vbswh.df.is_signal].alphaS_dn/pdf_ratio[102]

# alphaS_systs = Systematic("$\\alpha_S$ unc.", SIGNAL_REGIONS)
# alphaS_systs.add_systs(
#     get_systs("VBSWH_mkW", SIGNAL_REGIONS, "unity", "alphaS_corr_dn", "alphaS_corr_up")
# )
# SIG_SYSTS_TABLE.add_row(alphaS_systs)
# SIG_SYSTS_LIMIT.add_row(alphaS_systs.copy("alphaS_unc"))

### LHE scale weights

In [9]:
# Weight columns are picked by the scale that appears with the "1p0" tag in their name
# (presumably the scale held at nominal while the other one is varied -- confirm).
df_columns = vbswh.df.columns
lhe_muR_weights = df_columns[df_columns.str.contains("muF1p0")].tolist()
lhe_muF_weights = df_columns[df_columns.str.contains("muR1p0")].tolist()

muR_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "unity", *lhe_muR_weights)
muR_systs = Systematic("$\\mu_R$ scale", SIGNAL_REGIONS)
muR_systs.add_systs(muR_values)

muF_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "unity", *lhe_muF_weights)
muF_systs = Systematic("$\\mu_F$ scale", SIGNAL_REGIONS)
muF_systs.add_systs(muF_values)

# Only the muF variations are recorded; muR variations have no effect:
# SIG_SYSTS_TABLE.add_row(muR_systs)
# SIG_SYSTS_LIMIT.add_row(muR_systs.copy("muR_scale"))
SIG_SYSTS_TABLE.add_row(muF_systs)
SIG_SYSTS_LIMIT.add_row(muF_systs.copy("muF_scale"))

### Parton shower weights

In [10]:
# Weight columns are picked by the shower component that appears with the "1p0" tag
# (presumably the component held at nominal while the other is varied -- confirm).
ps_columns = vbswh.df.columns
isr_weights = ps_columns[ps_columns.str.contains("fsr1p0")].tolist()
fsr_weights = ps_columns[ps_columns.str.contains("isr1p0")].tolist()

isr_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "unity", *isr_weights)
isr_sf_systs = Systematic("Parton shower ISR weights", SIGNAL_REGIONS)
isr_sf_systs.add_systs(isr_values)

fsr_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "unity", *fsr_weights)
fsr_sf_systs = Systematic("Parton shower FSR weights", SIGNAL_REGIONS)
fsr_sf_systs.add_systs(fsr_values)

# Display table rows first, then the renamed copies for the limit table
for ps_systs in (isr_sf_systs, fsr_sf_systs):
    SIG_SYSTS_TABLE.add_row(ps_systs)
SIG_SYSTS_LIMIT.add_row(isr_sf_systs.copy("isr_weights"))
SIG_SYSTS_LIMIT.add_row(fsr_sf_systs.copy("fsr_weights"))

### Pileup reweighting

In [11]:
# Pileup reweighting: collect the per-region values of every background sample in one row
bkg_pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)

for sample_name in BKG_SYSTS_TABLE.samples:
    sample_values = get_systs(sample_name, SIGNAL_REGIONS, "pu_sf", "pu_sf_dn", "pu_sf_up")
    bkg_pu_sf_systs.add_systs(sample_values)

BKG_SYSTS_TABLE.add_row(bkg_pu_sf_systs)

  deltas.append(abs((count - count_var)/count))


In [12]:
# Pileup reweighting on signal: one row for the table, a renamed copy for the limit
pu_sf_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "pu_sf", "pu_sf_dn", "pu_sf_up")
pu_sf_systs = Systematic("Pileup reweighting", SIGNAL_REGIONS)
pu_sf_systs.add_systs(pu_sf_values)
SIG_SYSTS_TABLE.add_row(pu_sf_systs)
SIG_SYSTS_LIMIT.add_row(pu_sf_systs.copy("pu_rwgt"))

### Pileup jet ID scale factors

In [13]:
# Pileup jet ID scale factor on signal: one row for the table, a renamed copy for the limit
puid_sf_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "puid_sf", "puid_sf_dn", "puid_sf_up")
puid_sf_systs = Systematic("Pileup jet ID", SIGNAL_REGIONS)
puid_sf_systs.add_systs(puid_sf_values)
SIG_SYSTS_TABLE.add_row(puid_sf_systs)
SIG_SYSTS_LIMIT.add_row(puid_sf_systs.copy("puid_sf"))

### L1 Prefiring weight

In [14]:
# L1 pre-fire weight: collect the per-region values of every background sample in one row
bkg_prefire_sf_systs = Systematic("L1 pre-fire corrections", SIGNAL_REGIONS)

for sample_name in BKG_SYSTS_TABLE.samples:
    sample_values = get_systs(
        sample_name, SIGNAL_REGIONS, "prefire_sf", "prefire_sf_up", "prefire_sf_dn"
    )
    bkg_prefire_sf_systs.add_systs(sample_values)

BKG_SYSTS_TABLE.add_row(bkg_prefire_sf_systs)

  deltas.append(abs((count - count_var)/count))


In [15]:
# L1 pre-fire weight on signal: one row for the table, a renamed copy for the limit
prefire_sf_values = get_systs(
    "VBSWH_mkW", SIGNAL_REGIONS, "prefire_sf", "prefire_sf_up", "prefire_sf_dn"
)
prefire_sf_systs = Systematic("L1 pre-fire corrections", SIGNAL_REGIONS)
prefire_sf_systs.add_systs(prefire_sf_values)
SIG_SYSTS_TABLE.add_row(prefire_sf_systs)
SIG_SYSTS_LIMIT.add_row(prefire_sf_systs.copy("L1_prefire"))

### HLT scale factors

In [16]:
# Trigger scale factor on signal: one row for the table, a renamed copy for the limit
trig_sf_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "trig_sf", "trig_sf_up", "trig_sf_dn")
trig_sf_systs = Systematic("HLT scale factors", SIGNAL_REGIONS)
trig_sf_systs.add_systs(trig_sf_values)
SIG_SYSTS_TABLE.add_row(trig_sf_systs)
SIG_SYSTS_LIMIT.add_row(trig_sf_systs.copy("hlt_sfs"))

### Statistical uncertainty

In [17]:
# Statistical error of each background sample relative to the total background
# yield in the same signal region.
bkg_stat_systs = Systematic("Simulation stat. unc.", SIGNAL_REGIONS)
for sample_name in BKG_SYSTS_TABLE.samples:
    relative_errors = []
    for region in ("SR1", "SR2"):
        sample_error = vbswh.bkg_error(selection=f"{region} and name == '{sample_name}'")
        total_bkg_count = vbswh.bkg_count(selection=region)
        relative_errors.append(sample_error/total_bkg_count)
    bkg_stat_systs.add_systs(relative_errors)
BKG_SYSTS_TABLE.add_row(bkg_stat_systs)

In [18]:
# Relative statistical error of the signal yield in each signal region
sig_relative_errors = []
for region in ("SR1", "SR2"):
    region_error = vbswh.sig_error(selection=region)
    region_count = vbswh.sig_count(selection=region)
    sig_relative_errors.append(region_error/region_count)

stat_systs = Systematic("Simulation stat. unc.", SIGNAL_REGIONS)
stat_systs.add_systs(sig_relative_errors)
SIG_SYSTS_TABLE.add_row(stat_systs)
SIG_SYSTS_LIMIT.add_row(stat_systs.copy("mc_stat"))

### Lepton scale factors

In [19]:
# Lepton ID scale factor: collect the per-region values of every background sample in one row
bkg_lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)

for sample_name in BKG_SYSTS_TABLE.samples:
    sample_values = get_systs(
        sample_name, SIGNAL_REGIONS, "lep_id_sf", "lep_id_sf_up", "lep_id_sf_dn"
    )
    bkg_lep_sf_systs.add_systs(sample_values)

BKG_SYSTS_TABLE.add_row(bkg_lep_sf_systs)

  deltas.append(abs((count - count_var)/count))


In [20]:
# Evaluate each lepton scale-factor variation once and reuse the result for both
# the combined display row and the individual limit rows.
lep_id_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "lep_id_sf", "lep_id_sf_up", "lep_id_sf_dn")
elec_reco_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "elec_reco_sf", "elec_reco_sf_up", "elec_reco_sf_dn")
muon_iso_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "muon_iso_sf", "muon_iso_sf_up", "muon_iso_sf_dn")

# Display table: a single row spanning all three lepton scale factors
lep_sf_systs = Systematic("Lepton scale factors", SIGNAL_REGIONS)
lep_sf_systs.add_systs(lep_id_values)
lep_sf_systs.add_systs(elec_reco_values)
lep_sf_systs.add_systs(muon_iso_values)
SIG_SYSTS_TABLE.add_row(lep_sf_systs)

# Limit table: one separate row per scale factor
lep_id_sf_systs = Systematic("lep_id", SIGNAL_REGIONS)
lep_id_sf_systs.add_systs(lep_id_values)
SIG_SYSTS_LIMIT.add_row(lep_id_sf_systs)

elec_reco_sf_systs = Systematic("elec_reco", SIGNAL_REGIONS)
elec_reco_sf_systs.add_systs(elec_reco_values)
SIG_SYSTS_LIMIT.add_row(elec_reco_sf_systs)

muon_iso_sf_systs = Systematic("muon_iso", SIGNAL_REGIONS)
muon_iso_sf_systs.add_systs(muon_iso_values)
SIG_SYSTS_LIMIT.add_row(muon_iso_sf_systs)

### ParticleNet Xbb scale factors

In [21]:
xbb_sf_systs = Systematic("ParticleNet Xbb scale factors", SIGNAL_REGIONS)
# xbb_sf_systs.add_systs(
#     get_systs("VBSWH_mkW", SIGNAL_REGIONS, "xbb_sf", "xbb_sf_dn", "xbb_sf_up")
# )
# SIG_SYSTS_TABLE.add_row(xbb_sf_systs)
# SIG_SYSTS_LIMIT.add_row(xbb_sf_systs.copy("xbb_sfs"))

# The Xbb scale factor is varied one year at a time: each year becomes its own
# row in the limit table, while the display table gets one row over all years.
for year in [-2016, 2016, 2017, 2018]:
    # Evaluate the variation once per year and reuse it for both rows
    year_values = get_systs(
        "VBSWH_mkW", SIGNAL_REGIONS, "xbb_sf", "xbb_sf_dn", "xbb_sf_up", year=year
    )
    xbb_sf_systs_ = Systematic(f"xbb_sfs_{get_year_str(year)}", SIGNAL_REGIONS)
    xbb_sf_systs_.add_systs(year_values)
    xbb_sf_systs.add_systs(year_values)
    SIG_SYSTS_LIMIT.add_row(xbb_sf_systs_)

SIG_SYSTS_TABLE.add_row(xbb_sf_systs)

### DeepJet b-tagging scale factors

In [22]:
# b-tagging scale factor: collect the per-region values of every background sample in one row
bkg_btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)

for sample_name in BKG_SYSTS_TABLE.samples:
    sample_values = get_systs(sample_name, SIGNAL_REGIONS, "btag_sf", "btag_sf_dn", "btag_sf_up")
    bkg_btag_sf_systs.add_systs(sample_values)

BKG_SYSTS_TABLE.add_row(bkg_btag_sf_systs)

  deltas.append(abs((count - count_var)/count))


In [23]:
# b-tagging scale factor on signal: one row for the table, a renamed copy for the limit
btag_sf_values = get_systs("VBSWH_mkW", SIGNAL_REGIONS, "btag_sf", "btag_sf_dn", "btag_sf_up")
btag_sf_systs = Systematic("DeepJet b-tagging scale factors", SIGNAL_REGIONS)
btag_sf_systs.add_systs(btag_sf_values)
SIG_SYSTS_TABLE.add_row(btag_sf_systs)
SIG_SYSTS_LIMIT.add_row(btag_sf_systs.copy("btag_sfs"))

### MET uncertainty

In [24]:
# Compare the nominal signal regions with their ST_up / ST_dn counterparts for
# every background sample and collect the results in one row.
bkg_met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)

for sample_name in BKG_SYSTS_TABLE.samples:
    bkg_met_unc_systs.add_systs(
        # Keywords keep the up/down lists matched to the parameters of the same name;
        # the result is symmetric in up/down, so the values are unchanged.
        get_systs_nonSF(
            sample_name,
            SIGNAL_REGIONS,
            signal_regions_up=["SR1_up", "SR2_up"],
            signal_regions_dn=["SR1_dn", "SR2_dn"]
        )
    )

BKG_SYSTS_TABLE.add_row(bkg_met_unc_systs)

  up_perc = abs((count_up - count)/count)
  dn_perc = abs((count - count_dn)/count)


In [25]:
# Compare the nominal signal regions with their ST_up / ST_dn counterparts for signal
met_unc_systs = Systematic("MET unc.", SIGNAL_REGIONS)
met_unc_systs.add_systs(
    # Keywords keep the up/down lists matched to the parameters of the same name;
    # the result is symmetric in up/down, so the values are unchanged.
    get_systs_nonSF(
        "VBSWH_mkW",
        SIGNAL_REGIONS,
        signal_regions_up=["SR1_up", "SR2_up"],
        signal_regions_dn=["SR1_dn", "SR2_dn"]
    )
)
SIG_SYSTS_TABLE.add_row(met_unc_systs)
SIG_SYSTS_LIMIT.add_row(met_unc_systs.copy("met_unc"))

# Jet energy

In [26]:
def get_jet_energy_systs(nominal_cflow, up_cflow, dn_cflow, signal_regions, name):
    """Build a Systematic from nominal / up / down cutflow files.

    Parameters
    ----------
    nominal_cflow, up_cflow, dn_cflow : paths of the cutflow files to compare
    signal_regions : dict mapping signal-region name -> name of the cut in the
        cutflow whose ``n_pass`` defines that region's yield
    name : name given to the returned Systematic

    Returns
    -------
    Systematic holding, for each signal region, the larger of the absolute
    relative up and down shifts.
    """
    nominal_cutflow = Cutflow.from_file(nominal_cflow)
    up_cutflow = Cutflow.from_file(up_cflow)
    dn_cutflow = Cutflow.from_file(dn_cflow)

    # Relative shifts, computed with the Cutflow arithmetic operators
    syst_up_cutflow = (up_cutflow - nominal_cutflow)/nominal_cutflow
    syst_dn_cutflow = (nominal_cutflow - dn_cutflow)/nominal_cutflow

    # Pass a real list: Systematic indexes its regions by position
    systs = Systematic(name, list(signal_regions))
    for SR, cut_name in signal_regions.items():
        systs.add_syst(
            # Compare magnitudes, as get_systs_nonSF does; comparing the signed
            # shifts would pick the smaller effect whenever both are negative
            max(
                abs(syst_up_cutflow[cut_name].n_pass),
                abs(syst_dn_cutflow[cut_name].n_pass)
            ),
            signal_region=SR
        )

    return systs

### Jet energy corrections

In [27]:
# Jet energy scale: gather the per-region values of every background sample in one row
jec_systs = Systematic("Jet energy scale", ["SR1", "SR2"])

for sample_name in BKG_SYSTS_TABLE.samples:
    cflow_file = f"Run2/{sample_name}_cutflow.cflow"
    systs = get_jet_energy_systs(
        f"../analysis/studies/vbswh/output_{TAG}/{cflow_file}",
        f"../analysis/studies/vbswh/output_{TAG}_jec_up/{cflow_file}",
        f"../analysis/studies/vbswh/output_{TAG}_jec_dn/{cflow_file}",
        {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
        "Jet energy scale"
    )
    sample_values = systs.get_systs()
    for SR in sample_values:
        jec_systs.add_systs(sample_values[SR], signal_region=SR)

BKG_SYSTS_TABLE.add_row(jec_systs)

In [28]:
# Jet energy scale on signal: one row for the table, a renamed copy for the limit
sig_cflow_file = "Run2/VBSWH_mkW_cutflow.cflow"
jec_systs = get_jet_energy_systs(
    f"../analysis/studies/vbswh/output_{TAG}/{sig_cflow_file}",
    f"../analysis/studies/vbswh/output_{TAG}_jec_up/{sig_cflow_file}",
    f"../analysis/studies/vbswh/output_{TAG}_jec_dn/{sig_cflow_file}",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy scale"
)
SIG_SYSTS_TABLE.add_row(jec_systs)
SIG_SYSTS_LIMIT.add_row(jec_systs.copy("jes"))

### Jet energy resolution

In [29]:
# Jet energy resolution: gather the per-region values of every background sample in one row
jer_systs = Systematic("Jet energy resolution", ["SR1", "SR2"])

for sample_name in BKG_SYSTS_TABLE.samples:
    cflow_file = f"Run2/{sample_name}_cutflow.cflow"
    systs = get_jet_energy_systs(
        f"../analysis/studies/vbswh/output_{TAG}/{cflow_file}",
        f"../analysis/studies/vbswh/output_{TAG}_jer_up/{cflow_file}",
        f"../analysis/studies/vbswh/output_{TAG}_jer_dn/{cflow_file}",
        {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
        "Jet energy resolution"
    )
    sample_values = systs.get_systs()
    for SR in sample_values:
        jer_systs.add_systs(sample_values[SR], signal_region=SR)

BKG_SYSTS_TABLE.add_row(jer_systs)

In [30]:
# Jet energy resolution on signal: one row for the table, a renamed copy for the limit
sig_cflow_file = "Run2/VBSWH_mkW_cutflow.cflow"
jer_systs = get_jet_energy_systs(
    f"../analysis/studies/vbswh/output_{TAG}/{sig_cflow_file}",
    f"../analysis/studies/vbswh/output_{TAG}_jer_up/{sig_cflow_file}",
    f"../analysis/studies/vbswh/output_{TAG}_jer_dn/{sig_cflow_file}",
    {"SR1": "XbbGt0p9_MSDLt150", "SR2": "STGt1500"},
    "Jet energy resolution"
)
SIG_SYSTS_TABLE.add_row(jer_systs)
SIG_SYSTS_LIMIT.add_row(jer_systs.copy("jer"))

# Other

### Luminosity

In [31]:
# Flat luminosity uncertainty, identical in both signal regions. The same object
# goes into both display tables; the limit table gets a renamed copy.
lumi_systs = Systematic("Luminosity", SIGNAL_REGIONS)
lumi_systs.add_systs([0.016] * 2)
for display_table in (BKG_SYSTS_TABLE, SIG_SYSTS_TABLE):
    display_table.add_row(lumi_systs)
SIG_SYSTS_LIMIT.add_row(lumi_systs.copy("lumi"))

### H to bb BR uncertainty

In [32]:
# Flat H->bb branching-ratio uncertainty, identical in both signal regions. The same
# object goes into both display tables; the limit table gets a renamed copy.
hbb_br_systs = Systematic("\\Htobb BR", SIGNAL_REGIONS)
hbb_br_systs.add_systs([0.0127] * 2)
for display_table in (BKG_SYSTS_TABLE, SIG_SYSTS_TABLE):
    display_table.add_row(hbb_br_systs)
SIG_SYSTS_LIMIT.add_row(hbb_br_systs.copy("hbb_br"))

In [33]:
# Display the background systematics table, restricted to the SR1 column
BKG_SYSTS_TABLE.to_dataframe(columns=["SR1"])

Unnamed: 0,Systematic,SR1
0,Pileup reweighting,0.9% - 11.9%
1,L1 pre-fire corrections,0.4% - 1.3%
2,Simulation stat. unc.,0.0% - 2.3%
3,Lepton scale factors,0.9% - 2.1%
4,DeepJet b-tagging scale factors,0.2% - 2.9%
5,MET unc.,0.2% - 15.6%
6,Jet energy scale,8.9% - 21.3%
7,Jet energy resolution,1.3% - 7.4%
8,Luminosity,1.6%
9,\Htobb BR,1.3%


In [34]:
# Display the signal systematics table, restricted to the SR1 column
SIG_SYSTS_TABLE.to_dataframe(columns=["SR1"])

Unnamed: 0,Systematic,SR1
0,PDF variations,2.2%
1,$\mu_F$ scale,17.7%
2,Parton shower ISR weights,0.2%
3,Parton shower FSR weights,1.5%
4,Pileup reweighting,0.1%
5,Pileup jet ID,1.0%
6,L1 pre-fire corrections,1.0%
7,HLT scale factors,0.8%
8,Simulation stat. unc.,2.2%
9,Lepton scale factors,0.0% - 1.6%


In [35]:
# Print the SR1 signal systematics as a LaTeX table (print keeps the newlines readable)
print(SIG_SYSTS_TABLE.to_latex(columns=["SR1"]))

\begin{table}[H]
\begin{center}
\begin{tabular}{lcc}
\hline
\hline
Systematic & SR1 \\
\hline
PDF variations & 2.2\% \\
$\mu_F$ scale & 17.7\% \\
Parton shower ISR weights & 0.2\% \\
Parton shower FSR weights & 1.5\% \\
Pileup reweighting & 0.1\% \\
Pileup jet ID & 1.0\% \\
L1 pre-fire corrections & 1.0\% \\
HLT scale factors & 0.8\% \\
Simulation stat. unc. & 2.2\% \\
Lepton scale factors & 0.0\% - 1.6\% \\
ParticleNet Xbb scale factors & 1.0\% - 1.9\% \\
DeepJet b-tagging scale factors & 0.3\% \\
MET unc. & 0.3\% \\
Jet energy scale & 6.4\% \\
Jet energy resolution & 0.6\% \\
Luminosity & 1.6\% \\
\Htobb BR & 1.3\% \\
\hline
\hline
\end{tabular}
\end{center}
\end{table}



In [36]:
# Print the SR1 signal systematics as CSV text
print(SIG_SYSTS_TABLE.to_csv(columns=["SR1"]))

Systematic,SR1
PDF variations,2.2%
$\mu_F$ scale,17.7%
Parton shower ISR weights,0.2%
Parton shower FSR weights,1.5%
Pileup reweighting,0.1%
Pileup jet ID,1.0%
L1 pre-fire corrections,1.0%
HLT scale factors,0.8%
Simulation stat. unc.,2.2%
Lepton scale factors,0.0% - 1.6%
ParticleNet Xbb scale factors,1.0% - 1.9%
DeepJet b-tagging scale factors,0.3%
MET unc.,0.3%
Jet energy scale,6.4%
Jet energy resolution,0.6%
Luminosity,1.6%
\Htobb BR,1.3%



In [37]:
class Datacard:
    """Plain-text counting-experiment datacard with lnN nuisance parameters.

    Parameters
    ----------
    obs : list of observed counts (ints), one per bin
    sig : dict mapping signal label -> list of yields, one per bin
    bkg : dict mapping background label -> list of yields, one per bin
    systs : dict mapping sample label -> {nuisance label: list of values, one per bin};
        every sample label must be one of the signal or background labels

    The rendered text is stored in ``content``. Columns are laid out bin by bin,
    and within each bin signals come first, then backgrounds.
    """

    def __init__(self, obs, sig, bkg, systs):
        self.obs = obs
        self.n_obs = len(obs)

        # Yields are stored per bin: self.sig_yields[bin][signal index]
        self.sig_labels = []
        self.sig_yields = [[] for _ in range(self.n_obs)]
        for sig_label, sig_yields in sig.items():
            self.sig_labels.append(sig_label)
            for bin_i, sig_yield in enumerate(sig_yields):
                self.sig_yields[bin_i].append(sig_yield)

        self.bkg_labels = []
        self.bkg_yields = [[] for _ in range(self.n_obs)]
        for bkg_label, bkg_yields in bkg.items():
            self.bkg_labels.append(bkg_label)
            for bin_i, bkg_yield in enumerate(bkg_yields):
                self.bkg_yields[bin_i].append(bkg_yield)

        self.n_sig = len(self.sig_labels)
        self.n_bkg = len(self.bkg_labels)

        # Column width: widest process label plus padding, at least 12 characters
        self.column_width = max([len(l) for l in self.sig_labels + self.bkg_labels], default=0)+2
        self.column_width = max(self.column_width, 12)

        self.syst_labels = []
        self.systs = []
        n_samples = self.n_sig + self.n_bkg
        w = self.column_width
        for sample_label, labeled_systs in systs.items():
            for syst_label, syst_values in labeled_systs.items():
                # Register syst; every (bin, process) cell starts out as "-"
                if syst_label not in self.syst_labels:
                    self.syst_labels.append(syst_label)
                    self.systs.append([f"{'-':>{w}}" for _ in range(self.n_obs*(n_samples))])

                # Get index of syst label
                label_i = self.syst_labels.index(syst_label)

                # Assign syst values; cell index = process index + bin index * n_samples
                if sample_label in self.sig_labels:
                    for value_i, syst_value in enumerate(syst_values):
                        syst_i = self.sig_labels.index(sample_label)+value_i*n_samples
                        self.systs[label_i][syst_i] = f"{syst_value:{w}.4f}"
                elif sample_label in self.bkg_labels:
                    for value_i, syst_value in enumerate(syst_values):
                        syst_i = self.n_sig+self.bkg_labels.index(sample_label)+value_i*n_samples
                        self.systs[label_i][syst_i] = f"{syst_value:{w}.4f}"
                else:
                    raise Exception(f"{sample_label} not found")

        # Width of the first column; default=0 keeps a card without systematics valid
        self.header_width = max([len(l) for l in self.syst_labels], default=0)+2
        self.header_width = max(self.header_width, 12)

        self.content = None
        self.__create()

    def __create(self):
        """Render the datacard text into ``self.content``."""
        cw = self.column_width
        hw = self.header_width
        hline = "-"*(hw+5 + cw*(self.n_sig + self.n_bkg)*self.n_obs) + "\n"
        content = ""
        content += f"imax {self.n_obs} number of channels\n"
        content += f"jmax {self.n_bkg} number of backgrounds\n"
        content += f"kmax {len(self.systs)} number of nuisance parameters\n"
        content += hline
        content += f"{'bin':<{hw+5}}"
        content +=  "".join([f"{'bin'+str(i+1):>{cw}}" for i in range(self.n_obs)])
        content +=  "\n"
        content += f"{'observation':<{hw+5}}"
        content +=  "".join([f"{n:{cw}d}" for n in self.obs])
        content +=  "\n"
        content += hline
        content += f"{'bin':<{hw+5}}"
        content += "".join([f"{'bin'+str(i+1):>{cw}}" for i in range(self.n_obs) for _ in range(self.n_sig + self.n_bkg)])
        content +=  "\n"
        content += f"{'process':<{hw+5}}"
        content +=  "".join([f"{l:>{cw}}" for _ in range(self.n_obs) for l in self.sig_labels + self.bkg_labels])
        content +=  "\n"
        content += f"{'process':<{hw+5}}"
        content += "".join([f"{i:>{cw}}" for _ in range(self.n_obs) for i in range(self.n_sig + self.n_bkg)])
        content +=  "\n"
        content += f"{'rate':<{hw+5}}"
        # Same column order as the bin/process rows above: bin by bin, signals then
        # backgrounds, so rates stay aligned with their process when n_obs > 1
        content +=  "".join([
            f"{y:{cw}.2f}"
            for bin_i in range(self.n_obs)
            for y in self.sig_yields[bin_i] + self.bkg_yields[bin_i]
        ])
        content +=  "\n"
        content += hline
        for syst_i, syst_values in enumerate(self.systs):
            content += f"{self.syst_labels[syst_i]:<{hw}}"
            content +=  " lnN "
            content +=  "".join(syst_values)
            content += "\n"

        self.content = content

    def write(self, output_dat):
        """Write the rendered datacard text to ``output_dat``."""
        with open(output_dat, "w") as dat_out:
            dat_out.write(self.content)

In [38]:
# Numbers produced by other steps of the analysis (background-estimate errors, yields, ...)
with open("AN_numbers.json", "r") as f_in:
    AN_numbers = json.load(f_in)

# Signal nuisances: 1 + the first SR1 value of every row in the limit table.
# The quadrature sum of those values is accumulated alongside.
sig_datacard_systs = {}
squared_sum = 0
for syst_obj in SIG_SYSTS_LIMIT.systs:
    sr1_value = syst_obj.get_systs()["SR1"][0]
    sig_datacard_systs[syst_obj.name] = [1 + sr1_value]
    squared_sum += sr1_value**2

total_sig_syst = np.sqrt(squared_sum)

# Background nuisances are stored as percentages in AN_numbers.json
datacard_systs = {
    "TotalBkg": {
        "abcd_syst": [1 + AN_numbers["BkgEstTotalSystErr"]/100],
        "abcd_stat": [1 + AN_numbers["BkgEstStatErr"]/100]
    },
    "VBSWH_mkW": sig_datacard_systs
}

# Predicted background from the data counts in regions A, B and C: A/B*C
count_A = vbswh.data_count(selection="regionA")
count_B = vbswh.data_count(selection="regionB")
count_C = vbswh.data_count(selection="regionC")
pred_bkg = count_A/count_B*count_C

datacard = Datacard(
    [round(pred_bkg)], # dummy value for observed
    {"VBSWH_mkW": [vbswh.sig_count(selection="SR1")]},
    {"TotalBkg": [pred_bkg]},
    datacard_systs
)

print(datacard.content)
datacard.write("../combine/datacards/vbswh.dat")

imax 1 number of channels
jmax 1 number of backgrounds
kmax 24 number of nuisance parameters
--------------------------------------------------
bin                               bin1
observation                        120
--------------------------------------------------
bin                               bin1        bin1
process                      VBSWH_mkW    TotalBkg
process                              0           1
rate                            397.44      120.10
--------------------------------------------------
abcd_syst             lnN            -      1.1270
abcd_stat             lnN            -      1.1340
pdf_vars              lnN       1.0215           -
muF_scale             lnN       1.1771           -
isr_weights           lnN       1.0019           -
fsr_weights           lnN       1.0153           -
pu_rwgt               lnN       1.0012           -
puid_sf               lnN       1.0100           -
L1_prefire            lnN       1.0097           -
hlt_sfs      

In [39]:
# Total signal systematic per signal region: quadrature sum of the first value
# of every row in the limit table.
total_SR1_sig_syst = 0
total_SR2_sig_syst = 0
for syst_obj in SIG_SYSTS_LIMIT.systs:
    systs = syst_obj.get_systs()
    total_SR1_sig_syst += systs["SR1"][0]**2
    total_SR2_sig_syst += systs["SR2"][0]**2

total_SR1_sig_syst = np.sqrt(total_SR1_sig_syst)
total_SR2_sig_syst = np.sqrt(total_SR2_sig_syst)

# Compute the new entries BEFORE opening the file: mode "w" truncates it immediately,
# so an exception raised afterwards (e.g. a missing key) would leave it empty.
AN_numbers["ExpSigSystErr"] = round(total_SR1_sig_syst*AN_numbers["ExpSig"], 1)
AN_numbers["SRTwoExpSigSystErr"] = round(total_SR2_sig_syst*AN_numbers["SRTwoExpSig"], 1)

with open("AN_numbers.json", "w") as f_out:
    json.dump(AN_numbers, f_out)

In [40]:
# Emit one LaTeX \newcommand definition per entry of AN_numbers
for macro_name in AN_numbers:
    print(f"\\newcommand{{\\{macro_name}}}{{{AN_numbers[macro_name]}}}")

\newcommand{\PredBkg}{120}
\newcommand{\PredBkgStatErr}{16.1}
\newcommand{\PredBkgSystErr}{15.3}
\newcommand{\ExpSig}{397}
\newcommand{\ExpSigStatErr}{8.7}
\newcommand{\ExpSigSystErr}{77.9}
\newcommand{\ExpBkg}{116}
\newcommand{\BkgEstABMC}{0.71}
\newcommand{\BkgEstABMCErr}{3.1}
\newcommand{\BkgEstABData}{0.71}
\newcommand{\BkgEstABDataErr}{11.0}
\newcommand{\PredBkgMC}{129.4}
\newcommand{\BkgEstMethodSystErr}{11.2}
\newcommand{\BkgEstBkgCompSystErr}{6.0}
\newcommand{\BkgEstTotalSystErr}{12.7}
\newcommand{\BkgEstStatErr}{13.4}
\newcommand{\BkgEstWJetsUpABMC}{0.68}
\newcommand{\BkgEstWJetsUpABMCErr}{2.9}
\newcommand{\BkgEstWJetsDownABMC}{0.74}
\newcommand{\BkgEstWJetsDownABMCErr}{3.2}
\newcommand{\BkgEstWJetsCompSyst}{5.4}
\newcommand{\BkgEstBosonsUpABMC}{0.7}
\newcommand{\BkgEstBosonsUpABMCErr}{4.5}
\newcommand{\BkgEstBosonsDownABMC}{0.72}
\newcommand{\BkgEstBosonsDownABMCErr}{2.6}
\newcommand{\BkgEstBosonsCompSyst}{2.6}
\newcommand{\SRTwoPredBkg}{5}
\newcommand{\SRTwoPredBkgStatErr}{0

In [41]:
# Print the SR2 signal systematics as a LaTeX table
print(SIG_SYSTS_TABLE.to_latex(columns=["SR2"]))

\begin{table}[H]
\begin{center}
\begin{tabular}{lcc}
\hline
\hline
Systematic & SR2 \\
\hline
PDF variations & 2.2\% \\
$\mu_F$ scale & 21.1\% \\
Parton shower ISR weights & 0.3\% \\
Parton shower FSR weights & 0.8\% \\
Pileup reweighting & 0.5\% \\
Pileup jet ID & 1.0\% \\
L1 pre-fire corrections & 1.0\% \\
HLT scale factors & 0.8\% \\
Simulation stat. unc. & 4.2\% \\
Lepton scale factors & 0.0\% - 1.5\% \\
ParticleNet Xbb scale factors & 1.2\% - 2.4\% \\
DeepJet b-tagging scale factors & 0.3\% \\
MET unc. & 0.2\% \\
Jet energy scale & 8.0\% \\
Jet energy resolution & 0.5\% \\
Luminosity & 1.6\% \\
\Htobb BR & 1.3\% \\
\hline
\hline
\end{tabular}
\end{center}
\end{table}



In [42]:
# "Datacard" for SR2 (NOT USED)
# datacard_systs = {
#     "TotalBkg": {
#         "est_syst": [1.34],
#         "est_stat": [1.13]
#     }
# }

# datacard_systs["VBSWH_mkW"] = {}
# for syst_obj in SIG_SYSTS_LIMIT.systs:
#     systs = syst_obj.get_systs()
#     datacard_systs["VBSWH_mkW"][syst_obj.name] = [1 + systs["SR2"][0]]
    
# pred_bkg = vbswh.data_count(selection="regionA")/vbswh.data_count(selection="regionB")*vbswh.data_count(selection="regionC")
# datacard = Datacard(
#     [int(pred_bkg)],
#     {"VBSWH_mkW": [vbswh.sig_count(selection="SR2")]},
#     {"TotalBkg": [pred_bkg]},
#     datacard_systs
# )
    
# print(datacard.content)