In [1]:
import argparse
import glob
import json
import logging
import os
import pickle as pkl
import warnings

import hist as hist2
import numpy as np
import pandas as pd
import pyarrow
import yaml
#from systematics import get_systematic_dict, sigs
from systematicsPass import get_systematic_dict, sigs

from utils import get_common_sample_name, get_finetuned_score, get_xsecweight

logging.basicConfig(level=logging.INFO)

warnings.filterwarnings("ignore", message="Found duplicate branch ")
pd.set_option("mode.chained_assignment", None)

CATEGORY = 'pass'



def get_templates(years, channels, samples, samples_dir, regions_sel, model_path, add_fake=False):

    print('regions_sel', regions_sel)
    # add extra selections to preselection
    presel = { "mu": { "fj_mass": "fj_mass < 180",}, "ele": {"fj_mass": "fj_mass <180",}, }


    region = 'SR1'

    #mass_binning = [40,70,115,140,180]
    mass_binning =  [40,70,110,140,180]



    
    hists = hist2.Hist(
        hist2.axis.StrCategory([], name="Sample", growth=True), hist2.axis.StrCategory([], name="Systematic", growth=True), hist2.axis.StrCategory([], name="Region", growth=True),
        hist2.axis.Variable(
            #list(range(55, 255, mass_binning)),
            mass_binning, name="mass_observable", label=r"V reconstructed mass [GeV]", overflow=True,
        ), storage=hist2.storage.Weight(),
    )

    for variation in ["fakes_nominal", "fakes_SF_Up", "fakes_SF_Down", "fakes_DR_Up", "fakes_DR_Down"]:
   
        for year in years:
            #data = pd.read_parquet(f"{samples_dir[year]}/fake_{year}_ele.parquet")
            #data = pd.read_parquet(f"/uscms/home/jieun201/nobackup/YOURWORKINGAREA/Fake_{year}/outfiles/0-1_ele.parquet")/sept8_parquets/2016APV/Fake/outfiles
            data = pd.read_parquet(f"/uscms/home/jieun201/nobackup/YOURWORKINGAREA/dec_12/2016APV/Fake/outfiles/{variation}_ele.parquet")
            print('data', data)
            for selection in presel["ele"]:
                logging.info(f"Applying {selection} selection on {len(data)} events")
                data = data.query(presel["ele"][selection])
                data["THWW"] = get_finetuned_score(data, model_path)
                #print('data[thww]', data["THWW"])

            #rint('hists.axes["Region"]', hists.axes["Region"])
            #for region in hists.axes["Region"]:
             #   print('region', hists.axes["Region"])
            df = data.copy()
            #logging.info(f"Applying {region} selection on {len(data)} events")
           # regions_sel = (THWW > 0.9) & (met_pt > 30) & (fj_pt > 250) & (h_fj_pt > 250)
            df = df[ (df['numberBJets_Medium_OutsideFatJets'] == 0) & (df['ReconVCandidateFatJetVScore'] > 0.9)
            & (df['met_pt'] > 30) & (df['fj_pt'] > 250) & (df['h_fj_pt'] > 250) & (df['THWW'] > 0.905)]
           
            
#can do a simple region sel here and elimin config file i guess

            
            logging.info(f"Will fill the histograms with the remaining {len(data)} events")

            if variation == "fakes_nominal":
                hists.fill( Sample="Fake", Systematic="pass_nominal", Region=region, mass_observable=df["fj_mass"], weight=df["event_weight"],  )
            else:
                print('variation', variation)
                hists.fill( Sample="Fake", Systematic="pass_" + variation, Region=region, mass_observable=df["fj_mass"], weight=df["event_weight"],  )


    
    
    return hists


def fix_neg_yields(h):
    """
    Will set the bin yields of a process to 0 if the nominal yield is negative, and will
    set the yield to 0 for the full Systematic axis.
    """
    for region in h.axes["Region"]:
        for sample in h.axes["Sample"]:
            neg_bins = np.where(h[{"Sample": sample, "Systematic": "pass_nominal", "Region": region}].values() < 0)[0]

            if len(neg_bins) > 0:
                print('got neg bins')
                print(f"{region}, {sample}, has {len(neg_bins)} bins with negative yield.. will set them to 0")

                sample_index = np.argmax(np.array(h.axes["Sample"]) == sample)
                region_index = np.argmax(np.array(h.axes["Region"]) == region)

                for neg_bin in neg_bins:
                    h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].value = 1e-3
                    h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].variance = 1e-3


#    for variation in ["fakes_nominal", "fakes_SF_Up", "fakes_SF_Down", "fakes_DR_Up", "fakes_DR_Down"]:
def fix_neg_yieldsFakes(h):
    for region in h.axes["Region"]:
        neg_bins = np.where(h[{"Sample": "Fake", "Systematic": "pass_fakes_DR_Down", "Region": region}].values() < 0)[0]
        if len(neg_bins) > 0:
            print('got neg bins')
            sample_index = np.argmax(np.array(h.axes["Sample"]) == 'Fake')
            region_index = np.argmax(np.array(h.axes["Region"]) == region)
            for neg_bin in neg_bins:
                h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].value = 1e-3
                h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].variance = 1e-3
                
def fix_neg_yieldsFakes2(h):
    for region in h.axes["Region"]:
        neg_bins = np.where(h[{"Sample": "Fake", "Systematic": "pass_fakes_SF_Down", "Region": region}].values() < 0)[0]
        if len(neg_bins) > 0:
            print('got neg bins')
            sample_index = np.argmax(np.array(h.axes["Sample"]) == 'Fake')
            region_index = np.argmax(np.array(h.axes["Region"]) == region)
            for neg_bin in neg_bins:
                h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].value = 1e-3
                h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].variance = 1e-3

In [2]:
#years = ['2017','2018']
years = ['2016APV']

#years = ['2016', '2016APV','2017', '2018']
#years = ['2017', '2018']

outdir = 'templates'

channels = 'mu','ele'
with open("simplePass_Fake.yaml", "r") as stream:
    config = yaml.safe_load(stream)

if len(years) == 4:
    save_as = "Run2"
else:
    save_as = "_".join(years)

if len(channels) == 1:
    save_as += f"_{channels[0]}_"

os.system(f"mkdir -p {outdir}")


#def get_templates(years, channels, samples, samples_dir, regions_sel, model_path, add_fake=False):
hists = get_templates( years, channels, 'Fake', config["samples_dir"], config["regions_sel"], config["model_path"],)

fix_neg_yields(hists)
fix_neg_yieldsFakes(hists)
fix_neg_yieldsFakes2(hists)
with open(f"{outdir}/hists_templates_{save_as}_fake_pass.pkl", "wb") as fp:
    print('hists', hists)
    pkl.dump(hists, fp)




INFO:root:Applying fj_mass selection on 85711 events


regions_sel {'SR1': '(numberBJets_Medium_OutsideFatJets == 0) & (ReconVCandidateFatJetVScore > 0.9) & (THWW > 0.905) & (met_pt > 30) & (fj_pt > 250) & (h_fj_pt > 250)'}
data        n_good_electrons  n_good_muons      lep_pt   lep_eta  lep_isolation  \
0                     1             0  193.528061 -0.717529       0.006599   
1                     1             0  143.862259  1.219727       0.618413   
3                     1             0   72.595901  1.302490       0.057834   
4                     1             0  185.531387 -0.494568       0.351502   
5                     1             0  193.405975  0.069321       0.010475   
...                 ...           ...         ...       ...            ...   
13633                 1             0  560.699646  0.672974       0.126621   
13634                 1             0  199.440948  1.960449       0.188273   
13635                 1             0  268.302734  2.180176       0.003672   
13640                 1             0  350.465

INFO:root:Will fill the histograms with the remaining 82625 events
INFO:root:Applying fj_mass selection on 85711 events


data        n_good_electrons  n_good_muons      lep_pt   lep_eta  lep_isolation  \
0                     1             0  193.528061 -0.717529       0.006599   
1                     1             0  143.862259  1.219727       0.618413   
3                     1             0   72.595901  1.302490       0.057834   
4                     1             0  185.531387 -0.494568       0.351502   
5                     1             0  193.405975  0.069321       0.010475   
...                 ...           ...         ...       ...            ...   
13633                 1             0  560.699646  0.672974       0.126621   
13634                 1             0  199.440948  1.960449       0.188273   
13635                 1             0  268.302734  2.180176       0.003672   
13640                 1             0  350.465179  1.832520       0.415715   
13641                 1             0  150.912201 -2.474121       0.690645   

       lep_misolation  lep_fj_dr  lep_met_mt  met_fj_dphi 

INFO:root:Will fill the histograms with the remaining 82625 events
INFO:root:Applying fj_mass selection on 85711 events


variation fakes_SF_Up
data        n_good_electrons  n_good_muons      lep_pt   lep_eta  lep_isolation  \
0                     1             0  193.528061 -0.717529       0.006599   
1                     1             0  143.862259  1.219727       0.618413   
3                     1             0   72.595901  1.302490       0.057834   
4                     1             0  185.531387 -0.494568       0.351502   
5                     1             0  193.405975  0.069321       0.010475   
...                 ...           ...         ...       ...            ...   
13633                 1             0  560.699646  0.672974       0.126621   
13634                 1             0  199.440948  1.960449       0.188273   
13635                 1             0  268.302734  2.180176       0.003672   
13640                 1             0  350.465179  1.832520       0.415715   
13641                 1             0  150.912201 -2.474121       0.690645   

       lep_misolation  lep_fj_dr  le

INFO:root:Will fill the histograms with the remaining 82625 events
INFO:root:Applying fj_mass selection on 85711 events


variation fakes_SF_Down
data        n_good_electrons  n_good_muons      lep_pt   lep_eta  lep_isolation  \
0                     1             0  193.528061 -0.717529       0.006599   
1                     1             0  143.862259  1.219727       0.618413   
3                     1             0   72.595901  1.302490       0.057834   
4                     1             0  185.531387 -0.494568       0.351502   
5                     1             0  193.405975  0.069321       0.010475   
...                 ...           ...         ...       ...            ...   
13633                 1             0  560.699646  0.672974       0.126621   
13634                 1             0  199.440948  1.960449       0.188273   
13635                 1             0  268.302734  2.180176       0.003672   
13640                 1             0  350.465179  1.832520       0.415715   
13641                 1             0  150.912201 -2.474121       0.690645   

       lep_misolation  lep_fj_dr  

INFO:root:Will fill the histograms with the remaining 82625 events
INFO:root:Applying fj_mass selection on 85711 events


variation fakes_DR_Up
data        n_good_electrons  n_good_muons      lep_pt   lep_eta  lep_isolation  \
0                     1             0  193.528061 -0.717529       0.006599   
1                     1             0  143.862259  1.219727       0.618413   
3                     1             0   72.595901  1.302490       0.057834   
4                     1             0  185.531387 -0.494568       0.351502   
5                     1             0  193.405975  0.069321       0.010475   
...                 ...           ...         ...       ...            ...   
13633                 1             0  560.699646  0.672974       0.126621   
13634                 1             0  199.440948  1.960449       0.188273   
13635                 1             0  268.302734  2.180176       0.003672   
13640                 1             0  350.465179  1.832520       0.415715   
13641                 1             0  150.912201 -2.474121       0.690645   

       lep_misolation  lep_fj_dr  le

INFO:root:Will fill the histograms with the remaining 82625 events


variation fakes_DR_Down
got neg bins
SR1, Fake, has 4 bins with negative yield.. will set them to 0
hists Hist(
  StrCategory(['Fake'], growth=True, name='Sample'),
  StrCategory(['pass_nominal', 'pass_fakes_SF_Up', 'pass_fakes_SF_Down', 'pass_fakes_DR_Up', 'pass_fakes_DR_Down'], growth=True, name='Systematic'),
  StrCategory(['SR1'], growth=True, name='Region'),
  Variable([40, 70, 110, 140, 180], name='mass_observable', label='V reconstructed mass [GeV]'),
  storage=Weight()) # Sum: WeightedSum(value=0.02, variance=0.02)


In [3]:
fakePass1 = pd.read_pickle(r'/home/jieun201/boostedhiggs_may27/combine_nov26/2016APV/templates/hists_templates_2016APV_fake_pass.pkl')
fakePass_2 = fakePass1[{"Region": "SR1", "Sample": "Fake", "Systematic": "pass_nominal"}]
fakePass_2

In [4]:
fakePass_3 = fakePass1[{"Region": "SR1", "Sample": "Fake", "Systematic": "pass_fakes_SF_Up"}]
fakePass_3


In [5]:
fakePass_4 = fakePass1[{"Region": "SR1", "Sample": "Fake", "Systematic": "pass_fakes_SF_Down"}]
fakePass_4

In [6]:
fakePass_5 = fakePass1[{"Region": "SR1", "Sample": "Fake", "Systematic": "pass_fakes_DR_Up"}]
fakePass_5

In [7]:
fakePass_6 = fakePass1[{"Region": "SR1", "Sample": "Fake", "Systematic": "pass_fakes_DR_Down"}]
fakePass_6