In [1]:
import argparse
import glob
import json
import logging
import os
import pickle as pkl
import warnings

import hist as hist2
import numpy as np
import pandas as pd
import pyarrow
import yaml
#from systematics import get_systematic_dict, sigs
from systematicsPass import get_systematic_dict, sigs

from utils import get_common_sample_name, get_finetuned_score, get_xsecweight

logging.basicConfig(level=logging.INFO)

warnings.filterwarnings("ignore", message="Found duplicate branch ")
pd.set_option("mode.chained_assignment", None)

CATEGORY = 'pass'



def get_templates(years, channels, samples, samples_dir, regions_sel, model_path, add_fake=False):

    print('regions_sel', regions_sel)
    # add extra selections to preselection
    presel = { "mu": { "fj_mass": "fj_mass < 180",}, "ele": {"fj_mass": "fj_mass <180",}, }


    region = 'TopCR'
    mass_binning = [40,70,120,180]

    
    hists = hist2.Hist(
        hist2.axis.StrCategory([], name="Sample", growth=True), hist2.axis.StrCategory([], name="Systematic", growth=True), hist2.axis.StrCategory([], name="Region", growth=True),
        hist2.axis.Variable(
            #list(range(55, 255, mass_binning)),
            mass_binning, name="mass_observable", label=r"V reconstructed mass [GeV]", overflow=True,
        ), storage=hist2.storage.Weight(),
    )

    for variation in ["fakes_nominal", "fakes_SF_Up", "fakes_SF_Down", "fakes_DR_Up", "fakes_DR_Down"]:
   
        for year in years:
            data = pd.read_parquet(f"/uscms/home/jieun201/nobackup/YOURWORKINGAREA/Fake_{year}/outfiles/{variation}_ele.parquet")
  
            for selection in presel["ele"]:
                logging.info(f"Applying {selection} selection on {len(data)} events")
                data = data.query(presel["ele"][selection])
                #data["THWW"] = get_finetuned_score(data, model_path) #not using HWW score for top CR
     
            df = data.copy()
       
            df = df[ (df['numberBJets_Medium_OutsideFatJets'] > 0) & (df['ReconVCandidateFatJetVScore'] > 0.9)
            & (df['met_pt'] > 30) & (df['fj_pt'] > 250) & (df['h_fj_pt'] > 250) ]
                    
            
            logging.info(f"Will fill the histograms with the remaining {len(data)} events")

            if variation == "fakes_nominal":
                hists.fill( Sample="Fake", Systematic="pass_nominal", Region=region, mass_observable=df["fj_mass"], weight=df["event_weight"],  )
            else:
                print('variation', variation)
                hists.fill( Sample="Fake", Systematic="pass_" + variation, Region=region, mass_observable=df["fj_mass"], weight=df["event_weight"],  )


    
    
    return hists


def fix_neg_yields(h):
    """
    Will set the bin yields of a process to 0 if the nominal yield is negative, and will
    set the yield to 0 for the full Systematic axis.
    """
    for region in h.axes["Region"]:
        for sample in h.axes["Sample"]:
            neg_bins = np.where(h[{"Sample": sample, "Systematic": "pass_nominal", "Region": region}].values() < 0)[0]

            if len(neg_bins) > 0:
                print('got neg bins')
                print(f"{region}, {sample}, has {len(neg_bins)} bins with negative yield.. will set them to 0")

                sample_index = np.argmax(np.array(h.axes["Sample"]) == sample)
                region_index = np.argmax(np.array(h.axes["Region"]) == region)

                for neg_bin in neg_bins:
                    h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].value = 1e-3
                    h.view(flow=True)[sample_index, :, region_index, neg_bin + 1].variance = 1e-3



In [2]:
#years = ['2017','2018']
years = ['2017']

#years = ['2016', '2016APV','2017', '2018']
#years = ['2017', '2018']

outdir = 'templates'

channels = 'mu','ele'
with open("simplePass_Fake_TopCR.yaml", "r") as stream:
    config = yaml.safe_load(stream)

if len(years) == 4:
    save_as = "Run2"
else:
    save_as = "_".join(years)

if len(channels) == 1:
    save_as += f"_{channels[0]}_"

os.system(f"mkdir -p {outdir}")


#def get_templates(years, channels, samples, samples_dir, regions_sel, model_path, add_fake=False):
hists = get_templates( years, channels, 'Fake', config["samples_dir"], config["regions_sel"], config["model_path"],)

fix_neg_yields(hists)

with open(f"{outdir}/hists_templates_{save_as}_fake_pass_TopCR.pkl", "wb") as fp:
    print('hists', hists)
    pkl.dump(hists, fp)




regions_sel {'TopCR': '(numberBJets_Medium_OutsideFatJets > 0) & (met_pt > 30) & (fj_pt > 250) & (h_fj_pt > 250)  & (ReconVCandidateFatJetVScore > 0.9)'}


INFO:root:Applying fj_mass selection on 147477 events
INFO:root:Will fill the histograms with the remaining 140917 events
INFO:root:Applying fj_mass selection on 147477 events
INFO:root:Will fill the histograms with the remaining 140917 events
INFO:root:Applying fj_mass selection on 147477 events
INFO:root:Will fill the histograms with the remaining 140917 events


variation fakes_SF_Up
variation fakes_SF_Down


INFO:root:Applying fj_mass selection on 147477 events
INFO:root:Will fill the histograms with the remaining 140917 events
INFO:root:Applying fj_mass selection on 147477 events
INFO:root:Will fill the histograms with the remaining 140917 events


variation fakes_DR_Up
variation fakes_DR_Down
got neg bins
TopCR, Fake, has 3 bins with negative yield.. will set them to 0
hists Hist(
  StrCategory(['Fake'], growth=True, name='Sample'),
  StrCategory(['pass_nominal', 'pass_fakes_SF_Up', 'pass_fakes_SF_Down', 'pass_fakes_DR_Up', 'pass_fakes_DR_Down'], growth=True, name='Systematic'),
  StrCategory(['TopCR'], growth=True, name='Region'),
  Variable([40, 70, 120, 180], name='mass_observable', label='V reconstructed mass [GeV]'),
  storage=Weight()) # Sum: WeightedSum(value=0.015, variance=0.015)


In [3]:
obj = pd.read_pickle(r'/home/jieun201/boostedhiggs_may27/combine/templates/hists_templates_2017_fake_pass_TopCR.pkl')
obj

Hist(
  StrCategory(['Fake'], growth=True, name='Sample'),
  StrCategory(['pass_nominal', 'pass_fakes_SF_Up', 'pass_fakes_SF_Down', 'pass_fakes_DR_Up', 'pass_fakes_DR_Down'], growth=True, name='Systematic'),
  StrCategory(['TopCR'], growth=True, name='Region'),
  Variable([40, 70, 120, 180], name='mass_observable', label='V reconstructed mass [GeV]'),
  storage=Weight()) # Sum: WeightedSum(value=0.015, variance=0.015)

In [4]:
fakePass_3 = obj[{"Region": "TopCR", "Sample": "Fake", "Systematic": "pass_nominal"}]
fakePass_3