Jet matching notebook for JMENANO (CHS/PUPPI) with a basic Z selection applied in the processor. Plots distributions from MC (or data) using the output of the processor.

In [1]:
# Probably not needed, but harmless to run if coffea is missing. Configuration used: "bleeding edge" with 16 GB.
#!pip install --user coffea

In [2]:
import bokeh
import time
import copy
import scipy.stats as ss
from scipy.optimize import curve_fit
from coffea import hist, processor, nanoevents, util
from coffea.nanoevents.methods import candidate
from coffea.nanoevents import NanoAODSchema, BaseSchema

import awkward as ak
import numpy as np
import glob as glob
import itertools
import pandas as pd
from numpy.random import RandomState

#from dask.distributed import Client
import inspect
import matplotlib.pyplot as plt

#from lpcjobqueue import LPCCondorCluster

In [3]:
class JMENanoAODSchema(NanoAODSchema):
    """Schema builder for JMENano inputs.

    JMENano is an extended NanoAOD format that includes various jet collections
    down to low pt for JME studies. This class extends the ``mixins`` and
    ``all_cross_references`` tables of ``NanoAODSchema`` with entries for the
    additional jet collections.

    More info at https://twiki.cern.ch/twiki/bin/viewauth/CMS/JMECustomNanoAOD
    Customization at https://github.com/nurfikri89/cmssw/blob/master/PhysicsTools/NanoAOD/python/custom_jme_cff.py
    """

    # Base-class mixins plus the extra jet collections, each mapped to "Jet".
    mixins = dict(
        NanoAODSchema.mixins,
        JetCalo="Jet",
        JetPuppi="Jet",
        FatJetForJEC="Jet",
        FatJetCHS="Jet",
    )
    # Base-class cross-references plus the genJetIdx branches of the extra
    # collections, each mapped to the name of a gen-jet collection.
    all_cross_references = dict(
        NanoAODSchema.all_cross_references,
        FatJetForJEC_genJetIdx="GenJetAK8ForJEC",
        FatJetCHS_genJetIdx="GenJetAK8ForJEC",
        JetCalo_genJetIdx="GenJet",
        JetPuppi_genJetIdx="GenJet",
    )


### Import processor

In [4]:
from CoffeaJERCProcessor_PUPPI_ZJet import Processor

In [56]:
# Directory prefix prepended to every entry of the dataset lists below.
# Two locations are listed; the later assignment is the one in effect.
xrootdstr = '/eos/cms/store/group/phys_jetmet/kirschen/JMENANO_EarlyDataTest/'
xrootdstr = '/scratch/singularity_scratch_coffea/JMENANO_LOCAL/'
#adapt this to eos...phys_jetmet

# MC list (DY): whitespace-separated file names. The file is read inside a
# context manager so the handle is closed as soon as the list is built.
with open('dataset_local_DY.txt') as dataset_file:
    rootfiles = dataset_file.read().split()

fileslist = [xrootdstr + file for file in rootfiles]

# Data list (DoubleMuon), read the same way.
with open('dataset_local_DoubleMuon.txt') as dataset_file:
    rootfiles = dataset_file.read().split()
fileslist_Data = [xrootdstr + file for file in rootfiles]
#for now only process either MC or Data. Should improve the bookkeeping, but will start with separate plots
#Following line commented out: Process MC, uncommented: process Data ()
fileslist=fileslist_Data



In [57]:
# Display the list of input files selected in the previous cell.
fileslist

['/scratch/singularity_scratch_coffea/JMENANO_LOCAL/DoubleMuon_Run2018D-UL2018_MiniAODv2_JMENanoAODv9-v1_NANOAOD/3ABDB242-A800-8842-8912-375A64AE9FA2.root',
 '/scratch/singularity_scratch_coffea/JMENANO_LOCAL/DoubleMuon_Run2018D-UL2018_MiniAODv2_JMENanoAODv9-v1_NANOAOD/8135DB63-F48E-3449-8ECC-16215A994FE9.root']

In [58]:
# Limit the run to at most the first 20 files. The list shown above holds
# only two entries, so nothing is dropped at the moment.
fileslist = fileslist[:20]
fileslist

['/scratch/singularity_scratch_coffea/JMENANO_LOCAL/DoubleMuon_Run2018D-UL2018_MiniAODv2_JMENanoAODv9-v1_NANOAOD/3ABDB242-A800-8842-8912-375A64AE9FA2.root',
 '/scratch/singularity_scratch_coffea/JMENANO_LOCAL/DoubleMuon_Run2018D-UL2018_MiniAODv2_JMENanoAODv9-v1_NANOAOD/8135DB63-F48E-3449-8ECC-16215A994FE9.root']

In [59]:
# Run the processor over every fileset and keep each result by dataset name.
tstart = time.time()

outputs_unweighted = {}

seed = 1234577890
prng = RandomState(seed)
Chunk = [10000, 10] # [chunksize, maxchunks]

# NOTE: the label 'QCD' is the dataset key that dumpHistos looks up in the
# histograms; it is used here regardless of what fileslist currently holds.
filesets = {'QCD': fileslist,
           #'Data': fileslist_Data
           }

chosen_exec = 'futures'
for name,files in filesets.items():
    output = processor.run_uproot_job({name:files},
                                      treename='Events',
                                      processor_instance=Processor(),
                                      #executor=processor.iterative_executor,
                                      executor=processor.futures_executor,
                                      executor_args={
                                          'skipbadfiles':False,
                                          'schema': JMENanoAODSchema, #NanoAODSchema, #BaseSchema
                                          'workers': 48},
                                      chunksize=Chunk[0])#, maxchunks=Chunk[1])

    # Bookkeeping belongs inside the loop so that every dataset is recorded,
    # not only the last one, once more than one fileset is enabled.
    outputs_unweighted[name] = output
    print(output)
    print(name + ' unweighted output loaded')

# Only the output of the last processed dataset is written to disk; the load
# cell further down reads this single accumulator back.
#util.save(output, 'CoffeaJERCOutputs_binned_DY_WithoutDZCut.coffea')
#util.save(output, 'CoffeaJERCOutputs_binned_DY_DZCut.coffea')
util.save(output, 'CoffeaJERCOutputs_ZJet_Selection_binned_something.coffea')

# Wall-clock time of processing plus saving.
elapsed = time.time() - tstart

<coffea.lookup_tools.evaluator.evaluator object at 0x7fe1d7b03ca0>
dict_keys(['Summer20UL18_V2_MC_L2Relative_AK4PFPuppi'])
['Summer20UL18_V2_MC_L2Relative_AK4PFPuppi']



Output()

{'CHSPUPPIptresponse': <Hist (dataset,pt,jeteta,ptresponse) instance at 0x7fe053d84c40>, 'CHSPUPPIcorrected_ptresponse': <Hist (dataset,pt,jeteta,ptresponse) instance at 0x7fe206c75e80>, 'jetpt': <Hist (dataset,pt) instance at 0x7fe206c75670>, 'jeteta': <Hist (dataset,jeteta) instance at 0x7fe204dfa070>, 'jetphi': <Hist (dataset,jetphi) instance at 0x7fe053b88be0>, 'cutflow': defaultdict_accumulator(<class 'int'>, {})}
QCD unweighted output loaded


### Load coffea output file

In [60]:
# Read a previously saved processor output back from disk. The commented
# lines are alternative input files.
#output = util.load('CoffeaJERCOutputs_binned_DY_WithoutDZCut.coffea')
#output = util.load('CoffeaJERCOutputs_binned_DY_DZCut.coffea')
output = util.load('CoffeaJERCOutputs_ZJet_Selection_binned_something.coffea')
print(output)
# Iterating over the loaded object yields its keys.
print(list(output))

{'CHSPUPPIptresponse': <Hist (dataset,pt,jeteta,ptresponse) instance at 0x7fe1d7e9e1c0>, 'CHSPUPPIcorrected_ptresponse': <Hist (dataset,pt,jeteta,ptresponse) instance at 0x7fe2056a8700>, 'jetpt': <Hist (dataset,pt) instance at 0x7fe1d7e9e640>, 'jeteta': <Hist (dataset,jeteta) instance at 0x7fe1d7e9ee50>, 'jetphi': <Hist (dataset,jetphi) instance at 0x7fe0583af1c0>, 'cutflow': defaultdict_accumulator(<class 'int'>, {})}
['CHSPUPPIptresponse', 'CHSPUPPIcorrected_ptresponse', 'jetpt', 'jeteta', 'jetphi', 'cutflow']


In [61]:
# Gaussian model used for the fits
def gauss(x, *p):
    """Evaluate an unnormalised Gaussian at ``x``.

    ``p`` must hold exactly three values: the amplitude, the mean and the
    width (sigma). Unpacking fails with ``ValueError`` for any other count.
    Returns ``A * exp(-(x - mu)**2 / (2 * sigma**2))``.
    """
    amplitude, center, spread = p
    exponent = -(x - center) ** 2 / (2. * spread ** 2)
    return amplitude * np.exp(exponent)

In [62]:
# Jet pT bin edges: unit steps from 1 to 15, then progressively wider bins.
ptbins = [
    *range(1, 16),
    17, 20, 23, 27, 30, 35, 40, 45, 57, 72, 90, 120,
    150, 200, 300, 400, 550, 750, 1000, 1500,
    2000, 2500, 3000, 3500, 4000, 4500, 5000, 10000,
]





In [63]:
# Jet eta bin edges. The list is symmetric about zero, so only the positive
# half is spelled out and the negative half is obtained by mirroring it.
_positive_eta_edges = [
    0.087, 0.174, 0.261, 0.348, 0.435, 0.522, 0.609, 0.696, 0.783, 0.879,
    0.957, 1.044, 1.131, 1.218, 1.305, 1.392, 1.479, 1.566, 1.653, 1.74,
    1.83, 1.93, 2.043, 2.172, 2.322, 2.5, 2.65, 2.853, 2.964, 3.139,
    3.314, 3.489, 3.664, 3.839, 4.013, 4.191, 4.363, 4.538, 4.716, 4.889,
]  # an outer edge of 5.191 is deliberately left out
etabins = (
    [-edge for edge in reversed(_positive_eta_edges)]
    + [0]
    + _positive_eta_edges
)

In [64]:
# Allocate the per-(pt bin, eta bin) summary arrays that dumpHistos fills.

# Rows: entries of the 'pt' axis of the jetpt histogram, with the first and
# the last entry sliced off.
jetpt_length = len(output['jetpt'].axis('pt')[1:-1])
# Columns: one per eta bin *edge*. An earlier assignment taken from the
# histogram's 'jeteta' axis was immediately overwritten by this value, so
# that dead store has been dropped. dumpHistos only fills columns
# k < len(etabins) - 1, which leaves the last column at zero.
jeteta_length = len(etabins)

summary_shape = (jetpt_length, jeteta_length)
mean = np.zeros(summary_shape)
median = np.zeros(summary_shape)
width = np.zeros(summary_shape)
# pt-bin indices for which a fit succeeded (appended to by dumpHistos).
idx = []

In [65]:
# Quick look at the pT binning: all edges, how many there are, and a slice.
print(ptbins, len(ptbins), sep="\n")
bins = [10,15,20,30]
#my_slices = tuple(slice(x) for x in bins)
print(ptbins[10:20])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 20, 23, 27, 30, 35, 40, 45, 57, 72, 90, 120, 150, 200, 300, 400, 550, 750, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 10000]
43
[11, 12, 13, 14, 15, 17, 20, 23, 27, 30]


In [66]:
import coffea, uproot3, numpy
def dumpHistos(histoname, dataset='QCD'):
    """Export, fit and plot the per-bin slices of one histogram in ``output``.

    For every (pt bin, eta bin) pair built from the module-level ``ptbins``
    and ``etabins`` edges, ``output[histoname]`` is integrated over the eta
    range and, when ``histoname`` contains ``'ptresponse'``, over the pt range
    as well. Each resulting slice is then:

    * written to ``plots/<histoname>_export.root`` (for ``"GenJetCounts"``
      only during the first pt bin);
    * fitted with ``gauss``. The fitted mean and width and the median of the
      binned entries are stored at ``[i, k]`` in the module-level arrays
      ``mean``, ``width`` and ``median``, and ``i`` is appended to ``idx``;
    * plotted and saved as PDF and PNG under ``plots/`` when the lower eta
      edge of the bin equals 0.0 or 2.853.

    Slices whose fit raises ``RuntimeError`` or ``ValueError`` are skipped;
    the number of skipped slices is printed once at the end.

    Parameters
    ----------
    histoname : str
        Key of the histogram in ``output``.
    dataset : str, optional
        Dataset label whose bin contents are fitted. Defaults to ``'QCD'``,
        the label under which the files were processed.

    Notes
    -----
    The export file is opened with ``uproot3.create``.
    NOTE(review): this presumably refuses to overwrite an existing file, which
    would explain the ``rm plots/*`` before the calls -- confirm.
    """
    import os  # local import: keeps the function independent of the import cell

    is_response = 'ptresponse' in histoname
    value_axis = 'ptresponse' if is_response else 'pt'
    xvals = output[histoname].axis(value_axis).centers()

    # The export file and all plots go to plots/; make sure it exists.
    os.makedirs("plots", exist_ok=True)
    fout = uproot3.create("plots/{}_export.root".format(histoname))

    failed_fits = 0
    try:
        for i in range(len(ptbins)-1):

            ptBin = hist.Interval(ptbins[i], ptbins[i+1])
            print('pt bin '+str(ptBin))

            if not 'inf' in str(ptBin):
                # Zero-padded integer edges so the keys sort by pt.
                pt_string = '_pT_{:0>6}_to_{:0>6}'.format(int(ptBin.lo),int(ptBin.hi))
            else:
                pt_string = '_pT'+str(ptBin.lo) + 'to' + str(ptBin.hi)
                pt_string = pt_string.replace('.0','').replace('-infto','0to')

            for k in range(len(etabins)-1):

                etaBin = hist.Interval(etabins[k], etabins[k+1])
                # Dots are replaced because the string is used in object and file names.
                eta_string = '_eta_{:0>6.3f}_to_{:0>6.3f}'.format(etaBin.lo,etaBin.hi)
                eta_string = eta_string.replace('.','_')

                histo = output[histoname].integrate('jeteta', etaBin)
                if is_response:
                    histo = histo.integrate('pt', ptBin)

                if i==0 or histoname!="GenJetCounts":
                    export_key = "{}_{}_{}".format(histoname,pt_string if is_response else "",eta_string)
                    fout[export_key] = coffea.hist.export1d(histo.integrate('dataset'))

                yvals = histo.values()[(dataset,)]
                # Expand the binned counts into one bin-centre value per entry
                # so that a median can be computed from them.
                histvals = np.repeat(histo.axis(value_axis).centers(), np.array(yvals,dtype='int'))

                try:
                    p, arr = curve_fit(gauss, xvals, yvals, p0=[10,1,1])
                except (RuntimeError, ValueError):
                    # Fit did not converge or the inputs were unusable: skip
                    # this slice, but let KeyboardInterrupt and genuine
                    # programming errors propagate.
                    failed_fits += 1
                    continue

                bin_median = np.median(histvals)
                median[i,k] = bin_median
                mean[i,k] = p[1]
                width[i,k] = p[2]
                idx.append(i)

                # Only two reference eta bins are plotted.
                if etabins[k]==2.853 or etabins[k]==0.0:
                    h = np.max(yvals)
                    ax = hist.plot1d(histo, overlay='dataset')
                    ax.set_title("{}_{}_{}".format(histoname,pt_string,eta_string))
                    plt.text(4,0.75*h,'Mean {0:0.2f}'.format(p[1]))
                    plt.text(4,0.7*h,'Median {0:0.2f}'.format(bin_median))
                    plt.text(4,0.65*h,'Width {0:0.2f}'.format(p[2]))

                    plt.xscale("linear" if is_response else "log")
                    plt.savefig("plots/{}_{}_{}.pdf".format(histoname,pt_string,eta_string))
                    plt.savefig("plots/{}_{}_{}.png".format(histoname,pt_string,eta_string))
                    plt.show()
                    # Release the figure so that figures do not accumulate
                    # over the many iterations of the loop.
                    plt.close(ax.figure)
    finally:
        # Close the ROOT file even when an iteration raised.
        fout.close()

    if failed_fits:
        print('{}: skipped {} bins whose Gaussian fit failed'.format(histoname, failed_fits))


In [None]:
# Delete everything currently in plots/ before the outputs are regenerated.
# NOTE(review): presumably needed because dumpHistos opens its export file
# with uproot3.create, which may refuse to overwrite an existing file -- confirm.
!rm plots/*
#histos= ['GenJetCountsWithDZCut','GenJetCountsMatchedPUPPI','GenJetCountsMatchedCHS','GenJetCounts','ptresponse', 'corrected_ptresponse', 'CHSptresponse', 'CHScorrected_ptresponse', 'CHSPUPPIptresponse', 'CHSPUPPIcorrected_ptresponse']
# Histograms to run through dumpHistos; the commented lists are alternative selections.
histos= ['CHSPUPPIptresponse', 'CHSPUPPIcorrected_ptresponse']
#histos= ['ptresponse', 'corrected_ptresponse', 'CHSptresponse', 'CHScorrected_ptresponse', 'CHSPUPPIptresponse', 'CHSPUPPIcorrected_ptresponse']
for histo in histos:
    dumpHistos(histo)
