In [1]:
import os, subprocess
import json
import uproot3
import awkward as ak
import numpy as np
from coffea import processor, util, hist
import pickle
import pandas as pd

#Plot settings
%matplotlib inline
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.style.CMS)

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'medium',
         'ytick.labelsize':'medium'}
pylab.rcParams.update(params)

#line thickness
import matplotlib as mpl
#mpl.rcParams['lines.linewidth'] = 5

from plotter import *

#Dataset parameters
lumis = {}
lumis['2016'] = 35.9
lumis['2017'] = 41.5
lumis['2018'] = 59.9

with open('xsec.json') as f:
    xs = json.load(f)

with open('pmap.json') as f:
    pmap = json.load(f)

systematics = ['nominal',
               'jet_triggerUp','jet_triggerDown',
               'btagWeightUp','btagWeightDown','btagEffStatUp','btagEffStatDown',
               'UESUp','UESDown','JESUp','JESDown','JERUp','JERDown',
              ]

mc = ['QCD','Wjets','Zjets','ttbar','singlet','VV','ggF','VBF','ZH','WH']


#To calculate significance
def significance(s,b):
    if b==0:
        return 0
    z_squared = 2.0*(s+b)*np.log(1.0+1.0*s/b) - 2.0*s
    return np.sqrt(z_squared)

## 1. Processing plots after Jennet's run

Coffea outputs: /eos/uscms/store/user/jennetd/may-2021/vh-charm-category/outfiles

In [4]:
year = '2017'
outsum = processor.dict_accumulator()
nfiles = len(subprocess.getoutput("ls ../infiles-split/"+year+"*.json").split())
#nfiles = 10

# Check if pickle exists, and don't re-create it if it does
repickle=True

picklename = 'pickles/{}_CScores_VHCharmSample.pkl'.format(year)
if os.path.isfile(picklename):
    repickle=False

In [5]:
# Load all files - this takes a while
if repickle:
    for n in range(1, nfiles+1):

        with open('../infiles-split/{}_{}.json'.format(year, n)) as f:
            infiles = json.load(f)
    
        filename = '/myeosdir/vh-charm-category/outfiles-ddb2/{}_{}.coffea'.format(year, n)
        #filename = 'outfiles/'+year+'_'+str(n)+'.coffea' #For local testing
        
        if os.path.isfile(filename):
            try:
                out = util.load(filename)
                outsum.add(out)
            except:
                print("File {} is broken.".format(filename))
        else:
            print('Missing file: '+str(n), infiles.keys())
            #print("File " + filename + " is missing")
        
    scale_lumi = {k: xs[k] * 1000 *lumis[year] / w for k, w in outsum['sumw'].items()}
    outsum['templates-vh-2'].scale(scale_lumi, 'dataset')
    
    # Use pmap to group the datasets together
    templates = outsum['templates-vh-2'].group('dataset', hist.Cat('process', 'Process'), pmap)

    outfile = open(picklename, 'wb')
    pickle.dump(templates, outfile, protocol=-1)
    outfile.close()

In [6]:
# Read the histogram from the pickle file
templates = pickle.load(open(picklename,'rb')).sum('msd2','ddb2').integrate('msd1',slice(117.,131.)).integrate('ddb1',slice(0.7,1)).integrate('region','signal')

## See how many events are in each two processes

In [7]:
def event_in_process_larger(templates):
    
    #New label for columns
    cols = [">={}".format(round(x,1)) for x in templates.axis('ddc2').edges()]
    edge_vals = templates.axis('ddc2').edges() #Coresponding edge value to use later

    df = pd.DataFrame(index = mc, columns = cols)
    
    #loop over the processes and ddc2 scores and calcualte the events
    for process in mc:
        for thres in edge_vals:
            df.loc[process, ">={}".format(round(thres,1))] = round(templates.integrate('process', process).integrate('ddc2', slice(thres,1)).values()[()])
    
    return df

event_in_process_larger(templates)

Unnamed: 0,>=0.0,>=0.2,>=0.4,>=0.6,>=0.8,>=1.0
QCD,25,13,11,3,0,0
Wjets,1,0,0,0,0,0
Zjets,1,1,0,0,0,0
ttbar,4,3,3,2,1,0
singlet,1,1,1,1,0,0
VV,0,0,0,0,0,0
ggF,1,0,0,0,0,0
VBF,0,0,0,0,0,0
ZH,1,1,1,1,0,0
WH,2,2,1,1,0,0


In [9]:
def event_in_process_smaller(templates):
    
    #New label for columns
    cols = ["<{}".format(round(x,1)) for x in templates.axis('ddc2').edges()]
    edge_vals = templates.axis('ddc2').edges() #Coresponding edge value to use later

    df = pd.DataFrame(index = mc, columns = cols)
    
    #loop over the processes and ddc2 scores and calcualte the events
    for process in mc:
        for thres in edge_vals:
            df.loc[process, "<{}".format(round(thres,1))] = round(templates.integrate('process', process).integrate('ddc2', slice(0.,thres)).values()[()])
    
    return df

event_in_process_smaller(templates)

Unnamed: 0,<0.0,<0.2,<0.4,<0.6,<0.8,<1.0
QCD,12,12,14,22,25,25
Wjets,0,0,1,1,1,1
Zjets,1,1,1,1,1,1
ttbar,1,1,1,2,3,4
singlet,0,0,0,0,1,1
VV,0,0,0,0,0,0
ggF,1,1,1,1,1,1
VBF,0,0,0,0,0,0
ZH,0,0,0,1,1,1
WH,0,0,1,1,1,2


## ZH, WH processes significance

### Not scaled version

In [12]:
def significance_scaled(s,b):
    
    #s = s*137.3/41.5
    #b = b*137.3/41.5
    
    if b==0:
        return 0
    z_squared = 2.0*(s+b)*np.log(1.0+1.0*s/b) - 2.0*s
    return np.sqrt(z_squared)

def significance_table(templates, proc = 'ZH'):
    
    indx = ['>=', '<=', "Quadrature Sum"]
    cols = ["{}".format(round(x,1)) for x in templates.axis('ddc2').edges()]
    edge_vals = templates.axis('ddc2').edges()
    
    significance_df = pd.DataFrame(index = indx, columns = cols)
    
    b_list = [x for x in mc if x not in ['ZH', 'WH']] #Background
    
    for cut in indx:
        for thres in edge_vals:
            
            if cut == '>=': #Integrate forward
                #Calculate the number of signals and background
                s = templates.integrate('process', proc)\
                                        .integrate('ddc2', slice(thres,1)).values()[()]
                
                b = templates.integrate('process', b_list)\
                                        .integrate('ddc2', slice(thres,1)).values()[()]

                significance_df.loc[cut, "{}".format(round(thres,1))] = significance_scaled(s,b)
                
            else: #Integrate backwards
                #Calculate the number of signals and background
                s = templates.integrate('process', proc)\
                                        .integrate('ddc2', slice(0.,thres)).values()[()]
               
                b = templates.integrate('process', b_list)\
                                        .integrate('ddc2', slice(0.,thres)).values()[()]

                significance_df.loc[cut, "{}".format(round(thres,1))] = significance_scaled(s,b)
    
    def hypot(x):
        return np.hypot(x[0], x[1])

    significance_df.loc['Quadrature Sum'] = significance_df.apply(hypot, axis = 0)
    
    return significance_df


significance_table(templates)

Unnamed: 0,0.0,0.2,0.4,0.6,0.8,1.0
>=,0.186177,0.224249,0.215904,0.239699,0.097101,0.0
<=,0.022416,0.022416,0.04857,0.094694,0.170232,0.186177
Quadrature Sum,0.187522,0.225366,0.2213,0.257726,0.195978,0.186177


In [13]:
significance_table(templates, "WH")

Unnamed: 0,0.0,0.2,0.4,0.6,0.8,1.0
>=,0.302799,0.368079,0.302861,0.341764,0.25155,0.0
<=,0.031673,0.031673,0.126576,0.175705,0.257081,0.302799
Quadrature Sum,0.304451,0.369439,0.328247,0.384285,0.359678,0.302799


### Scaled version

In [10]:
def significance_scaled(s,b):
    
    s = s*137.3/41.5
    b = b*137.3/41.5
    
    if b==0:
        return 0
    z_squared = 2.0*(s+b)*np.log(1.0+1.0*s/b) - 2.0*s
    return np.sqrt(z_squared)

def significance_table(templates, proc = 'ZH'):
    
    indx = ['>=', '<=', "Quadrature Sum"]
    cols = ["{}".format(round(x,1)) for x in templates.axis('ddc2').edges()]
    edge_vals = templates.axis('ddc2').edges()
    
    significance_df = pd.DataFrame(index = indx, columns = cols)
    
    b_list = [x for x in mc if x not in ['ZH', 'WH']] #Background
    
    for cut in indx:
        for thres in edge_vals:
            
            if cut == '>=': #Integrate forward
                #Calculate the number of signals and background
                s = templates.integrate('process', proc)\
                                        .integrate('ddc2', slice(thres,1)).values()[()]
                
                b = templates.integrate('process', b_list)\
                                        .integrate('ddc2', slice(thres,1)).values()[()]

                significance_df.loc[cut, "{}".format(round(thres,1))] = significance_scaled(s,b)
                
            else: #Integrate backwards
                #Calculate the number of signals and background
                s = templates.integrate('process', proc)\
                                        .integrate('ddc2', slice(0.,thres)).values()[()]
               
                b = templates.integrate('process', b_list)\
                                        .integrate('ddc2', slice(0.,thres)).values()[()]

                significance_df.loc[cut, "{}".format(round(thres,1))] = significance_scaled(s,b)
    
    def hypot(x):
        return np.hypot(x[0], x[1])

    significance_df.loc['Quadrature Sum'] = significance_df.apply(hypot, axis = 0)
    
    return significance_df


significance_table(templates)

Unnamed: 0,0.0,0.2,0.4,0.6,0.8,1.0
>=,0.338639,0.407888,0.39271,0.43599,0.176619,0.0
<=,0.040772,0.040772,0.088344,0.172241,0.309636,0.338639
Quadrature Sum,0.341085,0.409921,0.402524,0.46878,0.356467,0.338639


In [11]:
significance_table(templates, "WH")

Unnamed: 0,0.0,0.2,0.4,0.6,0.8,1.0
>=,0.550765,0.669502,0.550877,0.621639,0.457548,0.0
<=,0.05761,0.05761,0.230231,0.319591,0.467607,0.550765
Quadrature Sum,0.553769,0.671977,0.597053,0.69898,0.654222,0.550765
