# Plotting 1D distribution of the charm score

In [1]:
import os, subprocess
import json
import uproot3
import awkward as ak
import numpy as np
from coffea import processor, util, hist
import pickle
import pandas as pd

# Plot settings: render figures inline and apply the CMS style from mplhep
%matplotlib inline
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.style.CMS)

# Font-size overrides applied on top of the CMS style (the update below runs
# after plt.style.use, so these values take precedence for the listed keys)
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'medium',
         'ytick.labelsize':'medium'}
pylab.rcParams.update(params)

# Line thickness (override currently disabled; uncomment to draw thicker lines)
import matplotlib as mpl
#mpl.rcParams['lines.linewidth'] = 5

# NOTE(review): wildcard import - the names brought in from `plotter`
# (presumably a project-local module) are not visible in this notebook and may
# shadow names defined above; prefer importing the needed names explicitly.
from plotter import *

# Dataset parameters

# Luminosity per data-taking year, keyed by the year as a string
# (presumably in fb^-1 - TODO confirm the unit)
lumis = {
    '2016': 35.9,
    '2017': 41.5,
    '2018': 59.9,
}

# Cross sections loaded from xsec.json
with open('xsec.json') as xsec_file:
    xs = json.load(xsec_file)

# Process map loaded from pmap.json
with open('pmap.json') as pmap_file:
    pmap = json.load(pmap_file)

# Names of the systematic variations: the nominal entry followed by Up/Down pairs
systematics = [
    'nominal',
    'jet_triggerUp', 'jet_triggerDown',
    'btagWeightUp', 'btagWeightDown',
    'btagEffStatUp', 'btagEffStatDown',
    'UESUp', 'UESDown',
    'JESUp', 'JESDown',
    'JERUp', 'JERDown',
]

# Simulated process names, in the row order used for the tables below
mc = [
    'QCD', 'Wjets', 'Zjets', 'ttbar', 'singlet',
    'VV', 'ggF', 'VBF', 'ZH', 'WH',
]


#To calculate significance
def significance(s,b):
    """Return the expected significance Z of a signal yield over a background yield.

    Evaluates Z = sqrt(2*(s+b)*ln(1+s/b) - 2*s).

    Parameters
    ----------
    s : float
        Signal yield.
    b : float
        Background yield.

    Returns
    -------
    float
        0.0 when ``b`` is zero or negative (the formula is undefined there),
        otherwise the value of Z.
    """
    if b <= 0:
        return 0.0
    # log1p keeps full precision when s/b is tiny, where log(1 + s/b) would
    # round the argument to exactly 1 and lose the signal contribution.
    z_squared = 2.0*(s+b)*np.log1p(1.0*s/b) - 2.0*s
    # The two terms nearly cancel for s << b, so rounding can leave z_squared
    # marginally below zero; clamp it so the square root never yields nan.
    return np.sqrt(max(z_squared, 0.0))

## 1. Processing plots after Jennet's run

Coffea outputs: /eos/uscms/store/user/jennetd/may-2021/vh-charm-category/outfiles

In [2]:
year = '2017'
outsum = processor.dict_accumulator()

# Count the input lists matching ../infiles-split/<year>*.json directly in
# Python. Splitting the output of `ls` counts the words of the shell error
# message as files whenever nothing matches the pattern.
infiles_dir = '../infiles-split'
if os.path.isdir(infiles_dir):
    nfiles = sum(
        1 for name in os.listdir(infiles_dir)
        if name.startswith(year) and name.endswith('.json')
    )
else:
    nfiles = 0
#nfiles = 10

# Only (re)build the pickle when it does not exist yet
picklename = 'pickles/{}_CScores.pkl'.format(year)
repickle = not os.path.isfile(picklename)

In [3]:
# Load all files - this takes a while
if repickle:
    for n in range(1, nfiles+1):

        filename = '/myeosdir/vh-charm-category/outfiles/{}_{}.coffea'.format(year, n)
        #filename = 'outfiles/'+year+'_'+str(n)+'.coffea' #For local testing

        if os.path.isfile(filename):
            out = util.load(filename)
            outsum.add(out)
        else:
            # The input list is only needed to report which datasets are
            # affected, so it is read only when the output file is missing.
            with open('../infiles-split/{}_{}.json'.format(year, n)) as f:
                infiles = json.load(f)
            print('Missing file: '+str(n), infiles.keys())

    # Per-dataset scale factor: xs * 1000 * lumi / sumw
    # (the factor 1000 presumably converts pb to fb - TODO confirm)
    scale_lumi = {k: xs[k] * 1000 *lumis[year] / w for k, w in outsum['sumw'].items()}
    outsum['cutflow'].scale(scale_lumi, 'dataset')

    # Use pmap to group the datasets together
    cutflow = outsum['cutflow'].group('dataset', hist.Cat('process', 'Process'), pmap)

    # Create the output directory if needed so the slow load above is not
    # wasted on a missing folder.
    pickle_dir = os.path.dirname(picklename)
    if pickle_dir:
        os.makedirs(pickle_dir, exist_ok=True)

    # Write to a temporary file and rename it into place: the existence of
    # picklename is what later runs use to skip this cell, so a partially
    # written file must never be left under that name.
    tmpname = picklename + '.tmp'
    with open(tmpname, 'wb') as outfile:
        pickle.dump(cutflow, outfile, protocol=-1)
    os.replace(tmpname, picklename)

In [4]:
# Read the histogram from the pickle file, then reduce it: sum over the msd2
# and ddb2 axes, integrate ddb1 over slice(0.89, 1) and select the 'signal'
# region.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files produced by this notebook.
with open(picklename, 'rb') as infile:
    templates = (
        pickle.load(infile)
        .sum('msd2', 'ddb2')
        .integrate('ddb1', slice(0.89, 1))
        .integrate('region', 'signal')
    )

## Creating the cutflow table

In [10]:
def table_cutflow(templates, processes=None):
    """Tabulate the rounded yield of each process above every ddc2 threshold.

    Parameters
    ----------
    templates : histogram
        Object exposing ``axis('ddc2').edges()``, ``integrate(axis, selection)``
        and ``values()``, with 'process' and 'ddc2' axes.
    processes : list of str, optional
        Process names used as table rows. Defaults to the notebook-level ``mc``
        list.

    Returns
    -------
    pandas.DataFrame
        One row per process and one column per ddc2 bin edge, labelled
        ">=<edge>" with the edge rounded to one decimal. Each cell holds the
        yield integrated over ``slice(edge, 1)``, rounded to an integer.
    """
    if processes is None:
        processes = mc

    # Thresholds and their column labels, built once so they cannot drift apart
    edge_vals = templates.axis('ddc2').edges()
    cols = [">={}".format(round(x,1)) for x in edge_vals]

    cutflow_df = pd.DataFrame(index = processes, columns = cols)

    # Loop over the processes and ddc2 thresholds and calculate the yields
    for process in processes:
        # The process selection does not depend on the threshold, so it is
        # integrated once per process rather than once per cell.
        process_hist = templates.integrate('process', process)
        for label, thres in zip(cols, edge_vals):
            cutflow_df.loc[process, label] = round(process_hist.integrate('ddc2', slice(thres,1)).values()[()])

    return cutflow_df
    
# Build the cutflow table from the templates loaded above
cutflow_df = table_cutflow(templates)

In [11]:
cutflow_df

Unnamed: 0,>=0.0,>=0.2,>=0.4,>=0.6,>=0.8,>=1.0
QCD,230514,158648,77251,29843,4828,0
Wjets,7243,5746,3472,1752,433,0
Zjets,3625,2854,1687,818,184,0
ttbar,3039,2730,2064,1319,405,0
singlet,528,463,354,220,82,0
VV,284,253,185,111,34,0
ggF,8,5,2,0,0,0
VBF,3,2,1,0,0,0
ZH,7,6,5,3,1,0
WH,11,11,8,5,2,0


## Calculating the significance

In [6]:
def significance_table(templates):
    """Compute the VH significance for every ddc2 lower threshold.

    ZH and WH are treated as signal; every other entry of the notebook-level
    ``mc`` list is treated as background. For each ddc2 bin edge the signal and
    background yields are integrated over ``slice(edge, 1)`` and passed to
    ``significance``.

    Returns a pandas DataFrame with a single 'VH' row and one ">=<edge>" column
    per bin edge (edge rounded to one decimal).
    """
    signal_procs = ['ZH', 'WH']
    background_procs = [proc for proc in mc if proc not in signal_procs]

    labels = [">={}".format(round(edge,1)) for edge in templates.axis('ddc2').edges()]
    thresholds = templates.axis('ddc2').edges()

    result = pd.DataFrame(index = ['VH'], columns = labels)

    def _yield_above(procs, cut):
        # Yield of the given processes integrated over slice(cut, 1) in ddc2
        return templates.integrate('process', procs).integrate('ddc2', slice(cut,1)).values()[()]

    for cut in thresholds:
        n_sig = _yield_above(signal_procs, cut)
        n_bkg = _yield_above(background_procs, cut)
        result.loc['VH', ">={}".format(round(cut,1))] = significance(n_sig, n_bkg)

    return result

significance_df = significance_table(templates)

In [7]:
significance_df

Unnamed: 0,>=0.0,>=0.2,>=0.4,>=0.6,>=0.8,>=1.0
VH,0.036793,0.041727,0.044366,0.047184,0.047108,0


## Significance around the Higgs mass (117 < msd1 < 131)

In [6]:
def significance_table(templates, msd1_range=(117., 131.)):
    """Compute the VH significance per ddc2 threshold inside an msd1 window.

    NOTE(review): this redefines the ``significance_table`` declared earlier in
    the notebook; once this cell has run, the name refers to this mass-window
    version.

    Parameters
    ----------
    templates : histogram
        Object with 'msd1', 'process' and 'ddc2' axes exposing
        ``axis(...).edges()``, ``integrate(axis, selection)`` and ``values()``.
    msd1_range : tuple of (float, float), optional
        Lower and upper bound of the msd1 window, passed to ``slice``.
        Defaults to the window around the Higgs mass, (117., 131.).

    Returns
    -------
    pandas.DataFrame
        A single 'VH' row with one ">=<edge>" column per ddc2 bin edge (edge
        rounded to one decimal). Each cell is ``significance(s, b)``, where s
        is the ZH + WH yield and b the yield of all other ``mc`` processes,
        both integrated over ``slice(edge, 1)`` in ddc2.
    """
    indx = ['VH']
    signal_list = ['ZH', 'WH']
    b_list = [x for x in mc if x not in signal_list] #Background

    # Thresholds and their column labels, built once so they cannot drift apart
    edge_vals = templates.axis('ddc2').edges()
    cols = [">={}".format(round(x,1)) for x in edge_vals]

    #Integrate near the Higgs mass
    templates_higgs = templates.integrate('msd1', slice(*msd1_range))

    # The process selections do not depend on the threshold, so they are
    # computed once instead of once per threshold.
    signal_hist = templates_higgs.integrate('process', signal_list)
    background_hist = templates_higgs.integrate('process', b_list)

    significance_df = pd.DataFrame(index = indx, columns = cols)

    for label, thres in zip(cols, edge_vals):

        #Calculate the number of signals and background
        s = signal_hist.integrate('ddc2', slice(thres,1)).values()[()]
        b = background_hist.integrate('ddc2', slice(thres,1)).values()[()]

        significance_df.loc['VH', label] = significance(s,b)

    return significance_df

significance_df = significance_table(templates)

In [7]:
significance_df

Unnamed: 0,>=0.0,>=0.2,>=0.4,>=0.6,>=0.8,>=1.0
VH,0.209386,0.235625,0.256123,0.229262,0.193408,0
