In [1]:
import os, subprocess
import json
import uproot
import awkward as ak
import numpy as np
from coffea import processor, util, hist

%matplotlib inline
import matplotlib.pyplot as plt
from cycler import cycler

import mplhep as hep
plt.style.use([hep.style.ROOT, hep.style.CMS])

In [2]:
colors = {}
colors['QCD'] = '#1f77b4'
colors['VBF'] = '#ff7f0e'
colors['VV'] = '#2ca02c'
colors['Wjets'] = '#d62728'
colors['WH'] = '#9467bd'
colors['Zjets'] = '#8c564b'
colors['ZH'] = '#e377c2'
colors['ggF'] = '#7f7f7f'
colors['ttbar'] = '#bcdb22'
colors['singlet'] = '#bcdb22'
colors['ttH'] = '#17becf'

In [3]:
lumis = {}
lumis['2016'] = 35.9
lumis['2017'] = 41.5
lumis['2018'] = 59.9

nfiles_mc = {}
nfiles_mc['2016'] = 64
nfiles_mc['2017'] = 89
nfiles_mc['2018'] = 106

with open('xsec.json') as f:
  xs = json.load(f)

with open('pmap.json') as f:
  pmap = json.load(f)

In [4]:
year = '2018'
outsum = processor.dict_accumulator()

In [5]:
# Higgs mass window
mbb_min = 110
mbb_max = 138

In [6]:
def significance(s,b):
    if b==0:
        return 0
    z_squared = 2.0*(s+b)*np.log(1.0+1.0*s/b) - 2.0*s
    return np.sqrt(z_squared)

In [7]:
def plot_overlay(x,name):
    x.label = 'Events'
    axes = hist.plotgrid(x, overlay='process', line_opts={}, order=['QCD','Zjets','Wjets','ttbar','singlet','VV','ggF','VBF','WH','ZH','ttH'])
    axes[0, 0].set_prop_cycle(cycler(color=colors.values()))
    axes[0, 0].set_yscale('log')
    axes[0, 0].set_ylim(.001, 100000)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig_name = year+'/plot-all/'+name+'.png'
    plt.savefig(fig_name,bbox_inches='tight')

In [8]:
def plot_stack(x,name):
    x.label = 'Events'
    axes = hist.plot1d(x, overlay='process', fill_opts={'edgecolor': (0,0,0,1)}, stack=True, order=['ttH','ZH','WH','VBF','ggF','VV','ttbar','singlet','Wjets','Zjets','QCD'])
    axes.set_prop_cycle(cycler(color=colors.values()))
    axes.set_yscale('log')
    axes.set_ylim(.001, 100000)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig_name = year+'/plot-all/'+name+'_stack.png'
    plt.savefig(fig_name,bbox_inches='tight')

In [9]:
def yield_plot(sr, name):
    
    nggF_sel = sr[('ggF',)]
    nVBF_sel = sr[('VBF',)]
    nWH_sel = sr[('WH',)]
    nZH_sel = sr[('ZH',)]
    nttH_sel = sr[('ttH',)]
    
    nQCD_sel = sr[('QCD',)]
    nVV_sel = sr[('VV'),]
    nWjets_sel = sr[('Wjets',)]
    nZjets_sel = sr[('Zjets',)]
    nttbar_sel = sr[('ttbar',)]
    nst_sel = sr[('singlet',)]

    sr_name = name+'-like'
    categories = [sr_name]
    
    yields = {}
    yields['ttH'] = [nttH_sel]
    yields['ZH'] = [nZH_sel]
    yields['WH'] = [nWH_sel]
    yields['VBF'] = [nVBF_sel]
    yields['ggF'] = [nggF_sel]
               
    yields['VV'] = [nVV_sel]
    yields['ttbar'] = [nttbar_sel]
    yields['singlet'] = [nst_sel]
    yields['Wjets'] = [nWjets_sel]
    yields['Zjets'] = [nZjets_sel]
    yields['QCD'] = [nQCD_sel]
    
    with open(year+'/plot-all/'+name+'_yield.json', 'w') as outfile:
        json.dump(yields, outfile)
    
    print(yields)
    
    y = [0]
    for p in ['ttH','ZH','WH','VBF','ggF','VV','Wjets','Zjets','QCD','ttbar','singlet']:
        bars = y
        if p == 'ttH':
            plt.bar(categories, yields[p], width=1, color=colors[p], label=p)
        else:
            plt.bar(categories, yields[p], width=1, color=colors[p], bottom=bars, label=p)
            
        y = [y[0]+yields[p][0]]

    plt.ylabel('Events')
    plt.yscale('log')
    plt.ylim(0.1,100000)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    fig_name = year+'/plot-all/'+name+'_yield.png'
    plt.savefig(fig_name,bbox_inches='tight')

In [11]:
# Load all MC
for n in range(1,nfiles_mc[year]+1):
    print(n)
    filename = 'condor/outfiles/'+year+'_'+str(n)+'.coffea'
    if os.path.isfile(filename):
        out = util.load(filename)
        outsum.add(out)
    else:
        print("File " + filename + " is missing")
        
scale_lumi = {k: xs[k] * 1000 *lumis[year] / w for k, w in outsum['sumw'].items()}
outsum['templates-pt'].scale(scale_lumi, 'dataset')

1
File condor/outfiles/2018_1.coffea is missing
2
File condor/outfiles/2018_2.coffea is missing
3
File condor/outfiles/2018_3.coffea is missing
4
File condor/outfiles/2018_4.coffea is missing
5
File condor/outfiles/2018_5.coffea is missing
6
File condor/outfiles/2018_6.coffea is missing
7
File condor/outfiles/2018_7.coffea is missing
8
File condor/outfiles/2018_8.coffea is missing
9
File condor/outfiles/2018_9.coffea is missing
10
File condor/outfiles/2018_10.coffea is missing
11
File condor/outfiles/2018_11.coffea is missing
12
File condor/outfiles/2018_12.coffea is missing
13
File condor/outfiles/2018_13.coffea is missing
14
File condor/outfiles/2018_14.coffea is missing
15
File condor/outfiles/2018_15.coffea is missing
16
File condor/outfiles/2018_16.coffea is missing
17
File condor/outfiles/2018_17.coffea is missing
18
File condor/outfiles/2018_18.coffea is missing
19
File condor/outfiles/2018_19.coffea is missing
20
File condor/outfiles/2018_20.coffea is missing
21
File condor/out

KeyError: 'sumw'

In [None]:
templates0 = outsum['templates-pt'].integrate('region', 'signal')
del outsum

In [None]:
templates = templates0.group('dataset', hist.Cat('process', 'Process'), pmap)

In [None]:
x = templates.sum('msd1','msd2','n2ddt2').integrate('ddb1',int_range=slice(0.89,1))

In [None]:
plot_overlay(x,'pt2')

In [None]:
plot_stack(x,'pt2')

In [None]:
x = templates.sum('pt2', 'n2ddt2', 'msd1').integrate('ddb1',int_range=slice(0.89,1))

In [None]:
plot_overlay(x,'msd2')

In [None]:
plot_stack(x,'msd2')

In [None]:
x = templates.sum('pt2','msd2','msd1').integrate('ddb1',int_range=slice(0.89,1))

In [None]:
plot_overlay(x,'n2ddt2')

In [None]:
plot_stack(x,'n2ddt2')

In [None]:
x = templates.integrate('msd1',int_range=slice(mbb_min, mbb_max)).integrate('ddb1',int_range=slice(0.89,1))

In [None]:
if year == '2017':

    cuts3 = []
    sigs3 = []

    msd2_slices = [i*7+40 for i in range(1,23)]
    for msd2_min in msd2_slices:
        for msd2_max in msd2_slices:
            if msd2_max <= msd2_min:
                continue

            sliced = x.sum('pt2','n2ddt2').integrate('msd2',int_range=slice(msd2_min,msd2_max))
            s = sliced.values()[('ZH',)] + sliced.values()[('WH',)]
            b = sliced.values()[('ggF',)] + sliced.values()[('VBF',)] + sliced.values()[('ttH',)] 
            b += sliced.values()[('QCD',)] + sliced.values()[('Wjets',)] + sliced.values()[('Zjets',)] + sliced.values()[('VV',)] + sliced.values()[('ttbar',)] + sliced.values()[('singlet',)]
            
            sigs3 += [significance(s,b)]
            cuts3 += [[msd2_min,msd2_max]]
        
    cuts3 = np.array(cuts3)
    print(max(sigs3))
    print(cuts3[np.argmax(sigs3)])

    msd_min_vh = cuts3[np.argmax(sigs3)][0]
    msd_max_vh = cuts3[np.argmax(sigs3)][1]

    plt.hist2d(cuts3[:,0], cuts3[:,1], density=False, weights=sigs3, bins=[21,21]);
    plt.xlabel('$msd_{min}$');
    plt.ylabel('$msd_{max}$');
    plt.savefig(year+'/plot-all/vh_2d_msdminmax.png')
    
else:
    msd2_min = 75
    msd2_max = 96

In [None]:
if year == '2017':

    cuts5 = []
    sigs5 = []

    pt2_slices = [300, 350, 400, 450, 500, 550, 600, 675, 800]
    n2ddt2_slices = [-0.25+0.25*i for i in range(1,3)]

    for pt2 in pt2_slices:
        for n2ddt2 in n2ddt2_slices:

            msd2_min = msd_min_vh
            msd2_max = msd_max_vh
        
            sliced = x.integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('pt2',int_range=slice(pt2,1200)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2))
        
            s = sliced.values()[('ZH',)] + sliced.values()[('WH',)]
            b = sliced.values()[('ggF',)] + sliced.values()[('VBF',)] + sliced.values()[('ttH',)] 
            b += sliced.values()[('QCD',)] + sliced.values()[('Wjets',)] + sliced.values()[('Zjets',)] + sliced.values()[('VV',)] + sliced.values()[('ttbar',)] + sliced.values()[('singlet',)]
            
            sigs5 += [significance(s,b)]
            cuts5 += [[pt2,n2ddt2]]
        
    cuts5 = np.array(cuts5)
    print(max(sigs5))
    print(cuts5[np.argmax(sigs5)])

    n2ddt2_cut = cuts5[np.argmax(sigs5)][1]
    pt2_cut = cuts5[np.argmax(sigs5)][0]

    plt.hist2d(cuts5[:,0], cuts5[:,1], density=False, weights=sigs5, bins=[[300, 350, 400, 450, 500, 550, 600, 675, 800, 1200],[0,0.25,0.5]]);
    plt.xlabel('$p_{T}$');
    plt.ylabel('n2ddt');
    plt.savefig(year+'/plot-all/vh_2d_n2ddtpt.png')
    
else:
    pt2_cut = 550
    n2ddt2_cut = 0

In [None]:
sr = x.integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2_cut)).integrate('pt2',int_range=slice(pt2_cut,1200)).values()
yield_plot(sr, 'vh-bkgopt')

In [None]:
# significance
s = sr[('WH',)] + sr[('ZH',)]
b = sr[('QCD',)] + sr[('Zjets',)] + sr[('Wjets',)]  + sr[('VV',)] + sr[('ttbar',)] +sr[('singlet',)] + sr[('ggF',)] + sr[('VBF',)] + sr[('ttH',)]

print(s,b)
print(significance(s,b))

# this only makes sense in the mass window of Higgs

In [None]:
mhist = templates.integrate('ddb1',int_range=slice(0.89,1)).integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2_cut)).integrate('pt2',int_range=slice(pt2_cut,1200))
plot_stack(mhist,'vh-msd1')

In [None]:
import pandas as pd

In [None]:
df1 = pd.DataFrame(index=['$V$ jet $m_{sd}$','$V$ jet N2DDT','$V$ jet $p_T$','$H$ jet DeepDoubleB'],columns=['ggF','VBF','WH','ZH','ttH'])
df2 = pd.DataFrame(index=['$V$ jet $m_{sd}$','$V$ jet N2DDT','$V$ jet $p_T$','$H$ jet DeepDoubleB'],columns=['QCD','Wjets','Zjets','VV','ttbar','singlet'])

In [None]:
# msd2 cut
tmp = templates.sum('msd1','pt2','n2ddt2','ddb1').integrate('msd2',int_range=slice(msd2_min,msd2_max)).values()
cutname = '$V$ jet $m_{sd}$'

df1['ggF'][cutname] = tmp[('ggF'),]
df1['VBF'][cutname] = tmp[('VBF'),]
df1['WH'][cutname] = tmp[('WH'),]
df1['ZH'][cutname] = tmp[('ZH'),]
df1['ttH'][cutname] = tmp[('ttH'),]

df2['QCD'][cutname] = tmp[('QCD'),]
df2['Wjets'][cutname] = tmp[('Wjets'),]
df2['Zjets'][cutname] = tmp[('Zjets'),]
df2['VV'][cutname] = tmp[('VV'),]
df2['ttbar'][cutname] = tmp[('ttbar'),]
df2['singlet'][cutname] = tmp[('singlet'),]

In [None]:
# n2ddt2
tmp = templates.sum('msd1','pt2','ddb1').integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2_cut)).values()
cutname = '$V$ jet N2DDT'

df1['ggF'][cutname] = tmp[('ggF'),]
df1['VBF'][cutname] = tmp[('VBF'),]
df1['WH'][cutname] = tmp[('WH'),]
df1['ZH'][cutname] = tmp[('ZH'),]
df1['ttH'][cutname] = tmp[('ttH'),]

df2['QCD'][cutname] = tmp[('QCD'),]
df2['Wjets'][cutname] = tmp[('Wjets'),]
df2['Zjets'][cutname] = tmp[('Zjets'),]
df2['VV'][cutname] = tmp[('VV'),]
df2['ttbar'][cutname] = tmp[('ttbar'),]
df2['singlet'][cutname] = tmp[('singlet'),]

In [None]:
# pt2
tmp = templates.sum('msd1','ddb1').integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2_cut)).integrate('pt2',int_range=slice(pt2_cut,1200)).values()
cutname = '$V$ jet $p_T$'

df1['ggF'][cutname] = tmp[('ggF'),]
df1['VBF'][cutname] = tmp[('VBF'),]
df1['WH'][cutname] = tmp[('WH'),]
df1['ZH'][cutname] = tmp[('ZH'),]
df1['ttH'][cutname] = tmp[('ttH'),]

df2['QCD'][cutname] = tmp[('QCD'),]
df2['Wjets'][cutname] = tmp[('Wjets'),]
df2['Zjets'][cutname] = tmp[('Zjets'),]
df2['VV'][cutname] = tmp[('VV'),]
df2['ttbar'][cutname] = tmp[('ttbar'),]
df2['singlet'][cutname] = tmp[('singlet'),]

In [None]:
# ddb1
tmp = templates.sum('msd1').integrate('msd2',int_range=slice(msd2_min,msd2_max)).integrate('n2ddt2',int_range=slice(-0.25,n2ddt2_cut)).integrate('pt2',int_range=slice(pt2_cut,1200)).integrate('ddb1',int_range=slice(0.89,1)).values()
cutname = '$H$ jet DeepDoubleB'

df1['ggF'][cutname] = tmp[('ggF'),]
df1['VBF'][cutname] = tmp[('VBF'),]
df1['WH'][cutname] = tmp[('WH'),]
df1['ZH'][cutname] = tmp[('ZH'),]
df1['ttH'][cutname] = tmp[('ttH'),]

df2['QCD'][cutname] = tmp[('QCD'),]
df2['Wjets'][cutname] = tmp[('Wjets'),]
df2['Zjets'][cutname] = tmp[('Zjets'),]
df2['VV'][cutname] = tmp[('VV'),]
df2['ttbar'][cutname] = tmp[('ttbar'),]
df2['singlet'][cutname] = tmp[('singlet'),]

In [None]:
print(df1)
df1 = df1.astype('int')
df1.to_latex(buf=year+'/cutflow-sig.tex')

In [None]:
print(df2)
df2 = df2.astype('int')
df2.to_latex(buf=year+'/cutflow-bkg.tex')

In [None]:
# Hard-coded -- combine results from all 3 years

VH = 0
tot_H = 0

#2016
VH += 0.576513 + 0.335428
tot_H += 0.00495358 + 0.0620203 + 0.576513 + 0.335428 + 0.0373796

#2017
VH += 1.62591 + 1.05862
tot_H += 0.223702 + 0.245804 + 1.62591 + 1.05862 + 0.159492

#2018
VH += 0.957751 + 0.626182
tot_H += 0.0775505 + 0.0235961 + 0.957751 + 0.626182 + 0.0504347

print(VH/tot_H)

In [None]:
np.sqrt(.115**2 + .181**2 + .196**2)