Import all the packages needed for this exercise, as listed below.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
import uproot
import pandas
import scipy
from scipy.optimize import curve_fit

Write an enumeration that lists the sample ID of each sample, using the same sample_id values as in AnaTuple.h - https://github.com/hh-italian-group/hh-bbtautau/blob/cmsdas_2019/Analysis/include/AnaTuple.h#L16

In [None]:
class Samples:
    """Identifiers used to tell the samples apart.

    The values are meant to match the sample ids declared in AnaTuple.h.
    Signal samples have negative ids, data is 0 and the simulated
    backgrounds have positive ids.
    """

    # Non-resonant signal.
    Signal_NonRes = -125
    # Resonant (Radion) signals: one id per mass point, stored as the
    # negated mass value.
    Signal_Radion = [
        -mass
        for mass in (
            260, 270, 280, 300, 320, 340, 350,
            400, 450, 500, 550, 600, 650, 750, 800, 900,
        )
    ]

    # Observed data.
    Data = 0

    # Simulated backgrounds.
    TT = 1
    DY = 2
    Wjets = 3
    SM_Higgs = 4
    other_bkg = 5
    
class Regions:
    """Identifiers of the four event regions.

    A region is defined by the relative charge of the two taus (OS =
    opposite sign, SS = same sign) and by whether they are isolated.
    """

    OS_Isolated = 1      # signal region
    OS_AntiIsolated = 2  # sideband
    SS_Isolated = 3      # sideband
    SS_AntiIsolated = 4  # sideband

Define a class for the b-tagging working points.

In [None]:
class btag_wp:
    """Thresholds of the b-tagging working points.

    A b-jet candidate passes a working point when its b-tag discriminator
    value is greater than the corresponding threshold.
    """

    Loose = 0.5426   # loosest requirement
    Medium = 0.8484  # intermediate requirement
    Tight = 0.9535   # tightest requirement

Here you can find the cross-sections and branching ratios for each mass point of the resonant samples (for the non-resonant sample they are already included in the ntuples), together with the Higgs branching ratios needed to compute the branching ratio of the two Higgs bosons decaying into 2 tau leptons and 2 b-jets.

In [None]:
# Production cross-section of the resonant signal, keyed by resonance mass.
# NOTE(review): units are presumably pb, since the resonant signal yields are
# said to be normalised to 1 pb - confirm.
# NOTE(review): the value at 600 (1.24) is larger than the one at 550 (1.2),
# unlike the rest of the falling spectrum - check against the reference.
cross_sections = {
    260: 10.31, 270: 9.62, 280: 8.79, 300: 7.20, 320: 6.19, 340: 5.22,
    350: 4.79, 400: 3.37, 450: 2.20, 500: 1.96, 550: 1.2, 600: 1.24,
    650: 0.82, 750: 0.66, 800: 0.54, 900: 0.37,
}

# Branching ratio of the resonance into a Higgs-boson pair, keyed by
# resonance mass.
br_hh = {
    260: 0.24, 270: 0.28, 280: 0.31, 300: 0.32, 320: 0.33, 340: 0.32,
    350: 0.32, 400: 0.28, 450: 0.26, 500: 0.25, 550: 0.24, 600: 0.24,
    650: 0.24, 750: 0.237, 800: 0.237, 900: 0.237,
}

# Single-Higgs branching ratios into b-quark and tau-lepton pairs.
br_h_bb = 5.809e-01
br_h_tt = 6.256e-02

# Branching ratio of the Higgs pair into 2 b-jets + 2 taus. The factor 2
# accounts for the two possible assignments: either Higgs boson can be the
# one decaying into b-quarks while the other decays into taus.
br_hh_bbtt = 2 * br_h_bb * br_h_tt

Open the input file (an anaTuple), which is different for each channel. Then create a pandas DataFrame from it.

In [None]:
# Directory holding the anaTuples: keep the line that matches the site where
# you are working and comment out the other one.
path = "/gpfs/ddn/cms/user/cmsdas/2019/hh_bbtautau/anaTuples/"  # in Pisa
# path = "/eos/home-m/mgrippo/CMSDAS_2019_hh_bbtautau/anaTuples/"  # on Swan

# Channel to work with: "eTau", "muTau" or "tauTau". It selects both the
# input file name and the name of the tree inside the file.
channel = "muTau"

# Read only the branches that are used in this exercise.
branches = ['sample_id', 'region_id', 'csv_b*', 'weight', 'MT2', 'm_ttbb_kinfit']

# Open the file with uproot and load the selected branches into a pandas
# DataFrame.
with uproot.open(path + channel + "_tuple.root") as file:
    tree = file[channel]
    df = tree.arrays(branches, outputtype=pandas.DataFrame)

In [None]:
# Signal hypothesis to work with: "non-res" (non-resonant) or "res" (resonant).
signal = "non-res"

Define the QCD contribution. Its yield in the signal region (the two taus have opposite sign and are both isolated - OS_Isolated) can be estimated from data in the sideband regions, after subtracting the contribution of the background MC.
The yield of QCD in the signal region is estimated using the following formula:

N(OS_Isolated) = N(SS_Isolated) * N(OS_AntiIsolated)/N(SS_AntiIsolated)

In [None]:
def CalculateQCD(selection):
    """Estimate the QCD yield in the signal region from the sidebands.

    In each sideband region the QCD yield is taken as the data yield minus
    the yield of the simulated backgrounds, and the estimate is

        N(OS_Isolated) = N(SS_Isolated) * N(OS_AntiIsolated) / N(SS_AntiIsolated)

    Parameters
    ----------
    selection
        Boolean mask over the rows of the global DataFrame ``df`` with the
        event selection to apply in every sideband region (it must not
        include a region requirement itself).

    Returns
    -------
    float
        The estimated QCD yield, or 0 when any of the subtractions gives a
        non-positive number.
    """
    is_data = df.sample_id == Samples.Data
    is_bkg_mc = df.sample_id.isin([
        Samples.TT, Samples.DY, Samples.Wjets,
        Samples.SM_Higgs, Samples.other_bkg,
    ])

    def sideband_yield(region_id):
        # Data minus simulated background in one sideband region.
        # NOTE(review): assumes data rows carry a unit `weight` - confirm.
        in_region = selection & (df.region_id == region_id)
        n_data = df.weight[in_region & is_data].sum()
        n_bkg_mc = df.weight[in_region & is_bkg_mc].sum()
        return n_data - n_bkg_mc

    n_ss_iso = sideband_yield(Regions.SS_Isolated)
    n_os_anti_iso = sideband_yield(Regions.OS_AntiIsolated)
    n_ss_anti_iso = sideband_yield(Regions.SS_AntiIsolated)

    # A negative (or vanishing) subtracted yield makes the estimate
    # meaningless; this also protects the division below.
    if n_ss_iso <= 0 or n_os_anti_iso <= 0 or n_ss_anti_iso <= 0:
        return 0.0

    return float(n_ss_iso * n_os_anti_iso / n_ss_anti_iso)

Write a method to calculate the significance, taking into account the difference between the resonant and non-resonant samples. Remember to add the QCD estimation when calculating the background contribution.

In [None]:
def CalculateSigma(mX, selection, region, is_res):
    """Return the expected significance s / sqrt(s + b).

    Parameters
    ----------
    mX
        Mass identifying the signal sample: the resonance mass for the
        resonant case (the sample id is ``-mX``). It is not used to pick the
        sample in the non-resonant case.
    selection
        Boolean mask over the rows of the global DataFrame ``df`` with the
        event selection, without any region requirement.
    region
        Boolean mask over the rows of ``df`` selecting the signal region.
    is_res
        True for a resonant signal, False for the non-resonant one.

    Returns
    -------
    float
        The significance, or 0 when s + b is not positive.
    """
    in_signal_region = selection & region

    signal_id = -mX if is_res else Samples.Signal_NonRes
    signal_yield = df.weight[in_signal_region & (df.sample_id == signal_id)].sum()
    if is_res:
        # The resonant signal yields are normalised to 1 pb, so they have to
        # be scaled by the cross-section times the full branching ratio. For
        # the non-resonant signal the weights already include both.
        # NOTE(review): the full BR is taken as br_hh * br_hh_bbtt - confirm.
        signal_yield *= cross_sections[mX] * br_hh[mX] * br_hh_bbtt

    is_bkg_mc = df.sample_id.isin([
        Samples.TT, Samples.DY, Samples.Wjets,
        Samples.SM_Higgs, Samples.other_bkg,
    ])
    # Total background = simulated backgrounds + data-driven QCD estimate.
    bkg_yield = df.weight[in_signal_region & is_bkg_mc].sum() + CalculateQCD(selection)

    total_yield = signal_yield + bkg_yield
    if total_yield <= 0:
        return 0.0
    return float(signal_yield / math.sqrt(total_yield))

The aim of the exercise is to calculate the significance for each resonant and non-resonant sample, using different preselections:

* Require that the event is in the signal region, which means that the two tau leptons have opposite charge and are both isolated, and require that both b-jets pass the medium working point for b-tagging.

In [None]:
# Both b-jet candidates must pass the medium b-tagging working point.
b_tag_sel_1 = df.csv_b1 > btag_wp.Medium
b_tag_sel_2 = df.csv_b2 > btag_wp.Medium
# Signal region: opposite-sign, isolated taus.
signal_region = df.region_id == Regions.OS_Isolated

# The preselection deliberately has no region requirement: the region mask is
# passed separately to CalculateSigma.
presel = b_tag_sel_1 & b_tag_sel_2

print('{:10} {:10}'.format('mX', 'sigma'))

if signal == "non-res":
    mX = 125
    sigma = CalculateSigma(mX, presel, signal_region, False)
    print('{:<10.2f} {:<10.6f}'.format(mX, sigma))

elif signal == "res":
    # The Radion sample ids are the negated resonance masses.
    mX = [-sample_id for sample_id in Samples.Signal_Radion]
    sigma = np.zeros(len(mX))
    for n, mass in enumerate(mX):
        sigma[n] = CalculateSigma(mass, presel, signal_region, True)
        print('{:<10.2f} {:<10.2f}'.format(mass, sigma[n]))

    plt.plot(mX, sigma)
    plt.xlabel(r'$M_X$ [GeV]')
    plt.ylabel(r'Significance')
    plt.title('Significance')
    plt.show()

else:
    # Fail loudly instead of printing only the table header.
    raise ValueError(
        'signal must be "non-res" or "res", got {!r}'.format(signal))

* As a second step, add a mass cut on the resonant candidate, to see how it changes the significance.

Here is a method to calculate the most probable value and the width, using the central 68% percentile interval.

In [None]:
def CalcMpvAndWidth(var, bins=200, inner_percentile=68):
    """Return the most probable value and the width of a distribution.

    The most probable value is the centre of the most populated bin of a
    histogram of ``var`` that spans from 0 to the maximum of ``var``. The
    width is half the length of the central interval that contains
    ``inner_percentile`` percent of the values.

    Values below 0 are not histogrammed, but they do enter the percentile
    computation.

    Parameters
    ----------
    var
        Array-like with the values of the variable.
    bins
        Number of histogram bins used to locate the most probable value.
    inner_percentile
        Percentage of the values contained in the central interval.

    Returns
    -------
    tuple
        ``(mpv, width)``.

    Raises
    ------
    ValueError
        If ``var`` is empty.
    """
    values = np.asarray(var)
    if values.size == 0:
        raise ValueError("CalcMpvAndWidth needs at least one value")

    counts, bin_edges = np.histogram(values, bins=bins, range=(0, np.amax(values)))
    # On ties argmax picks the first (lowest) of the most populated bins.
    peak_bin = np.argmax(counts)
    mpv = (bin_edges[peak_bin] + bin_edges[peak_bin + 1]) / 2

    # Percentage left out on each side of the central interval.
    tail = (100 - inner_percentile) / 2
    lower, upper = np.percentile(values, [tail, 100 - tail])
    width = (upper - lower) / 2

    return mpv, width

Repeat the previous calculation, adding this cut on the MT2 variable for the non-resonant analysis and on m_ttbb_kinfit for the resonant analysis.

Pay attention that each calculation of sigma depends on this cut.

Does the significance improve with this cut?