In [1]:
import dadrah.analysis.root_plotting_util as rpu
import dadrah.util.run_paths as runpa
import dadrah.util.string_constants as stco
import dadrah.selection.selection_util as seut
import pofah.jet_sample as js
import pofah.phase_space.cut_constants as cuts

import pathlib
import argparse
import scipy as sci
from scipy.stats import kstwo
from scipy.stats import chi2

import numpy as np
import ROOT as rt
import root_numpy as rtnp
import uuid
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
import root_numpy as rtnp
import mplhep as hep
plt.style.use(hep.style.ROOT)
import cmsstyle #mplhep for Python2 cheatsheet

Welcome to JupyROOT 6.22/00


## variable params

In [2]:
xsecs_sig   = [0,20,40,60,80,100] # signal cross sections
ae_run_n = 113
#qr_run_n, qr_model_str, train_share = 8, 'dense_70pct_train', 0.7
#qr_run_n, qr_model_str, train_share = 9, 'dense_50pct_train', 0.5
#qr_run_n, qr_model_str, train_share = 7, 'dense_polyfit', 0.2
qr_run_n, qr_model_str, train_share = 31, 'dense_kfold', 0
#quantiles = [0.0, 0.3, 0.5, 0.7, 0.9, 0.99]
quantiles = [0.0, 0.3, 0.5, 0.7, 0.9]

In [3]:
xsec_train = 0 # signal cross section used when training the QR
mX = 3.5
sample_id_qcd = 'qcdSigAllTest'+str(int((1-train_share)*100))+'pct' if train_share else 'qcdSigAll'
sample_id_sig = 'GtoWW35naReco'
sample_ids = [sample_id_qcd, sample_id_sig]
quant_tmplt_idx = 0
quant_tmplt = quantiles[quant_tmplt_idx]

In [4]:
paths = runpa.RunPaths(in_data_dir=stco.dir_path_dict['base_dir_qr_selections'], in_data_names=stco.file_name_path_dict, out_data_dir=stco.dir_path_dict['base_dir_qr_analysis'])
path_ext_dict = {'vae_run': str(ae_run_n), 'qr_run': str(qr_run_n), 'sig': sample_id_sig, 'xsec': str(int(xsec_train)), 'loss': 'rk5_05'}
paths.extend_in_path_data(path_ext_dict)
paths.extend_out_path_data({**path_ext_dict, 'hypothesis_testing': None})

## fixed params

In [5]:
# Rescaling the inputs to lumi
qcd_xsec         = 8730000.0 # Crossection in fb             
qcd_gen_events   = 134366091.0+199435365.0+90490645.0+134264102.0 #(all generated QCD)
sig_xsec_default = 10. #In units of fb (10 fb == 0.01 pb) 
sig_gen_events   = 972050.0 #(all generated signal)
lumi             = qcd_gen_events/qcd_xsec # assuming 64/fb since qcd_gen_events/8730000 ~ 64

scale_qcd = qcd_xsec*lumi/qcd_gen_events
scale_sig = (1-train_share)*sig_xsec_default*lumi/sig_gen_events #X% already used for quantile regression, must be removed
print('Usign luminosity {}'.format(lumi))
print('Scale QCD with {}'.format(scale_qcd))
print('Scale signal with {}'.format(scale_sig))

Usign luminosity 63.98123745704467
Scale QCD with 1.0
Scale signal with 0.0006582093252100682


In [6]:
# efficiencies
quantiles_tmp = np.asarray(quantiles+[1.])
effs = (quantiles_tmp)[template_q_idx+1:] - (quantiles_tmp)[template_q_idx:-1]
effs = effs[1:]/effs[0] # efficiency-test-quantiles/efficiency-template-quantile

NameError: name 'template_q_idx' is not defined

In [None]:
effs

In [None]:
n_bins = 35
min_mjj = 1600.
max_mjj = 5200.
expo_binning = False
dijet_binning = True


if dijet_binning:
    bin_edges = np.array([1200, 1255, 1320, 1387, 1457, 1529,
                          1604, 1681, 1761, 1844, 1930, 2019, 
                          2111, 2206, 2305, 2406, 2512, 2620, 
                          2733, 2849, 2969, 3093, 3221, 3353,
                          3490, 3632, 3778, 3928, 4084, 4245, 
                          4411, 4583, 4760, 4943, 5132, 5327]).astype('float')
elif expo_binning:
    x_shift = 3
    lin_bins = np.linspace(0.,1.,n_bins)
    exp_bins = lin_bins/(np.exp(-lin_bins+x_shift)/np.exp(x_shift-1))
    bin_edges = exp_bins*(max_mjj-min_mjj)+min_mjj
    
else: # simple linear binning
    bin_edges = np.array(np.linspace(1200., max_mjj, n_bins).tolist()).astype('float') #100 GeV binning. Stop at 5600! Fit fails if going to 6800
    
n_bins = len(bin_edges)-1
max_bin = bin_edges[-1]
min_bin = bin_edges[0]

## data reading

In [None]:
def read_raw_selection_data(quantiles, ae_run_n, qr_run_n, sample_id_sig, xsec_train, paths):
    samples = {}

    for sample_id in sample_ids:
        samples[sample_id] = js.JetSample.from_input_file(sample_id, paths.in_file_path(sample_id), **cuts.signalregion_cuts)
        
    # qcd raw data
    samples_ortho_quantiles_qcd = seut.divide_sample_into_orthogonal_quantiles(samples[sample_id_qcd], quantiles[1:])
    mjj_vals_qcd = [sample_ortho['mJJ'] for sample_ortho in samples_ortho_quantiles_qcd]
    # signal raw data
    samples_ortho_quantiles_sig = seut.divide_sample_into_orthogonal_quantiles(samples[sample_id_sig], quantiles[1:])
    mjj_vals_sig = [sample_ortho['mJJ'] for sample_ortho in samples_ortho_quantiles_sig]
    
    return mjj_vals_qcd, mjj_vals_sig

In [None]:
def read_raw_selection_data_to_numpy_hist(quantiles, ae_run_n, qr_run_n, sample_id_sig, xsec_train, bin_edges, paths):
    
    mjj_vals_qcd, mjj_vals_sig = read_raw_selection_data(quantiles, ae_run_n, qr_run_n, sample_id_sig, xsec_train, paths)
    
    datas_qcd = {}
    datas_sig = {}
    
    # qcd histogram data
    for mjj, q in zip(mjj_vals_qcd[template_q_idx:], quantiles[template_q_idx:]):
        counts, _, _ = plt.hist(mjj, bins=bin_edges)
        datas_qcd[q] = counts
        
    # sig histogram data
    for mjj, q in zip(mjj_vals_sig[template_q_idx:], quantiles[template_q_idx:]):
        counts, _, _ = plt.hist(mjj, bins=bin_edges)
        datas_sig[q] = counts
    
    # make signal injection dataset for all signal xsecs
    histos_data_inj = {}
    
     # for each quantile
    for quant in quantiles[template_q_idx:]:
        
        histos_data_inj_quant = {}
        
        # for each signal cross section (each signal injection value)
        for xsec in xsecs_sig:
        
            # get qcd data
            dat = datas_qcd[quant]
            # add signal data according to cross section
            scale_xsec_sig = xsec/sig_xsec_default
            sig_inj = datas_sig[quant]*scale_sig*scale_xsec_sig
            # TODO: sample sig bin height from poisson?
            
            histos_data_inj_quant[xsec] = dat + sig_inj
            
        histos_data_inj[quant] = histos_data_inj_quant
            
    return bin_edges[:-1], histos_data_inj     
    

In [None]:
bin_centers, hist_data = read_raw_selection_data_to_numpy_hist(quantiles, ae_run_n, qr_run_n, sample_id_sig, xsec_train, bin_edges, paths)

# 2-sample test with poisson likelihood

In [None]:
np.set_printoptions(precision=3, suppress=False)

In [None]:
def poisson_log_likelihood(observed, expected): # bin_h_exp must be already scaled to efficiency
    like = sci.stats.poisson.pmf(k=observed.astype('int'), mu=expected.astype('int'))
    return np.sum(-np.log(like), axis=-1) # calculating negative log like (the smaller p, the larger -log like => test for right tail)

In [None]:
def p_value_from_toys(toy_likelihoods, obs_likelihood):
    return sum(toy_likelihoods > obs_likelihood)/float(len(toy_likelihoods))

In [None]:
likelihoods_per_quantile = {}
expected_per_xse = hist_data[quant_tmplt]

for quant_test, eff in zip(quantiles[quant_tmplt_idx+1:], effs):
    
    likelihoods_per_xsec = {}
    observed_per_xsec = hist_data[quant_test]
    
    for xsec in xsecs_sig:
        observed = observed_per_xsec[xsec]
        expected = expected_per_xsec[xsec]*eff
        likelihood = poisson_log_likelihood(observed, expected)
        likelihoods_per_xsec[xsec] = likelihood
        print('quant {}, xsec {}: likelihood {}'.format(quant_test,xsec,likelihood))
    
    likelihoods_per_quantile[quant_test] = likelihoods_per_xsec

### compute toys for null-test-statistic distribution

In [None]:
toy_n = int(1e5)
toy_likes_per_xsec = {}
for xsec in xsecs_sig:
    # take expected bin heights as mu
    expected = expected_per_xsec[xsec]
    toy = sci.stats.poisson.rvs(expected, size=(toy_n,len(expected)))
    toy_likes = poisson_log_likelihood(toy, expected)
    toy_likes_per_xsec[xsec] = toy_likes

## compute and plot p-value under toy null distribution

In [None]:
for quantile, obs_likelihoods_per_xsec in likelihoods_per_quantile.items():
    fig, axs = plt.subplots(1,len(xsecs_sig[::2]),sharey=True,figsize=(18,4))
    for xsec, ax in zip(xsecs_sig[::2], axs.flat):
        toy_likes = toy_likes_per_xsec[xsec]
        obs_like = obs_likelihoods_per_xsec[xsec]
        pval = p_value_from_toys(toy_likes, obs_like)
        _ = ax.hist(toy_likes, bins=100)
        ax.vlines(obs_like, color='r', ymin=0, ymax=3e3, ls=':',lw=3)
        ax.set_title('xsec {}'.format(xsec), fontsize=16)
    fig.suptitle('quantile {}'.format(quantile), fontsize=19)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])