In [30]:
import joblib 
import numpy as np
import pandas as pd 
from pathlib import Path
import matplotlib.pyplot as plt
import hist
from hist import Hist 
from uncertainties import ufloat, unumpy
from tqdm import tqdm
import re

In [None]:
# Building blocks of the histogram keys used throughout this notebook:
# "<pt>_<leadingtype>_<etaregion>_<type>_<var>".
# Only label_ptrange[:-1] is ever used to build keys.
label_ptrange = [
    500, 600, 800, 1000, 1200, 1500, 2000,
]
label_var = [
    'jet_pt',
    'jet_eta',
    'jet_nTracks',
    'jet_trackWidth',
    'jet_trackC1',
    'jet_trackBDT',
    'GBDT_newScore',
]

# Alternative (commented-out) variable lists:
# label_var = ['ntrk', 'bdt']
# label_var = ['ntrk']
label_leadingtype = [
    "LeadingJet",
    "SubLeadingJet",
]
label_etaregion = [
    "Forward",
    "Central",
]
label_type = [
    "Gluon",
    "Quark",
    "B_Quark",
    "C_Quark",
]


In [2]:
import sys
# Make the project's `core` package importable. The checkout location is a
# hard-coded absolute path, so this cell only works where that path exists.
core_code_path = '/global/cfs/projectdirs/atlas/hrzhao/HEP_Repo/QG_Calibration/NewWorkflow'
sys.path.append(core_code_path)

# NOTE(review): `label_var` imported here shares its name with the `label_var`
# list assigned in other cells of this notebook; whichever statement ran last
# determines the value seen by the kernel.
from core.Calculate_SF import convert_histdict2unumpy, Construct_unumpy
from core.utils import HistBins, label_var, label_pt_bin

In [3]:
# Directory that the cells below search for "data*_pred_hist*" files and from
# which the merged MC / Data pickles are loaded.
nominal_path = Path(
    '/global/cfs/projectdirs/atlas/hrzhao/HEP_Repo/QG_Calibration/NewWorkflow/trained_lightGBM_new/nominal'
)

In [106]:
def construct_Forward_Central(data_pred_hist_period_event_weight):
    """Group the per-selection data histograms into Forward/Central arrays.

    For every variable and every pT value in ``label_ptrange[:-1]`` the four
    histograms (LeadingJet/SubLeadingJet x Forward/Central) are looked up in
    the input under the key ``"<pt>_<leadingtype>_<etaregion>_Data_<var>"``
    and passed together to ``Construct_unumpy`` with ``sampletype="Data"``.

    Parameters
    ----------
    data_pred_hist_period_event_weight : mapping
        Histogram key -> histogram content. The callers in this notebook pass
        the return value of ``convert_histdict2unumpy``.

    Returns
    -------
    dict
        ``result[var][pt] == {"Forward_Data": ..., "Central_Data": ...}``
        where the two values are what ``Construct_unumpy`` returned.

    Raises
    ------
    KeyError
        If an expected histogram key is absent (when the input is a dict).
    """
    label_ptrange = [500, 600, 800, 1000, 1200, 1500, 2000]
    label_var = ['jet_pt', 'jet_eta', 'jet_nTracks', 'jet_trackWidth', 'jet_trackC1', 'jet_trackBDT', 'GBDT_newScore']

    label_leadingtype = ["LeadingJet", "SubLeadingJet"]
    label_etaregion = ["Forward", "Central"]

    Data_period = {}
    for var in label_var:
        Data_period[var] = {}
        for l_pt in label_ptrange[:-1]:
            # Collect all four (leading type, eta region) histograms first.
            sel_HistMap_Data_unumpy = {}
            for l_leadingtype in label_leadingtype:
                for l_etaregion in label_etaregion:
                    key_data = f"{l_pt}_{l_leadingtype}_{l_etaregion}_Data_{var}"
                    sel_HistMap_Data_unumpy[key_data] = data_pred_hist_period_event_weight[key_data]

            # Combine once per (var, pt), after the map is complete. Calling
            # this inside the loops above repeated the work on partially
            # filled maps and discarded all but the last result.
            Forward_Data, Central_Data = Construct_unumpy(
                HistMap_unumpy=sel_HistMap_Data_unumpy,
                n_bins=len(HistBins[var]) - 1,
                sampletype="Data",
            )

            Data_period[var][l_pt] = {
                "Forward_Data": Forward_Data,
                "Central_Data": Central_Data,
            }

    return Data_period

def bootstrap(nominal_Data, period, rng=None):
    """Draw one Poisson-fluctuated replica of the Forward/Central histograms.

    Every leaf array of ``nominal_Data[var][pt][region]`` is replaced by a
    Poisson draw whose mean is the array's nominal values. The uncertainty
    attached to the replica is ``sqrt(variance)``, where the variance is the
    drawn counts, scaled by ``(58.45 / 39.91) ** 2`` when ``period == "18"``.

    Parameters
    ----------
    nominal_Data : dict
        Nested ``{var: {pt: {region: uarray}}}`` mapping, e.g. the output of
        ``construct_Forward_Central``.
    period : str
        Data-taking period label; only the value ``"18"`` is treated specially.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, as before, so results are only reproducible if that is seeded.

    Returns
    -------
    dict
        New nested dict with the same keys and ``unumpy.uarray`` leaves.
    """
    # The squared ratio must be parenthesised: `58.45/39.91 ** 2` binds the
    # exponent to 39.91 only and evaluates to 58.45 / 39.91**2 (~0.037).
    # NOTE(review): 58.45 and 39.91 presumably are the target and processed
    # 2018 luminosities, i.e. a per-event weight — confirm. If the nominal
    # contents already include that weight, also confirm that w**2 (rather
    # than w) is the intended factor on the fluctuated contents.
    variance_scale_18 = (58.45 / 39.91) ** 2

    draw_poisson = np.random.poisson if rng is None else rng.poisson

    bootstrap_Data = {}
    for var, data_var in nominal_Data.items():
        bootstrap_Data[var] = {}
        for pt, data_pt in data_var.items():
            bootstrap_Data[var][pt] = {}
            for region, data_region in data_pt.items():
                nominal_counts = unumpy.nominal_values(data_region)
                # Named `replica`, not `bootstrap`, to avoid shadowing this function.
                replica = draw_poisson(lam=nominal_counts)

                if period == "18":
                    variances = variance_scale_18 * replica
                else:
                    variances = replica

                bootstrap_Data[var][pt][region] = unumpy.uarray(replica, np.sqrt(variances))

    return bootstrap_Data

def merged_bootstrap_period(bootstrap_data):
    """Sum per-period bootstrap replicas into a single nested dictionary.

    Parameters
    ----------
    bootstrap_data : dict
        ``{period: {var: {pt: {region: array}}}}``. All periods are expected
        to share the nested key structure of the first one.

    Returns
    -------
    dict
        New ``{var: {pt: {region: array}}}`` whose leaves are the element-wise
        sum over all periods. The input dictionaries and their arrays are
        left untouched.

    Raises
    ------
    IndexError
        If ``bootstrap_data`` is empty.
    KeyError
        If a later period contains a key missing from the first period.
    """
    periods = list(bootstrap_data.values())
    first_period, other_periods = periods[0], periods[1:]

    # Rebuild every dict level: a shallow `.copy()` of the outer dict would
    # still share the inner dicts (and arrays) with the first period.
    bootstrap_data_merged = {
        var: {pt: dict(data_pt) for pt, data_pt in data_var.items()}
        for var, data_var in first_period.items()
    }

    for bootstrap_data_period in other_periods:
        for var, data_var in bootstrap_data_period.items():
            for pt, data_pt in data_var.items():
                for region, data_region in data_pt.items():
                    # Out-of-place add: `+=` on a numpy array would modify the
                    # first period's array in place.
                    bootstrap_data_merged[var][pt][region] = (
                        bootstrap_data_merged[var][pt][region] + data_region
                    )

    return bootstrap_data_merged



In [109]:
# Load the per-period data histograms and build their Forward/Central arrays.
data_Forward_Central_periods = dict.fromkeys(['1516', '17', '18'])

# The period label is the run of digits between "data" and "_pred_hists".
period_search_pattern = r"data(\d+)_pred_hists"
period_regex = re.compile(period_search_pattern)

for file in sorted(nominal_path.rglob("data*_pred_hist*")):
    # The glob is looser than the regex, so a match is not guaranteed.
    period_match = period_regex.search(file.stem)
    if period_match is None:
        raise ValueError(f"Cannot extract a data period from file name: {file}")
    period = period_match.group(1)
    assert period in data_Forward_Central_periods, f"Unexpected data period {period!r} in {file}"

    # joblib.load unpickles the file; only load files from a trusted source.
    data_pred_hist_period = joblib.load(file)
    data_pred_hist_period_event_weight = convert_histdict2unumpy(data_pred_hist_period['event_weight'])

    data_Forward_Central_periods[period] = construct_Forward_Central(data_pred_hist_period_event_weight)

# A period without a file would stay None and only fail later, inside the
# bootstrap cell, with a much less helpful error.
missing_periods = [p for p, loaded in data_Forward_Central_periods.items() if loaded is None]
if missing_periods:
    raise FileNotFoundError(
        f"No 'data<period>_pred_hists' file found under {nominal_path} for period(s): {missing_periods}"
    )

In [124]:
# Generate bootstrap data from poisson distribution 
n_bootstrap = 100
bootstrap_Data = dict.fromkeys(np.arange(1, n_bootstrap+1, 1))

for i in tqdm(range(1, n_bootstrap+1)):

    bootstrap_data = {}
    for k, v in data_Forward_Central_periods.items():
        bootstrap_data[k] = bootstrap(v, period=k)

    bootstrap_Data[i] = merged_bootstrap_period(bootstrap_data) # here we have a copy! 

100%|██████████| 100/100 [00:07<00:00, 14.02it/s]


In [125]:
# Write all merged bootstrap replicas to "bootstrap_Data.pkl" in the current
# working directory (relative path).
joblib.dump(bootstrap_Data, "bootstrap_Data.pkl")

['bootstrap_Data.pkl']

In [126]:
# Merged MC and data histogram pickles located in the nominal directory.
pythia_path = nominal_path / "MC_merged_hist.pkl"
data_path = nominal_path / "Data_merged_hist.pkl"

# joblib.load unpickles the files; only load pickles from a trusted source.
pythia = joblib.load(pythia_path)
data = joblib.load(data_path)

In [127]:
# Convert the merged data histograms stored under the 'event_weight' key.
data_event_weight =  convert_histdict2unumpy(data['event_weight']) 
# Original note: for data18 the bin contents and the bin errors differ.
# NOTE(review): presumably this means the errors are not sqrt(content) for
# that period — confirm.
nominal_data = construct_Forward_Central(data_event_weight)

In [130]:
# Display the whole nested dict; the output is long (every bin of every
# variable / pT bin / region is printed).
nominal_data

{'jet_pt': {500: {'Forward_Data': array([5619073.823544502+/-2640.189360279742,
          5550812.180197716+/-2623.5926013711505,
          5123358.421682358+/-2520.137532054656,
          4510589.116436958+/-2364.214046197943, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0,
          0.0+/-0, 0.0+/-0, 0.0+/-0, 0.0+/-0], dtype=object),
   'Central_Data': array([5707765.149751663+/-2658.1680192272015,
          5610669.984716415+/-2636.1393431017727,
          5147027.018118858+/-2525.3796461719803,
      

In [129]:
# Write the nominal Forward/Central data arrays to "nominal_data.pkl" in the
# current working directory (relative path).
joblib.dump(nominal_data, "nominal_data.pkl")

['nominal_data.pkl']

In [19]:
# NOTE(review): `nominal_key` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel. Other cells index the same
# dicts with 'event_weight'; confirm whether that is the intended key.
data_nominal_unumpy = convert_histdict2unumpy(data[nominal_key])
pythia_nominal_unumpy = convert_histdict2unumpy(pythia[nominal_key])

In [17]:
# Draft construction of nominal_Data (data only). A later cell re-initialises
# Extraction_Results, nominal_Data and nominal_Pythia and fills both data and MC.
label_ptrange = [500, 600, 800, 1000, 1200, 1500, 2000]
label_var = ['jet_pt', 'jet_eta', 'jet_nTracks', 'jet_trackWidth', 'jet_trackC1', 'jet_trackBDT', 'GBDT_newScore']

label_leadingtype = ["LeadingJet", "SubLeadingJet"]
label_etaregion = ["Forward", "Central"]
label_type = ["Gluon", "Quark", "B_Quark", "C_Quark"]

Extraction_Results = {}
nominal_Data = dict.fromkeys(label_var)
nominal_Pythia = dict.fromkeys(label_var)


for var in label_var:
    Extraction_Results[var] = {}
    nominal_Data[var] = dict.fromkeys(label_ptrange[:-1])
    nominal_Pythia[var] = dict.fromkeys(label_ptrange[:-1])

    for l_pt in label_ptrange[:-1]:

        sel_HistMap_MC_unumpy = {}
        sel_HistMap_Data_unumpy = {}

        # Collect the four (leading type, eta region) data histograms.
        for l_leadingtype in label_leadingtype:
            for l_etaregion in label_etaregion:
                key_data = str(l_pt) + "_" + l_leadingtype + "_" + l_etaregion + "_" + "Data" + "_" + var
                # NOTE(review): `data_pred_hist_period_event_weight` is the
                # variable left over from the per-period loading loop, i.e. it
                # holds whichever period file was loaded last, not the merged
                # data. Confirm this is intended for something named "nominal".
                sel_HistMap_Data_unumpy[key_data] = data_pred_hist_period_event_weight[key_data]

        # Combine once per (var, pt), after the map is complete. Calling this
        # inside the loops above repeated the work on partially filled maps
        # and discarded all but the last result.
        Forward_Data, Central_Data = Construct_unumpy(HistMap_unumpy=sel_HistMap_Data_unumpy, n_bins = len(HistBins[var]) - 1, sampletype="Data")

        nominal_Data[var][l_pt] = {
            "Forward_Data": Forward_Data,
            "Central_Data": Central_Data,
        }

In [46]:

# For every variable and pT bin, gather the nominal data and MC histograms and
# let Construct_unumpy combine them into Forward/Central (and, for MC,
# Quark/Gluon) arrays.
Extraction_Results = {}
nominal_Data = dict.fromkeys(label_var)
nominal_Pythia = dict.fromkeys(label_var)

for var in label_var:
    Extraction_Results[var] = {}
    nominal_Data[var] = dict.fromkeys(label_ptrange[:-1])
    nominal_Pythia[var] = dict.fromkeys(label_ptrange[:-1])

    for l_pt in label_ptrange[:-1]:
        sel_HistMap_MC_unumpy = {}
        sel_HistMap_Data_unumpy = {}

        for l_leadingtype in label_leadingtype:
            for l_etaregion in label_etaregion:
                # Shared "<pt>_<leadingtype>_<etaregion>_" part of the keys.
                key_prefix = f"{l_pt}_{l_leadingtype}_{l_etaregion}_"

                key_data = f"{key_prefix}Data_{var}"
                sel_HistMap_Data_unumpy[key_data] = data_nominal_unumpy[key_data]

                # One MC histogram per parton type.
                for l_type in label_type:
                    key_mc = f"{key_prefix}{l_type}_{var}"
                    sel_HistMap_MC_unumpy[key_mc] = pythia_nominal_unumpy[key_mc]

        (
            Forward,
            Central,
            Quark,
            Gluon,
            Forward_Quark,
            Forward_Gluon,
            Central_Quark,
            Central_Gluon,
        ) = Construct_unumpy(
            HistMap_unumpy=sel_HistMap_MC_unumpy,
            n_bins=len(HistBins[var]) - 1,
            sampletype="MC",
        )
        Forward_Data, Central_Data = Construct_unumpy(
            HistMap_unumpy=sel_HistMap_Data_unumpy,
            n_bins=len(HistBins[var]) - 1,
            sampletype="Data",
        )

        nominal_Pythia[var][l_pt] = dict(
            Forward=Forward,
            Central=Central,
            Quark=Quark,
            Gluon=Gluon,
            Forward_Quark=Forward_Quark,
            Forward_Gluon=Forward_Gluon,
            Central_Quark=Central_Quark,
            Central_Gluon=Central_Gluon,
        )

        nominal_Data[var][l_pt] = dict(
            Forward_Data=Forward_Data,
            Central_Data=Central_Data,
        )

In [86]:
# Generate bootstrap data from poisson distribution 
# Generate n_bootstrap Poisson-fluctuated copies of nominal_Data. This
# overwrites any `bootstrap_Data` built by an earlier cell.
n_bootstrap = 5_000
bootstrap_Data = dict.fromkeys(np.arange(1, n_bootstrap+1, 1))

for i in tqdm(range(1, n_bootstrap+1)):
    bootstrap_Data[i] = dict.fromkeys(label_var)
    for var, data_var in nominal_Data.items():
        bootstrap_Data[i][var] = dict.fromkeys(label_ptrange[:-1])
        for pt, data_pt in data_var.items():
            bootstrap_Data[i][var][pt] = dict.fromkeys(["Forward_Data", "Central_Data"])
            for region, data_region in data_pt.items():
                hist_var_pt_region_unumpy = unumpy.nominal_values(data_region)
                # Deliberately not named `bootstrap`: at notebook top level
                # that would rebind the bootstrap() function defined above and
                # break any cell that calls it afterwards.
                bootstrap_counts = np.random.poisson(lam=hist_var_pt_region_unumpy)
                # The attached errors are the nominal std devs, not values
                # re-derived from the fluctuated counts.
                bootstrap_Data[i][var][pt][region] = unumpy.uarray(bootstrap_counts, unumpy.std_devs(data_region))


100%|██████████| 5000/5000 [02:46<00:00, 30.08it/s]


In [87]:
# Write the replicas to "bootstrap_Data" in the current working directory.
# Note the file name has no ".pkl" suffix, so this is a different file from
# the "bootstrap_Data.pkl" written by the earlier dump cell.
joblib.dump(bootstrap_Data, "bootstrap_Data")

['bootstrap_Data']