In [1]:
import random

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import roc_curve, auc


import uproot 
import awkward as ak
import pickle

In [2]:
# Input: flattened jet table stored as a NumPy array.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine
# that has this exact location; consider a configurable data directory.
file = "/home/dejavu/Projects/qgcal/data/dijet_sample_newBDT_flatten_data.npy"
all_jets = np.load(file)
# Rescale column 0 (jet_pt according to branch_names) by 1/1000.
# Presumably a MeV -> GeV conversion, to match the GeV-looking pT bin edges — TODO confirm.
all_jets[:, 0] = all_jets[:, 0]/1e3

In [3]:
# The table is still flat here (one row per jet). It has to be reshaped to
# (events, 2 jets, features) before the jets can be split into
# leading / subleading and forward / central.
all_jets.shape

(26953840, 9)

In [4]:
# Group every two consecutive rows into one event: (n_events, 2 jets, n_features).
# The event axis is inferred (-1) and the feature count is read from the array, so
# re-running this cell on the already reshaped array is a no-op instead of a ValueError.
all_jets = np.reshape(all_jets, (-1, 2, all_jets.shape[-1]))

In [5]:
# Sanity check after the reshape: expect (n_events, 2, 9).
all_jets.shape

(13476920, 2, 9)

In [6]:
# Column names of the last axis of all_jets, in positional order.
# The code below indexes columns by position: 0-5 are the plotted variables,
# 6 is the truth label used by split_jet_type, 7 is the weight passed to
# FillTH1F and 8 is the new BDT score.
branch_names = ["jet_pt", "jet_eta", "jet_nTracks", "jet_trackWidth", "jet_trackC1", "jet_trackBDT", "jet_PartonTruthLabelID", "total_weight", "newBDTScore"]

# Split Jets 

In [7]:
# Name fragments used to build histogram keys of the form
# "<pt_bin>_<label_pt>_<label_eta>_<label_type>_<label_var>".
label_pt = ["LeadingJet", "SubJet"]  # jet index 0 / 1 inside an event
label_eta = ["Forward", "Central"]  # larger / smaller |eta| of the two jets
# Same order as the list returned by split_jet_type.
label_type = ["Gluon", "Quark", "C_Quark", "B_Quark", "Data", "Other"]
# First six entries map to columns 0-5 of a jet row; "newBDT" maps to column 8 (see FillHisto).
label_var = ["pt", "eta", "ntrk", "width", "c1", "bdt", "newBDT"]
# pT bin edges; each bin is [edge_i, edge_{i+1}) and is keyed by its lower edge (see split_jet_pt).
label_pt_bin = [500, 600, 800, 1000, 1200, 1500, 2000]

In [16]:
def split_pt_eta_jet(jets):
    """Split two-jet events into four jet samples by jet position and |eta|.

    Parameters
    ----------
    jets : ndarray, indexed as jets[event, jet, feature]
        Jet 0 is treated as the leading jet and jet 1 as the subleading jet;
        feature 1 is eta.

    Returns
    -------
    list of 4 ndarrays, each indexed as [jet, feature], in the order
        [leading & forward, leading & central,
         subleading & forward, subleading & central].
        "Forward" is the jet with the larger |eta| in its event. On an exact
        tie, argmax picks jet 0, so the leading jet counts as forward.
    """
    # Index (0 or 1) of the more forward jet in every event.
    more_forward = np.abs(jets[:, :, 1]).argmax(axis=1)

    leading_is_forward = more_forward == 0
    subleading_is_forward = more_forward == 1

    leading = jets[:, 0, :]
    subleading = jets[:, 1, :]

    # A jet is central exactly when the other jet of its event is the forward one.
    return [
        leading[leading_is_forward],
        leading[subleading_is_forward],
        subleading[subleading_is_forward],
        subleading[leading_is_forward],
    ]

def split_jet_type(jets):
    """Group jets by the truth label stored in column 6.

    Parameters
    ----------
    jets : ndarray, indexed as jets[jet, feature]

    Returns
    -------
    list of 6 ndarrays in the order
        [gluon (21), light quark (1, 2 or 3), c quark (4), b quark (5),
         data (-9999), others (-1)].
        Rows whose label is none of these values are not returned in any group.
    """
    truth_label = jets[:, 6]

    selections = (
        truth_label == 21,                                             # gluon
        (truth_label == 1) | (truth_label == 2) | (truth_label == 3),  # light quark
        truth_label == 4,                                              # c quark
        truth_label == 5,                                              # b quark
        truth_label == -9999,                                          # data
        truth_label == -1,                                             # others
    )

    return [jets[selected] for selected in selections]

def split_jet_pt(jets, pt_bins=None):
    """Split jets into pT bins.

    Parameters
    ----------
    jets : ndarray, indexed as jets[jet, feature]
        Column 0 is the jet pT.
    pt_bins : sequence of numbers, optional
        Ascending bin edges. Defaults to the module-level ``label_pt_bin``.

    Returns
    -------
    dict
        Maps each lower bin edge to the rows with ``lower <= pT < upper``.
        Jets below the first edge or at/above the last edge are dropped.
        A bin without jets maps to an empty array.
    """
    if pt_bins is None:
        pt_bins = label_pt_bin

    pt = jets[:, 0]
    splited_jet_pt = {}
    # Walk over consecutive (lower, upper) edge pairs.
    for pt_start, pt_end in zip(pt_bins[:-1], pt_bins[1:]):
        splited_jet_pt[pt_start] = jets[(pt >= pt_start) & (pt < pt_end)]

    return splited_jet_pt


In [17]:
# Global accumulator filled by FillTH1F:
# histogram name -> [list of value arrays, list of weight arrays].
HistMap = {}
# ---- Histogram helper functions ----
def GetHistBin(histname):
    """Return the binning ``(nbins, xmin, xmax)`` for a histogram name.

    The variable is recognised by a case-sensitive substring search in
    ``histname``; the tags are tried in the order listed below and the first
    match wins.

    Raises
    ------
    ValueError
        If no known variable tag occurs in ``histname``. Previously the
        function fell through and returned None, which only surfaced later
        as an unpacking error in the caller.
    """
    binning_by_tag = (
        ('pt', (60, 0, 2000)),
        ('eta', (50, -2.5, 2.5)),
        ('ntrk', (60, 0, 60)),
        ('bdt', (60, -0.8, 0.7)),
        ('width', (60, 0., 0.4)),
        ('c1', (60, 0., 0.4)),
        ('newBDT', (60, -0.8, 0.7)),
    )
    for tag, binning in binning_by_tag:
        if tag in histname:
            return binning

    raise ValueError("No binning defined for histogram name %r" % (histname,))

def FillTH1F(histname, var, w):
    """Append one chunk of values and weights to the entry ``histname`` of HistMap.

    Parameters
    ----------
    histname : str
        Key into the global ``HistMap``. If it contains 'Data', the supplied
        weights are ignored and unit weights are used instead.
    var : array-like
        Values to add.
    w : array-like
        Per-value weights.
    """
    # Histograms tagged as data are always filled with weight 1.
    weights = np.ones(len(var)) if 'Data' in histname else w

    # First fill for this name: create the [values, weights] pair of chunk lists.
    if histname not in HistMap:
        HistMap[histname] = [[], []]

    value_chunks, weight_chunks = HistMap[histname]
    value_chunks.append(var)
    weight_chunks.append(weights)

def FillHisto(prefix, jetlist):
    """Fill one histogram per entry of ``label_var`` for the given jets.

    Parameters
    ----------
    prefix : str
        Common name prefix; each histogram is named ``prefix + "_" + variable``.
    jetlist : ndarray, indexed as jetlist[jet, feature]
        Column 7 is used as the weight for every variable.
    """
    # label_var[0..5] are read from columns 0..5; label_var[6] ("newBDT") from column 8.
    source_columns = (0, 1, 2, 3, 4, 5, 8)
    for var_index, column in enumerate(source_columns):
        FillTH1F(prefix + "_" + label_var[var_index], jetlist[:, column], jetlist[:, 7])


In [15]:
# Inspect the first event: two rows (jet 0 and jet 1), columns ordered as in branch_names.
all_jets[0]

array([[ 6.09158937e+02,  1.06175935e+00,  2.30000000e+01,
         6.13516457e-02,  2.61259615e-01,  1.01512097e-01,
        -9.99900000e+03,  1.00000000e+00,  1.18058348e-01],
       [ 5.66184000e+02, -3.90782118e-01,  1.40000000e+01,
         5.92113920e-02,  2.05691591e-01, -1.14423379e-01,
        -9.99900000e+03,  1.00000000e+00, -1.14457757e-02]])

In [18]:
# Fill HistMap with one set of histograms per
# (pT bin, leading/subleading, forward/central, jet type).

# FillTH1F only ever appends, so start from an empty map: otherwise re-running
# this cell would store every array a second time and double the histogram contents.
HistMap.clear()

splited_pt_eta_jets = split_pt_eta_jet(all_jets)
# Same order as the list returned by split_pt_eta_jet:
# LeadingJet_Forward, LeadingJet_Central, SubJet_Forward, SubJet_Central.
label_pt_eta = [label1_pt + "_" + label2_eta for label1_pt in label_pt for label2_eta in label_eta]

n_jets = 0
for i, splited_pt_eta_jet in enumerate(splited_pt_eta_jets):
    n_jets += len(splited_pt_eta_jet)

    splited_pt_eta_jets_types = split_jet_type(splited_pt_eta_jet)

    for j, jet_type in enumerate(splited_pt_eta_jets_types):
        # Skip jet types that do not occur in this sample.
        if jet_type.shape[0] == 0:
            continue

        splited_jet_pt_bins = split_jet_pt(jet_type)

        # k is the lower edge of the pT bin.
        for k, splited_jet_pt_bin in splited_jet_pt_bins.items():
            prefix = str(k) + "_" + label_pt_eta[i] + "_" + label_type[j]
            FillHisto(prefix, splited_jet_pt_bin)

# Every jet must land in exactly one of the four leading/subleading x forward/central samples.
assert n_jets == len(all_jets)*2

In [19]:
# Inspect the filled map.
# NOTE(review): this displays the whole dict, one entry per histogram; listing only the keys would be easier to read.
HistMap

{'500_LeadingJet_Forward_Data_pt': [[array([507.30325   , 584.026125  , 563.7410625 , ..., 528.2225    ,
          555.133875  , 500.07490625])],
  [array([1., 1., 1., ..., 1., 1., 1.])]],
 '500_LeadingJet_Forward_Data_eta': [[array([ 0.71226513, -0.38155711,  1.41804993, ..., -1.60686862,
           1.05504882, -0.83009017])],
  [array([1., 1., 1., ..., 1., 1., 1.])]],
 '500_LeadingJet_Forward_Data_ntrk': [[array([13., 24., 17., ..., 26., 14., 20.])],
  [array([1., 1., 1., ..., 1., 1., 1.])]],
 '500_LeadingJet_Forward_Data_width': [[array([0.02981496, 0.12542112, 0.02517764, ..., 0.0187413 , 0.01895971,
          0.01500422])],
  [array([1., 1., 1., ..., 1., 1., 1.])]],
 '500_LeadingJet_Forward_Data_c1': [[array([0.17971382, 0.27901891, 0.19649383, ..., 0.10864899, 0.16082266,
          0.16572972])],
  [array([1., 1., 1., ..., 1., 1., 1.])]],
 '500_LeadingJet_Forward_Data_bdt': [[array([-0.12203559,  0.08166298,  0.01176638, ...,  0.23523769,
          -0.16798499,  0.08679799])],
  

In [20]:
# Create (or overwrite) the output ROOT file that receives the histograms.
# NOTE(review): no close() is visible in this part of the notebook — confirm the file is closed after writing.
foutput = uproot.recreate("./dijet_pythia_data1516.root")

In [21]:
# Write every accumulated histogram, plus a companion "<name>_err" histogram that
# holds the per-bin sum of squared weights, to the output file.
for hist in HistMap.keys():
    print(hist)
    nbin, binmin, binmax = GetHistBin(hist)

    # FillTH1F stores lists of chunks; join them into flat 1-D arrays so that
    # every value stays paired with its own weight.
    values = np.concatenate(HistMap[hist][0])
    weight = np.concatenate(HistMap[hist][1])

    histogram = np.histogram(a = values, weights = weight, bins = nbin, range = (binmin,binmax))
    foutput[hist] = histogram

    # Sum of w^2 per bin: histogram the same values on the identical binning with
    # squared weights. This guarantees the same nbin bins and edges as the main
    # histogram and the same under/overflow treatment (values outside the range
    # are ignored).
    histogram_err = np.histogram(a = values, weights = np.power(weight, 2), bins = nbin, range = (binmin,binmax))
    sum_w2 = histogram_err[0]
    foutput[hist+"_err"] = histogram_err

500_LeadingJet_Forward_Data_pt
500_LeadingJet_Forward_Data_eta
500_LeadingJet_Forward_Data_ntrk
500_LeadingJet_Forward_Data_width
500_LeadingJet_Forward_Data_c1
500_LeadingJet_Forward_Data_bdt
500_LeadingJet_Forward_Data_newBDT
600_LeadingJet_Forward_Data_pt
600_LeadingJet_Forward_Data_eta
600_LeadingJet_Forward_Data_ntrk
600_LeadingJet_Forward_Data_width
600_LeadingJet_Forward_Data_c1
600_LeadingJet_Forward_Data_bdt
600_LeadingJet_Forward_Data_newBDT
800_LeadingJet_Forward_Data_pt
800_LeadingJet_Forward_Data_eta
800_LeadingJet_Forward_Data_ntrk
800_LeadingJet_Forward_Data_width
800_LeadingJet_Forward_Data_c1
800_LeadingJet_Forward_Data_bdt
800_LeadingJet_Forward_Data_newBDT
1000_LeadingJet_Forward_Data_pt
1000_LeadingJet_Forward_Data_eta
1000_LeadingJet_Forward_Data_ntrk
1000_LeadingJet_Forward_Data_width
1000_LeadingJet_Forward_Data_c1
1000_LeadingJet_Forward_Data_bdt
1000_LeadingJet_Forward_Data_newBDT
1200_LeadingJet_Forward_Data_pt
1200_LeadingJet_Forward_Data_eta
1200_LeadingJet_