In [1]:
import argparse
import numpy as np
import pandas as pd 
import joblib
import uproot 
import awkward as ak
from pathlib import Path

from typing import Dict, List 
import re
import pickle
from root2pkl import *

In [2]:
# One Pythia JZ9 minitree file from the pythiaA directory, held as a Path so that
# parent directories can be navigated below.
sample_path = Path(
    '/global/cfs/projectdirs/atlas/hrzhao/qgcal/Samples_Dec11/pythia/pythiaA/user.wasu.Dec11a.mc16_13TeV.364709.Pythia8EvtGen_A14NNPDF23LO_jetjet_JZ9WithSW_minitrees.root/user.wasu.31564717._000001.minitrees.root'
)


In [13]:
# The period letter (A, D or E) is taken from the name of the directory two levels
# above the sample file.
# The character class was "[A,D,E]", which also accepted a literal comma.
period_search_pattern = "pythia[ADE]"
period_folder = sample_path.parent.parent
period_match = re.search(period_search_pattern, period_folder.stem)
if period_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError(f"Cannot determine the period from directory name {period_folder.name!r}")
period = period_match.group()[-1]
assert period in ["A", "D", "E"]

# Display the sum of weights read from the sibling 'pythia<period>_hist' directory.
# NOTE(review): read_SumofWeights_Period presumably comes from `from root2pkl import *` — confirm.
read_SumofWeights_Period((period_folder.parent/ f'pythia{period}_hist'), period)

array([9.60340000e+07, 1.68158606e+04, 1.73514685e+02, 1.40059219e+01,
       1.31401094e-01, 9.53733236e-03, 3.02887896e-03, 8.73200652e-04,
       1.47115647e-04])

In [9]:
# `period` is a plain string in notebook order, so `period.group()` would raise
# AttributeError; redo the search to show the full matched token.
re.search(period_search_pattern, period_folder.stem).group()

'pythiaA'

In [5]:
# Name of the directory two levels above the sample file.
sample_path.parents[1].stem

'pythiaA'

In [None]:
# Branches to read from the tree: event-level bookkeeping first, then per-jet quantities.
branch_names = [
    "run",
    "event",
    "pu_weight",
    "jet_fire",
    "jet_pt",
    "jet_eta",
    "jet_nTracks",
    "jet_trackWidth",
    "jet_trackC1",
    "jet_trackBDT",
    "jet_PartonTruthLabelID",
]
# Subset of per-jet branches: every "jet_" branch except the jet_fire flag.
branch_names_tobesaved = [
    name for name in branch_names
    if name.startswith("jet_") and name != "jet_fire"
]

In [None]:
# `sample` is not opened anywhere earlier in notebook order, so open the ROOT file
# here before listing the branches of its 'nominal' tree.
sample = uproot.open(sample_path)
sample['nominal'].keys()

In [None]:
# Read the selected branches of the 'nominal' tree into an awkward array.
branch_names = [
    "run", "event", "pu_weight", "jet_fire",
    "jet_pt", "jet_eta",
    "jet_nTracks", "jet_trackWidth", "jet_trackC1", "jet_trackBDT",
    "jet_PartonTruthLabelID",
]
sample_ak = sample['nominal'].arrays(
    branch_names,
    library='ak',
)

In [None]:
# Luminosity per period letter.
# NOTE(review): units are presumably pb^-1, to match the cross-sections in pb — confirm.
luminosity_periods = dict(A=36000, D=44500, E=58500)

# Ten cross-section entries, scaled by 1E3 into pb.
xsec = 1E3 * np.array([
    7.8050E+07,
    7.8050E+07,
    2.4330E+06,
    2.6450E+04,
    2.5461E+02,
    4.5532E+00,
    2.5754E-01,
    1.6215E-02,
    6.2506E-04,
    1.9639E-05,
])  # pb

# Ten efficiency entries, positionally matched to `xsec`.
eff = np.array([
    9.753257E-01,
    2.442497E-02,
    9.863129E-03,
    1.165838E-02,
    1.336560E-02,
    1.452648E-02,
    9.471878E-03,
    1.1097E-02,
    1.015436E-02,
    1.2056E-02,
])

In [None]:
# The last two digits of each event's run number are used as the index into xsec/eff.
JZ_slice_number = sample_ak.run%100 # JZ slice for each event
# Per-event weight = luminosity * pu_weight * cross-section * efficiency of the event's slice.
# The luminosity follows the period parsed from the sample path instead of a hard-coded 'A'.
# NOTE(review): the division by the slice's sum of weights is still disabled — confirm
# whether that normalisation is applied elsewhere.
event_weight = luminosity_periods[period] * sample_ak["pu_weight"] * xsec[JZ_slice_number] * eff[JZ_slice_number] #/ sum_of_weights[JZ_slice - 1] # JZ_slice - 1 because of 1...9 -> 0...8
# pu_weight is already multiplied by mcEventWeight in MonoJetx.cxx

# Attach the weight as a new "event_weight" field; note that this rebinds `sample`.
sample = ak.with_field(base = sample_ak, what = event_weight, where = "event_weight")


In [None]:
# Sequential event selection. Each cut is applied to the output of the previous one,
# so the order of the statements matters (later positional jet indexing only sees
# events that survived the earlier cuts).

# event trigger selection: keep events whose jet_fire flag equals 1
event_trigger_idx = sample["jet_fire"] == 1
sample = sample[event_trigger_idx]

# pT cut: second jet (index 1) above 500000
# NOTE(review): jet_pt is presumably in MeV (it is divided by 1000 later) — confirm.
# NOTE(review): indexing [:,1] assumes every remaining event has at least two jets — confirm.
pt_cut_idx = sample["jet_pt"][:,1] > 500000
sample = sample[pt_cut_idx]

# upper pT cut: first jet (index 0) below 2000000
pt_max_cut_idx = sample["jet_pt"][:,0] < 2000000 
sample = sample[pt_max_cut_idx]

# ratio of first-jet pT to second-jet pT < 1.5
sample = sample[sample["jet_pt"][:,0]/sample["jet_pt"][:,1] < 1.5]

# eta cut: both of the first two jets within |eta| < 2.1
sample = sample[np.abs(sample["jet_eta"][:,0]) < 2.1]
sample = sample[np.abs(sample["jet_eta"][:,1]) < 2.1]


# discard events whose absolute event weight is 100 or larger
sample = sample[np.abs(sample["event_weight"]) < 100] 

In [None]:
# Display the run numbers held in `sample`.
sample.run

In [None]:
# Flatten the event record into a DataFrame with one row per jet.
sample_pd = ak.to_pandas(sample)
# Keep only the first two jets of every event (second index level 0 and 1).
sample_dijet_pd = sample_pd.loc[(slice(None), slice(0,1)), :]
# Drop bookkeeping columns. 'mconly_weight' is not among the branches read into the
# array, so a strict drop raises KeyError; errors='ignore' skips absent columns.
sample_dijet_pd = sample_dijet_pd.drop(
    columns=['pu_weight', 'mconly_weight', 'jet_fire'],
    errors='ignore',
)


In [None]:
# Display the dijet DataFrame.
sample_dijet_pd

In [None]:
# (rows, columns) of the dijet DataFrame.
sample_dijet_pd.shape

In [None]:
# Show the column at positional index 2 of the dijet DataFrame.
sample_dijet_pd.iloc[:,2]

In [None]:
# Column positions of jet_pt / jet_eta, reused to index the numpy array built below.
pt_idx = sample_dijet_pd.columns.get_loc('jet_pt')
eta_idx = sample_dijet_pd.columns.get_loc('jet_eta')

# Scale jet_pt by 1/1000 on a new frame rather than in place, so re-running this
# cell does not divide the same column again.
sample_dijet_scaled_pd = sample_dijet_pd.assign(jet_pt=sample_dijet_pd['jet_pt'] / 1000)

# The reshape below pairs consecutive rows as (event, jet), which needs two rows per event.
assert len(sample_dijet_scaled_pd) % 2 == 0, "expected exactly two jet rows per event"

# Shape: (n_events, 2 jets, n_columns).
sample_dijet_np = sample_dijet_scaled_pd.to_numpy().reshape(
    (len(sample_dijet_scaled_pd)//2, 2, len(sample_dijet_scaled_pd.columns))
)
# assert np.allclose(sample_pd.loc[0]['jet_eta'].to_numpy(), sample_dijet_np[0])

In [None]:
# Within each event, the jet with the larger |eta| is the forward one; the other is central.
n_events = len(sample_dijet_np)
abs_eta = np.abs(sample_dijet_np[:, :, eta_idx])
forward_idx = abs_eta.argmax(axis=1)  # 0 or 1 per event
central_idx = 1 - forward_idx

# One-hot (n_events, 2) flag: 1.0 at the forward jet's position, 0.0 elsewhere.
is_forward = (np.arange(2)[np.newaxis, :] == forward_idx[:, np.newaxis]).astype(np.float64)

# The jet at position 0 of every event is flagged as leading.
is_leading = np.tile(np.array([1.0, 0.0]), (n_events, 1))

In [None]:
# Append the two per-jet flags as extra trailing columns: (..., is_forward, is_leading).
sample_dijet_np_label = np.concatenate(
    (
        sample_dijet_np,
        is_forward[:, :, np.newaxis],
        is_leading[:, :, np.newaxis],
    ),
    axis=2,
)

In [None]:
# Size of the last axis, i.e. the number of per-jet columns including the two appended flags.
sample_dijet_np_label.shape[-1]

In [None]:
# Flatten (event, jet, column) back to one row per jet and restore the column names,
# with the two flag columns appended at the end.
label_columns = sample_dijet_pd.columns.to_list() + ["is_forward", "is_leading"]
flat_jets = sample_dijet_np_label.reshape(-1, sample_dijet_np_label.shape[-1])
sample_pd_label = pd.DataFrame(flat_jets, columns=label_columns, dtype=np.float64)

In [None]:
# Display the labelled per-jet DataFrame.
sample_pd_label

In [None]:
# Bin edges for jet_pt; with right=False each bin is closed on the left, e.g. [500, 600).
label_pt_bin = [500, 600, 800, 1000, 1200, 1500, 2000]
# labels=False stores the integer bin index instead of an interval label.
sample_pd_label['pt_idx'] = pd.cut(
    sample_pd_label['jet_pt'],
    label_pt_bin,
    right=False,
    labels=False,
)


In [None]:
# Truth target per jet: 1 for label ID 21 (gluon), 0 for label IDs 1-9 (quark),
# and the placeholder '-' for everything else.
sample_pd_label['target'] = '-'
target_idx = sample_pd_label.columns.get_loc('target')
gluon_idx = sample_pd_label['jet_PartonTruthLabelID'] == 21
quark_idx = ((sample_pd_label['jet_PartonTruthLabelID'] > 0) & (sample_pd_label['jet_PartonTruthLabelID'] < 10))

# The masks are boolean Series, which .iloc does not accept as an indexer;
# label-based .loc is the matching accessor.
sample_pd_label.loc[gluon_idx, 'target'] = 1
sample_pd_label.loc[quark_idx, 'target'] = 0



In [None]:
# Show the jets whose truth label ID is -1.
sample_pd_label.loc[sample_pd_label['jet_PartonTruthLabelID'] == -1]

In [None]:
# Display the labelled frame again.
sample_pd_label

In [None]:
# Distinct run numbers present in the loaded array.
np.unique(sample_ak.run)

In [None]:
# Cross-section looked up per event from the last two digits of its run number.
xsec[sample_ak.run%100]

In [None]:
# Display the current value of JZ_slice_number.
JZ_slice_number

In [None]:
# Display the awkward array read from the tree.
sample_ak

In [None]:
# jet_pt values of the entry at index 5.
sample_ak.jet_pt[5]

In [None]:
# Pull the six-digit 36470x number out of the dataset directory name; its last two
# digits are kept as a single slice number.
JZ_search_pattern = "36470[0-9]"
dataset_id_match = re.search(JZ_search_pattern, sample_path.parent.stem)
JZ_slice_number = int(dataset_id_match.group()) % 100

In [None]:
# Display the slice number parsed from the path.
JZ_slice_number

In [None]:
# Print every field name together with its value for the first entry.
# NOTE(review): the original referred to `sample_array`, which is never defined in
# this notebook; `sample_ak` is assumed to be the intended array — confirm.
for field in sample_ak.fields:
    print(field, sample_ak[field][0])

In [None]:
# Event numbers of the loaded array.
# NOTE(review): the original referred to the undefined `sample_array`; `sample_ak` is assumed — confirm.
sample_ak.event


In [None]:
# By this point in notebook order `sample` has been rebound to an awkward array, so
# `sample['nominal']` no longer refers to the ROOT tree. Reopen the file explicitly
# (and close it afterwards) to read every branch through the pandas backend.
with uproot.open(sample_path) as root_file:
    sample_pd = root_file['nominal'].arrays(library='pd')

In [None]:
# Check what kind of object the pandas backend returned.
type(sample_pd)

In [None]:
# First rows of item 3 of `sample_pd`.
# NOTE(review): presumably `sample_pd` is a sequence of DataFrames here — confirm.
sample_pd[3].head()

In [None]:
# Rows labelled 0 in item 1 of `sample_pd`.
sample_pd[1].loc[0]

In [14]:
# Directory holding the pythiaA samples.
pythiaA_path = Path(
    '/global/cfs/projectdirs/atlas/hrzhao/qgcal/Samples_Dec11/pythia/pythiaA/'
)

In [None]:
# Recursively list, in sorted order, the .root files sitting directly inside any
# directory whose name ends in "JZ?WithSW_minitrees.root" (? = one character).
sorted(pythiaA_path.rglob("*JZ?WithSW_minitrees.root/*.root"))

# For data

In [33]:
# Switch to a data minitree file (data16, periodA); note that this rebinds `sample_path`.
sample_path = Path(
    '/global/cfs/projectdirs/atlas/hrzhao/qgcal/Samples_New/data/data1516/user.wasu.Oct18.data16_13TeV.periodA.physics_Main_minitrees.root/user.wasu.30894374._000001.minitrees.root'
)

In [34]:
# Open the ROOT file at `sample_path`; note that this rebinds `sample`.
sample = uproot.open(sample_path)

In [35]:
# List the branch names of the 'nominal' tree.
sample['nominal'].keys()

['run',
 'event',
 'last',
 'year',
 'mconly_weight',
 'syst_weight',
 'pu_weight',
 'weight_pileup_up',
 'weight_pileup_down',
 'pdf_weight',
 'ph_fire',
 'jet_fire',
 'jet_pt',
 'jet_eta',
 'jet_phi',
 'jet_m',
 'jet_PartonTruthLabelID',
 'jet_weight',
 'jet_truthPt',
 'jet_truthEta',
 'jet_truthPhi',
 'jet_nTracks',
 'jet_trackWidth',
 'jet_trackC1',
 'jet_trackBDT']

In [36]:
# Read the same set of branches from the 'nominal' tree of the file opened above.
branch_names = [
    "run", "event", "pu_weight", "jet_fire",
    "jet_pt", "jet_eta",
    "jet_nTracks", "jet_trackWidth", "jet_trackC1", "jet_trackBDT",
    "jet_PartonTruthLabelID",
]
sample_ak = sample['nominal'].arrays(
    branch_names,
    library='ak',
)

In [38]:
# Flatten the awkward array to a DataFrame for inspection.
ak.to_pandas(sample_ak)

Unnamed: 0_level_0,Unnamed: 1_level_0,run,event,pu_weight,jet_fire,jet_pt,jet_eta,jet_nTracks,jet_trackWidth,jet_trackC1,jet_trackBDT,jet_PartonTruthLabelID
entry,subentry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,298862,268768274,1.0,True,614843.5625,-1.369181,8.0,0.052751,0.231578,-0.306219,-9999
0,1,298862,268768274,1.0,True,589851.5625,-1.247243,13.0,0.051210,0.252397,-0.064603,-9999
1,0,298862,268438262,1.0,True,608761.5000,-1.949220,23.0,0.021191,0.177640,0.205810,-9999
1,1,298862,268438262,1.0,True,607602.0625,0.769535,22.0,0.183881,0.280812,-0.017188,-9999
2,0,298862,268100752,1.0,True,511956.6875,-1.685459,11.0,0.037250,0.218517,-0.074827,-9999
...,...,...,...,...,...,...,...,...,...,...,...,...
2623,1,298862,137034663,1.0,True,720934.8750,-1.009078,28.0,0.172463,0.314916,0.106358,-9999
2624,0,298862,136448197,1.0,True,710949.3125,-1.112797,13.0,0.042108,0.164087,-0.175678,-9999
2624,1,298862,136448197,1.0,True,660111.1875,0.348371,29.0,0.073809,0.276776,0.257864,-9999
2625,0,298862,136959564,1.0,True,549620.9375,0.355632,11.0,0.058083,0.178468,-0.186729,-9999


In [53]:
# Check whether every jet carries the truth label -9999.
np.all(sample_ak['jet_PartonTruthLabelID']==-9999)

True

In [48]:
# A file counts as data only when *every* jet carries the -9999 truth label.
# The previous test looked only at the smallest unique label, so a single -9999 jet
# among labelled ones would flag the file as data, and an empty file raised IndexError.
is_Data = bool(ak.all(sample_ak['jet_PartonTruthLabelID'] == -9999))

In [49]:
# Array of ones with the same structure as the 'event' branch.
ak.ones_like(sample_ak['event'])

<Array [1, 1, 1, 1, 1, 1, ... 1, 1, 1, 1, 1, 1] type='2626 * uint64'>

In [50]:
# Number of entries in the loaded array.
len(sample_ak)

2626