In [1]:
# Starting with Dan's code for loading data from root files
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot
import vector
import awkward as ak

# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)

# hack: only use the first file
files_sig = files_sig[:1]
files_bkg = files_bkg[:1]

print("Found", files_sig)

variables_higgs = [  
    ["tau_0_p4", "tau_0_q", "met_p4", 'tau_0', 0.],
    ["tau_1_p4", "tau_0_q", "met_p4", 'tau_1', 0.],
    # "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    ["jet_0_p4", 0., "met_p4", 0., 0., 0., 1.],
    ["jet_1_p4", 0., "met_p4", 0., 0., 0., 1.],
    # "dijet_p4", # fixme add dEta
    # "met_p4", 
    # "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
]

variable_names = list(set([var for vars in variables_higgs for var in vars if isinstance(var, str)]))
print("Variables to load:", variable_names)

arrays = []
for files in [files_sig, files_bkg]:
    arrays.append([])
    for file in files:
        print("Reading file", file)
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variable_names, library="ak")
        files_arr = []
        for vars in variables_higgs:
            objects_arr = []
            for var in vars:
                if isinstance(var, str):                    
                    if 'p4' in var:
                        # We need to extract the 4-vector pt, eta, phi, mass
                        p4 = vector.zip({'x': data[var]['fP']['fX'],
                                        'y': data[var]['fP']['fY'],
                                        'z': data[var]['fP']['fZ'],
                                        't': data[var]['fE']})

                        if 'met' not in var:
                            objects_arr.append(p4.rho)  # pt
                            objects_arr.append(p4.eta)  # eta
                            objects_arr.append(p4.phi)  # phi
                            objects_arr.append(p4.tau)  # mass
                        else:
                            objects_arr.append(p4.rho)  # pt
                            objects_arr.append(p4.phi)  # phi

                    
                    elif var in  ['tau_0', 'tau_1']:
                        isMuon = data[var] == 1
                        isElec = data[var] == 2
                        isHadr = data[var] == 3
                        objects_arr.append(isMuon)
                        objects_arr.append(isElec)
                        objects_arr.append(isHadr)
                        
                    else:
                        objects_arr.append(data[var])
                else:
                    objects_arr.append(var)

            files_arr.append(objects_arr)
        arrays[-1].append(files_arr)

for i,sample_arrays in enumerate(arrays):
    print("Sample", i, "has", len(sample_arrays), "files")
    for j,file_arrays in enumerate(sample_arrays):
        print("File", j, "has", len(file_arrays), "Objects")
        for k,array in enumerate(file_arrays):
            print("Object", k, "has", len(array), "variables")
            # for l,object in enumerate(array):
            #     print("Object", l, "has", len(object), "entries")
                    


2024-10-31 15:17:15.600170: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-31 15:17:15.621313: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-31 15:17:15.627922: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []


2024-10-31 15:17:19.198128: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Found ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root']
Variables to load: ['jet_0_p4', 'jet_1_p4', 'tau_0', 'tau_0_q', 'tau_0_p4', 'met_p4', 'tau_1', 'tau_1_p4']
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.700901.Sh_2214_Ztt_maxHTpT_Mll10_40_BF.PHYS.e8514_s4162_r14622_p6266.smPre_n_0_HS.NOMINAL.root
Sample 0 has 1 files
File 0 has 4 Objects
Object 0 has 11 variables
Object 1 has 11 variables
Object 2 has 11 variables
Object 3 has 11 variables
Sample 1 has 1 files
File 0 has 4 Objects
Object 0 has 11 variables
Object 1 has 11 variables
Object 2 has 11 variables
Object 3 has 11 variables


In [2]:
# convert the awkward arrays to tensorflow tensors
tensors = []
labels = []
n_events = []
for i,arr_sample in enumerate(arrays):
    n_evt = 0
    tensors.append([])
    labels.append([])
    for j, arr_file in enumerate(arr_sample):
        n_evt_file = None
        tensors_object = []
        for k, arr_object in enumerate(arr_file):
            tensors_var = []
            for arr_var in arr_object:
                if isinstance(arr_var, (float, int)):
                    tensor = tf.constant(arr_var, shape=(n_evt_file,), dtype=tf.float32) 
                else:
                    tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
                    if n_evt_file is None:
                        n_evt_file = tensor.shape[0]
                
                tensors_var.append(tensor)

            # stack object variables along the last axis
            tensor_object = tf.stack(tensors_var, axis=-1)
            tensors_object.append(tensor_object)
            print(i, j, k, tensor_object.shape)

        # stack the tensors along the axis 1
        tensor_stack = tf.stack(tensors_object, axis=1)
        tensors[-1].append(tensor_stack)
        n_evt += tensor_stack.shape[0]

        # add the label
        labels[-1].append(tf.constant(i==0, shape=(tensor_stack.shape[0],), dtype=tf.int32)) # signal files go first, so i==0 is signal
        print(i, j, tensors[-1][-1].shape, labels[-1][-1].shape)
    
    n_events.append(n_evt)

print("Signal events:", n_events[0])
print("Background events:", n_events[1])


0 0 0 (195502, 11)
0 0 1 (195502, 11)
0 0 2 (195502, 11)
0 0 3 (195502, 11)
0 0 (195502, 4, 11) (195502,)
1 0 0 (1, 11)
1 0 1 (1, 11)
1 0 2 (1, 11)
1 0 3 (1, 11)
1 0 (1, 4, 11) (1,)
Signal events: 195502
Background events: 1


In [4]:
t = tensors[-1][-1]
print(t[:, 0, 0])

tf.Tensor([34.244442], shape=(1,), dtype=float32)
