In [1]:
import os
import numpy as np
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot


# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)

print("Found", files_sig)


# define list of variable that we want to read from the files
variables_higgs = [  
    "tau_0_p4",
    "tau_1_p4",
    "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    "jet_0_p4",
    "jet_1_p4",
    "dijet_p4", # fixme add dEta
    "met_p4", 
    "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
    "boson_0_truth_p4"
]
import vector
import awkward as ak

# use uproot to convert the root files to awkward arrays
arrays = []
arrays_truth = []
for files in [files_sig, files_bkg]:
    arrays.append([])
    arrays_truth.append([])
    print("Reading file ", files)
    for file in files:
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variables_higgs, library="ak")
        print("Data shape (events)(variables) :",len(data),len(data.fields))
        arr = []
        arr_truth = []
        for var in variables_higgs:
            if ('p4' in var) and (var != "boson_0_truth_p4"):
                # We need to extract the 4-vector pt, eta, phi, mass
                p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr.append(p4.rho) # pt
                arr.append(p4.eta) # eta
                arr.append(p4.phi) # phi
                arr.append(p4.tau) # mass

            elif (var == "boson_0_truth_p4"):
                target_p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr_truth.append(target_p4.rho) # pt
                arr_truth.append(target_p4.eta) # eta
                arr_truth.append(target_p4.phi) # phi
                arr_truth.append(target_p4.tau) # mass
                
            else:
                arr.append(data[var])
        arrays[-1].append(arr)
        arrays_truth[-1].append(arr_truth)

2024-12-10 01:00:04.449810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 01:00:04.469581: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-10 01:00:04.475710: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []
Found ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6

2024-12-10 01:00:08.331182: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Reading file  ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6284.smPre_n

In [2]:
# awkward arrays tensors
# Čisto na shape jednotlivých blokov pre arrays a arrays_truth
for arr in [arrays, arrays_truth]:
    print("-" * 50)
    for j, arr_file in enumerate(arr):
        arrays_j = arr_file
        print(f"File {j} contains {len(arrays_j)} blocks")
        for i, arr_data in enumerate(arrays_j):
            # Kontrola, či je arr_data iterovateľný
            if isinstance(arr_data, (list, ak.Array, np.ndarray)):
                num_rows = len(arr_data[0]) if len(arr_data) > 0 and hasattr(arr_data[0], "__len__") else 1
                num_columns = len(arr_data)
                print(f"Block {i}: Shape of matrix: ({num_rows}, {num_columns})")
            else:
                # Ak je skalár
                print(f"Block {i}: Scalar value of type {type(arr_data)}")

--------------------------------------------------
File 0 contains 8 blocks
Block 0: Shape of matrix: (195502, 35)
Block 1: Shape of matrix: (7156, 35)
Block 2: Shape of matrix: (209617, 35)
Block 3: Shape of matrix: (7282, 35)
Block 4: Shape of matrix: (210515, 35)
Block 5: Shape of matrix: (1844, 35)
Block 6: Shape of matrix: (195498, 35)
Block 7: Shape of matrix: (1878, 35)
File 1 contains 12 blocks
Block 0: Shape of matrix: (1, 35)
Block 1: Shape of matrix: (5619, 35)
Block 2: Shape of matrix: (1092, 35)
Block 3: Shape of matrix: (31486, 35)
Block 4: Shape of matrix: (5, 35)
Block 5: Shape of matrix: (103, 35)
Block 6: Shape of matrix: (26, 35)
Block 7: Shape of matrix: (54, 35)
Block 8: Shape of matrix: (73851, 35)
Block 9: Shape of matrix: (2694, 35)
Block 10: Shape of matrix: (13107, 35)
Block 11: Shape of matrix: (22, 35)
--------------------------------------------------
File 0 contains 8 blocks
Block 0: Shape of matrix: (195502, 4)
Block 1: Shape of matrix: (7156, 4)
Block 2:

In [3]:
# Premena awkward arrays na tensorflow tensors s truth vektormi
tensors = []
truth_vectors = []
n_events = []

# Iterácia cez signal a background datasets
for i, (arr_sample, arr_truth_sample) in enumerate(zip(arrays, arrays_truth)):
    n_evt = 0
    tensors.append([])
    truth_vectors.append([])
    
    for j, (arr_file, arr_truth_file) in enumerate(zip(arr_sample, arr_truth_sample)):
        tensors_var = []
        truths_var = []

        # Iterácia cez jednotlivé premenné a truth vektory
        for arr_var in arr_file:
            # Konverzia premenných na tensorflow tenzory
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)
        
        for truth_var in arr_truth_file:
            # Konverzia truth na tensorflow tensor
            truth_tensor = tf.constant(ak.to_numpy(truth_var), dtype=tf.float32)
            truths_var.append(truth_tensor)

        # Stack premenných
        tensor_stack = tf.stack(tensors_var, axis=1)  # Premenné (napr. 35 stĺpcov)
        truth_stack = tf.stack(truths_var, axis=1)    # Pravdivostné vektory (napr. 4 stĺpce)

        # Pridanie do zoznamov
        tensors[-1].append(tensor_stack)
        truth_vectors[-1].append(truth_stack)

        n_evt += tensor_stack.shape[0]

        # Výstup pre kontrolu
        print(i, j, tensor_stack.shape, truth_stack.shape)

    n_events.append(n_evt)

# Výpis celkových počtov udalostí
print("Signal events:", n_events[0])
print("Background events:", n_events[1])


0 0 (195502, 35) (195502, 4)
0 1 (7156, 35) (7156, 4)
0 2 (209617, 35) (209617, 4)
0 3 (7282, 35) (7282, 4)
0 4 (210515, 35) (210515, 4)
0 5 (1844, 35) (1844, 4)
0 6 (195498, 35) (195498, 4)
0 7 (1878, 35) (1878, 4)
1 0 (1, 35) (1, 4)
1 1 (5619, 35) (5619, 4)
1 2 (1092, 35) (1092, 4)
1 3 (31486, 35) (31486, 4)
1 4 (5, 35) (5, 4)
1 5 (103, 35) (103, 4)
1 6 (26, 35) (26, 4)
1 7 (54, 35) (54, 4)
1 8 (73851, 35) (73851, 4)
1 9 (2694, 35) (2694, 4)
1 10 (13107, 35) (13107, 4)
1 11 (22, 35) (22, 4)
Signal events: 829292
Background events: 128060


In [4]:
# convert tensors to dataset
# Creating a unified list of datasets without distinguishing between signal and background
# print(type(tensors))
# print(type(tensors[0]))
# print(type(tensors[0][0]))

datasets = []
for tensors_sample,truth_sample in zip(tensors,truth_vectors):
    #print(type(tensors_sample))
    #print(len(tensors_sample))
    for tensor_file, truth_file in zip(tensors_sample,truth_sample):
        #print(type(tensor_file))
        #print(len(tensor_file))
        dataset_sample = tf.data.Dataset.from_tensor_slices((tensor_file,truth_file))
        datasets.append(dataset_sample)

# print(len(datasets))
#print(type(dataset))
#print(tensors[0][1])
# Print number of datasets
#print(len(dataset))
weights_list = []
# print(len(tensors))
for i in range(len(tensors)):
    weights = [ x.shape[0] / n_events[i] for x in tensors[i]]
    weights_list.extend(weights)

#print(weights_list)
#print(len(weights_list))
dataset = tf.data.Dataset.sample_from_datasets(datasets, weights=weights_list)
# Combine the datasets using sample_from_datasets method
# This will shuffle the events from all datasets into a single dataset

print(type(dataset))
# Printout the shape of the first event
for x in dataset.take(1):
    print(x)

<class 'tensorflow.python.data.ops.directed_interleave_op._DirectedInterleaveDataset'>
(<tf.Tensor: shape=(35,), dtype=float32, numpy=
array([ 8.68837891e+01,  2.20202118e-01, -2.98558760e+00, -1.34869913e-06,
        4.26977577e+01,  4.52400029e-01, -1.85521102e+00,  1.05658375e-01,
        2.32197911e-01,  1.13037658e+00,  1.15397882e+00,  1.92922638e+02,
        1.29581543e+02,  1.09196526e+02, -3.36378717e+00,  1.17451191e+00,
        1.36458311e+01,  1.01617905e+02,  2.29900551e+00,  4.15259659e-01,
        9.21717739e+00,  1.95825272e+02, -2.40395665e+00,  8.09227824e-01,
        1.78318237e+03,  9.18484039e+01,  0.00000000e+00, -1.97342169e+00,
        5.86059736e-03,  2.00000000e+00,  2.00000000e+00,  2.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  1.00000000e+00], dtype=float32)>, <tf.Tensor: shape=(4,), dtype=float32, numpy=
array([193.37822   ,   0.42222098,  -2.3526988 , 124.99947   ],
      dtype=float32)>)
