In [1]:
import os
import numpy as np
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot


# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)
all_files = files_sig + files_bkg

print("All files", all_files)


# define list of variable that we want to read from the files
variables_higgs = [  
    "tau_0_p4",
    "tau_1_p4",
    "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    "jet_0_p4",
    "jet_1_p4",
    "dijet_p4", # fixme add dEta
    "met_p4", 
    "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
    "boson_0_truth_p4"
]
import vector
import awkward as ak

# use uproot to convert the root files to awkward arrays
arrays = []
arrays_truth = []
# for files in [files_sig, files_bkg]:
#     arrays.append([])
#     arrays_truth.append([])
#     print("Reading file ", files)
for file in all_files:
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variables_higgs, library="ak")
        print("Data shape (events)(variables) :",len(data),len(data.fields))
        arr = []
        arr_truth = []
        for var in variables_higgs:
            if ('p4' in var) and (var != "boson_0_truth_p4"):
                # We need to extract the 4-vector pt, eta, phi, mass
                p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr.append(p4.rho) # pt
                arr.append(p4.eta) # eta
                arr.append(p4.phi) # phi
                arr.append(p4.tau) # mass

            elif (var == "boson_0_truth_p4"):
                target_p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr_truth.append(target_p4.rho) # pt
                arr_truth.append(target_p4.eta) # eta
                arr_truth.append(target_p4.phi) # phi
                arr_truth.append(target_p4.tau) # mass
                
            else:
                arr.append(data[var])
        arrays.append(arr)
        arrays_truth.append(arr_truth)

print("arrays", len(arrays))
print("arrays_truth",len(arrays_truth))

2024-12-11 11:37:22.850109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-11 11:37:22.870596: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-11 11:37:22.876973: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []
All files ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r1522

2024-12-11 11:37:26.589813: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Data shape (events)(variables) : 195502 18
Data shape (events)(variables) : 7156 18
Data shape (events)(variables) : 209617 18
Data shape (events)(variables) : 7282 18
Data shape (events)(variables) : 210515 18
Data shape (events)(variables) : 1844 18
Data shape (events)(variables) : 195498 18
Data shape (events)(variables) : 1878 18
Data shape (events)(variables) : 1 18
Data shape (events)(variables) : 5619 18
Data shape (events)(variables) : 1092 18
Data shape (events)(variables) : 31486 18
Data shape (events)(variables) : 5 18
Data shape (events)(variables) : 103 18
Data shape (events)(variables) : 26 18
Data shape (events)(variables) : 54 18
Data shape (events)(variables) : 73851 18
Data shape (events)(variables) : 2694 18
Data shape (events)(variables) : 13107 18
Data shape (events)(variables) : 22 18
arrays 20
arrays_truth 20


In [2]:
# awkward arrays tensors
# Čisto na shape jednotlivých blokov pre arrays a arrays_truth
for arr in [arrays, arrays_truth]:
    print("-" * 50)
    
    for i, arr_data in enumerate(arr):
            # Kontrola, či je arr_data iterovateľný
            if isinstance(arr_data, (list, ak.Array, np.ndarray)):
                num_rows = len(arr_data[0]) if len(arr_data) > 0 and hasattr(arr_data[0], "__len__") else 1
                num_columns = len(arr_data)
                print(f"Block {i}: Shape of matrix: ({num_rows}, {num_columns})")
            else:
                # Ak je skalár
                print(f"Block {i}: Scalar value of type {type(arr_data)}")

--------------------------------------------------
Block 0: Shape of matrix: (195502, 35)
Block 1: Shape of matrix: (7156, 35)
Block 2: Shape of matrix: (209617, 35)
Block 3: Shape of matrix: (7282, 35)
Block 4: Shape of matrix: (210515, 35)
Block 5: Shape of matrix: (1844, 35)
Block 6: Shape of matrix: (195498, 35)
Block 7: Shape of matrix: (1878, 35)
Block 8: Shape of matrix: (1, 35)
Block 9: Shape of matrix: (5619, 35)
Block 10: Shape of matrix: (1092, 35)
Block 11: Shape of matrix: (31486, 35)
Block 12: Shape of matrix: (5, 35)
Block 13: Shape of matrix: (103, 35)
Block 14: Shape of matrix: (26, 35)
Block 15: Shape of matrix: (54, 35)
Block 16: Shape of matrix: (73851, 35)
Block 17: Shape of matrix: (2694, 35)
Block 18: Shape of matrix: (13107, 35)
Block 19: Shape of matrix: (22, 35)
--------------------------------------------------
Block 0: Shape of matrix: (195502, 4)
Block 1: Shape of matrix: (7156, 4)
Block 2: Shape of matrix: (209617, 4)
Block 3: Shape of matrix: (7282, 4)
Bl

In [10]:
# Premena awkward arrays na tensorflow tensors s truth vektormi
tensors = []
truth_vectors = []
n_events = []

# Iterácia cez signal a background datasets
for j, (arr_file, arr_truth_file) in enumerate(zip(arrays, arrays_truth)):
        tensors_var = []
        truths_var = []
        n_evt = 0

        # Iterácia cez jednotlivé premenné a truth vektory
        for arr_var in arr_file:
            # Konverzia premenných na tensorflow tenzory
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)
        
        for truth_var in arr_truth_file:
            # Konverzia truth na tensorflow tensor
            truth_tensor = tf.constant(ak.to_numpy(truth_var), dtype=tf.float32)
            truths_var.append(truth_tensor)
    
        # Stack premenných
        tensor_stack = tf.stack(tensors_var, axis=1)  # Premenné (napr. 35 stĺpcov)
        truth_stack = tf.stack(truths_var, axis=1)    # Pravdivostné vektory (napr. 4 stĺpce)

        # Pridanie do zoznamov
        tensors.append(tensor_stack)
        truth_vectors.append(truth_stack)

        n_evt += tensor_stack.shape[0]

        # Výstup pre kontrolu
        print(j, tensor_stack.shape, truth_stack.shape)

        n_events.append(n_evt)

# Výpis celkových počtov udalostí
print("Tensors:", len(tensors))
print("Truth_vectors:", len(truth_vectors))
print("Events:", n_events)
print("Total events", np.sum(n_events))



0 (195502, 35) (195502, 4)
1 (7156, 35) (7156, 4)
2 (209617, 35) (209617, 4)
3 (7282, 35) (7282, 4)
4 (210515, 35) (210515, 4)
5 (1844, 35) (1844, 4)
6 (195498, 35) (195498, 4)
7 (1878, 35) (1878, 4)
8 (1, 35) (1, 4)
9 (5619, 35) (5619, 4)
10 (1092, 35) (1092, 4)
11 (31486, 35) (31486, 4)
12 (5, 35) (5, 4)
13 (103, 35) (103, 4)
14 (26, 35) (26, 4)
15 (54, 35) (54, 4)
16 (73851, 35) (73851, 4)
17 (2694, 35) (2694, 4)
18 (13107, 35) (13107, 4)
19 (22, 35) (22, 4)
Tensors: 20
Truth_vectors: 20
Events: [195502, 7156, 209617, 7282, 210515, 1844, 195498, 1878, 1, 5619, 1092, 31486, 5, 103, 26, 54, 73851, 2694, 13107, 22]
Total events 957352


In [None]:
# convert tensors to dataset
# Creating a unified list of datasets without distinguishing between signal and background
# print(type(tensors))
# print(type(tensors[0]))
# print(type(tensors[0][0]))

datasets = []

for tensor_file, truth_file in zip(tensors,truth_vectors):
        # print(type(tensor_file))
        # print(len(tensor_file))
        dataset_sample = tf.data.Dataset.from_tensor_slices((tensor_file,truth_file))
        datasets.append(dataset_sample)

# print(len(datasets))
#print(type(dataset))
#print(tensors[0][1])
# Print number of datasets
#print(len(dataset))
# print(len(tensors))
weights_list = []
for tensor, total_events in zip(tensors, n_events):
            weights = [tensor.shape[0] / total_events]
            weights_list.extend(weights)
print("weights_list_len",len(weights_list))

#print(weights_list)
#print(len(weights_list))
dataset = tf.data.Dataset.sample_from_datasets(datasets, weights=weights_list)
# Combine the datasets using sample_from_datasets method
# This will shuffle the events from all datasets into a single dataset

print(type(dataset))
# Printout the shape of the first event
for x in dataset.take(1):
    print(x)

<class 'tensorflow.python.framework.ops.EagerTensor'>
195502
<class 'tensorflow.python.framework.ops.EagerTensor'>
7156
<class 'tensorflow.python.framework.ops.EagerTensor'>
209617
<class 'tensorflow.python.framework.ops.EagerTensor'>
7282
<class 'tensorflow.python.framework.ops.EagerTensor'>
210515
<class 'tensorflow.python.framework.ops.EagerTensor'>
1844
<class 'tensorflow.python.framework.ops.EagerTensor'>
195498
<class 'tensorflow.python.framework.ops.EagerTensor'>
1878
<class 'tensorflow.python.framework.ops.EagerTensor'>
1
<class 'tensorflow.python.framework.ops.EagerTensor'>
5619
<class 'tensorflow.python.framework.ops.EagerTensor'>
1092
<class 'tensorflow.python.framework.ops.EagerTensor'>
31486
<class 'tensorflow.python.framework.ops.EagerTensor'>
5
<class 'tensorflow.python.framework.ops.EagerTensor'>
103
<class 'tensorflow.python.framework.ops.EagerTensor'>
26
<class 'tensorflow.python.framework.ops.EagerTensor'>
54
<class 'tensorflow.python.framework.ops.EagerTensor'>
7385