In [2]:
import os
import numpy as np
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot


# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)

print("Found", files_sig)


# define list of variable that we want to read from the files
variables_higgs = [  
    "tau_0_p4",
    "tau_1_p4",
    "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    "jet_0_p4",
    "jet_1_p4",
    "dijet_p4", # fixme add dEta
    "met_p4", 
    "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
    "boson_0_truth_p4"
]
import vector
import awkward as ak

# use uproot to convert the root files to awkward arrays
arrays = []
arrays_truth = []
for files in [files_sig, files_bkg]:
    arrays.append([])
    arrays_truth.append([])
    print("Reading file ", files)
    for file in files:
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variables_higgs, library="ak")
        print("Data shape (events)(variables) :",len(data),len(data.fields))
        arr = []
        arr_truth = []
        for var in variables_higgs:
            if ('p4' in var) and (var != "boson_0_truth_p4"):
                # We need to extract the 4-vector pt, eta, phi, mass
                p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr.append(p4.rho) # pt
                arr.append(p4.eta) # eta
                arr.append(p4.phi) # phi
                arr.append(p4.tau) # mass

            elif (var == "boson_0_truth_p4"):
                target_p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr_truth.append(target_p4.rho) # pt
                arr_truth.append(target_p4.eta) # eta
                arr_truth.append(target_p4.phi) # phi
                arr_truth.append(target_p4.tau) # mass
                
            else:
                arr.append(data[var])
        arrays[-1].append(arr)
        arrays_truth[-1].append(arr_truth)

2024-12-02 02:47:28.294622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-02 02:47:28.316712: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-02 02:47:28.323496: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []
Found ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6

2024-12-02 02:47:32.376053: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Reading file  ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6284.smPre_n

In [3]:
# awkward arrays tensors
# Čisto na shape jednotlivých blokov pre arrays a arrays_truth
for arr in [arrays, arrays_truth]:
    print("-" * 50)
    for j, arr_file in enumerate(arr):
        arrays_j = arr_file
        print(f"File {j} contains {len(arrays_j)} blocks")
        for i, arr_data in enumerate(arrays_j):
            # Kontrola, či je arr_data iterovateľný
            if isinstance(arr_data, (list, ak.Array, np.ndarray)):
                num_rows = len(arr_data[0]) if len(arr_data) > 0 and hasattr(arr_data[0], "__len__") else 1
                num_columns = len(arr_data)
                print(f"Block {i}: Shape of matica: ({num_rows}, {num_columns})")
            else:
                # Ak je skalár
                print(f"Block {i}: Scalar value of type {type(arr_data)}")

--------------------------------------------------
File 0 contains 8 blocks
Block 0: Shape of matica: (195502, 35)
Block 1: Shape of matica: (7156, 35)
Block 2: Shape of matica: (209617, 35)
Block 3: Shape of matica: (7282, 35)
Block 4: Shape of matica: (210515, 35)
Block 5: Shape of matica: (1844, 35)
Block 6: Shape of matica: (195498, 35)
Block 7: Shape of matica: (1878, 35)
File 1 contains 12 blocks
Block 0: Shape of matica: (1, 35)
Block 1: Shape of matica: (5619, 35)
Block 2: Shape of matica: (1092, 35)
Block 3: Shape of matica: (31486, 35)
Block 4: Shape of matica: (5, 35)
Block 5: Shape of matica: (103, 35)
Block 6: Shape of matica: (26, 35)
Block 7: Shape of matica: (54, 35)
Block 8: Shape of matica: (73851, 35)
Block 9: Shape of matica: (2694, 35)
Block 10: Shape of matica: (13107, 35)
Block 11: Shape of matica: (22, 35)
--------------------------------------------------
File 0 contains 8 blocks
Block 0: Shape of matica: (195502, 4)
Block 1: Shape of matica: (7156, 4)
Block 2:

In [25]:
# Premena awkward arrays na tensorflow tensors s truth vektormi
import tensorflow as tf
import awkward as ak

tensors = []
truth_vectors = []
n_events = []

# Iterácia cez signal a background datasets
for i, (arr_sample, arr_truth_sample) in enumerate(zip(arrays, arrays_truth)):
    n_evt = 0
    tensors.append([])
    truth_vectors.append([])
    
    for j, (arr_file, arr_truth_file) in enumerate(zip(arr_sample, arr_truth_sample)):
        tensors_var = []
        truths_var = []

        # Iterácia cez jednotlivé premenné a truth vektory
        for arr_var in arr_file:
            # Konverzia premenných na tensorflow tenzory
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)
        
        for truth_var in arr_truth_file:
            # Konverzia truth na tensorflow tensor
            truth_tensor = tf.constant(ak.to_numpy(truth_var), dtype=tf.float32)
            truths_var.append(truth_tensor)

        # Stack premenných
        tensor_stack = tf.stack(tensors_var, axis=1)  # Premenné (napr. 35 stĺpcov)
        truth_stack = tf.stack(truths_var, axis=1)    # Pravdivostné vektory (napr. 4 stĺpce)

        # Pridanie do zoznamov
        tensors[-1].append(tensor_stack)
        truth_vectors[-1].append(truth_stack)

        n_evt += tensor_stack.shape[0]

        # Výstup pre kontrolu
        print(i, j, tensor_stack.shape, truth_stack.shape)

    n_events.append(n_evt)

# Výpis celkových počtov udalostí
print("Signal events:", n_events[0])
print("Background events:", n_events[1])


0 0 (195502, 35) (195502, 4)
0 1 (7156, 35) (7156, 4)
0 2 (209617, 35) (209617, 4)
0 3 (7282, 35) (7282, 4)
0 4 (210515, 35) (210515, 4)
0 5 (1844, 35) (1844, 4)
0 6 (195498, 35) (195498, 4)
0 7 (1878, 35) (1878, 4)
1 0 (1, 35) (1, 4)
1 1 (5619, 35) (5619, 4)
1 2 (1092, 35) (1092, 4)
1 3 (31486, 35) (31486, 4)
1 4 (5, 35) (5, 4)
1 5 (103, 35) (103, 4)
1 6 (26, 35) (26, 4)
1 7 (54, 35) (54, 4)
1 8 (73851, 35) (73851, 4)
1 9 (2694, 35) (2694, 4)
1 10 (13107, 35) (13107, 4)
1 11 (22, 35) (22, 4)
Signal events: 829292
Background events: 128060


In [15]:
# convert the awkward arrays to tensorflow tensors
tensors = []
labels = []
n_events = []
for i,arr_sample in enumerate(arrays):
    n_evt = 0
    tensors.append([])
    labels.append([])
    for j, arr_file in enumerate(arr_sample):
        tensors_var = []
        for arr_var in arr_file:
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)

        # stack the tensors along the first axis
        tensor_stack = tf.stack(tensors_var, axis=1)
        tensors[-1].append(tensor_stack)
        n_evt += tensor_stack.shape[0]

        # add the label
        labels[-1].append(tf.constant(i==0, shape=(tensor.shape[0]), dtype=tf.int32)) # signal files go first, so i==0 is signal
        print(i, j, tensors[-1][-1].shape, labels[-1][-1].shape)
    n_events.append(n_evt)

print("Signal events:", n_events[0])
print("Background events:", n_events[1])
print(len(tensors))
print(type(tensors))
print(tensors[0][0])


0 0 (195502, 35) (195502,)
0 1 (7156, 35) (7156,)
0 2 (209617, 35) (209617,)
0 3 (7282, 35) (7282,)
0 4 (210515, 35) (210515,)
0 5 (1844, 35) (1844,)
0 6 (195498, 35) (195498,)
0 7 (1878, 35) (1878,)
1 0 (1, 35) (1,)
1 1 (5619, 35) (5619,)
1 2 (1092, 35) (1092,)
1 3 (31486, 35) (31486,)
1 4 (5, 35) (5,)
1 5 (103, 35) (103,)
1 6 (26, 35) (26,)
1 7 (54, 35) (54,)
1 8 (73851, 35) (73851,)
1 9 (2694, 35) (2694,)
1 10 (13107, 35) (13107,)
1 11 (22, 35) (22,)
Signal events: 829292
Background events: 128060
2
<class 'list'>
tf.Tensor(
[[47.543556    1.6271921   0.50715464 ...  0.          1.
   1.        ]
 [53.350586    0.7268461  -0.72196424 ...  1.          0.
   1.        ]
 [62.28143     0.49770078  1.5526772  ...  0.          1.
   1.        ]
 ...
 [34.928345   -0.8959155  -0.98953533 ...  0.          1.
   1.        ]
 [47.789715    0.8239082   2.8537514  ...  1.          0.
   1.        ]
 [65.47977     1.2410988   3.1411672  ...  1.          0.
   1.        ]], shape=(195502, 35), d

In [None]:
# convert tensors to dataset
# Creating a unified list of datasets without distinguishing between signal and background
print(type(tensors))
print(type(tensors[0]))
print(type(tensors[0][0]))

datasets = []
for tensors_sample in tensors:
    #print(type(tensors_sample))
    #print(len(tensors_sample))
    for tensor_file in tensors_sample:
        #print(type(tensor_file))
        #print(len(tensor_file))
        dataset_sample = tf.data.Dataset.from_tensor_slices((tensor_file))
        datasets.append(dataset_sample)

print(len(datasets))
#print(type(dataset))
#print(tensors[0][1])
# Print number of datasets
#print(len(dataset))
weights_list = []
for i in range(len(tensors)):
    weights = [ x.shape[0] / n_events[i] for x in tensors[i]]
    weights_list.extend(weights)

#print(weights_list)
#print(len(weights_list))
dataset = tf.data.Dataset.sample_from_datasets(datasets, weights=weights_list)
# Combine the datasets using sample_from_datasets method
# This will shuffle the events from all datasets into a single dataset

print(type(dataset))
# Printout the shape of the first event
for x in dataset.take(1):
    print(x[0])

<class 'list'>
<class 'list'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
20
<class 'tensorflow.python.data.ops.directed_interleave_op._DirectedInterleaveDataset'>
tf.Tensor(47.543556, shape=(), dtype=float32)
