In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot


# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)

print("Found", files_sig)


# define list of variable that we want to read from the files
variables_higgs = [  
    "tau_0_p4",
    "tau_1_p4",
    "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    "jet_0_p4",
    "jet_1_p4",
    "dijet_p4", # fixme add dEta
    "met_p4", 
    "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
]

import vector
import awkward as ak

# use uproot to convert the root files to awkward arrays
arrays = []
for files in [files_sig, files_bkg]:
    arrays.append([])
    for file in files:
        print("Reading file", file)
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variables_higgs, library="ak")
        arr = []
        for var in variables_higgs:
            if 'p4' in var:
                # We need to extract the 4-vector pt, eta, phi, mass
                p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr.append(p4.rho) # pt
                arr.append(p4.eta) # eta
                arr.append(p4.phi) # phi
                arr.append(p4.tau) # mass
            
            else:
                arr.append(data[var])

        arrays[-1].append(arr)    

print(len(arrays[0]))
print(len(arrays[1]))

    


2024-11-16 21:36:12.243572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-16 21:36:12.264628: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-16 21:36:12.271124: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []
Found ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6

2024-11-16 21:36:16.340679: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125

In [3]:
# convert the awkward arrays to tensorflow tensors
tensors = []
labels = []
n_events = []
for i,arr_sample in enumerate(arrays):
    n_evt = 0
    tensors.append([])
    labels.append([])
    for j, arr_file in enumerate(arr_sample):
        tensors_var = []
        for arr_var in arr_file:
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)

        # stack the tensors along the first axis
        tensor_stack = tf.stack(tensors_var, axis=1)
        tensors[-1].append(tensor_stack)
        n_evt += tensor_stack.shape[0]

        # add the label
        labels[-1].append(tf.constant(i==0, shape=(tensor.shape[0]), dtype=tf.int32)) # signal files go first, so i==0 is signal
        print(i, j, tensors[-1][-1].shape, labels[-1][-1].shape)
    n_events.append(n_evt)

print("Signal events:", n_events[0])
print("Background events:", n_events[1])
print(len(tensors))
print(type(tensors))


0 0 (195502, 35) (195502,)
0 1 (7156, 35) (7156,)
0 2 (209617, 35) (209617,)
0 3 (7282, 35) (7282,)
0 4 (210515, 35) (210515,)
0 5 (1844, 35) (1844,)
0 6 (195498, 35) (195498,)
0 7 (1878, 35) (1878,)
1 0 (1, 35) (1,)
1 1 (5619, 35) (5619,)
1 2 (1092, 35) (1092,)
1 3 (31486, 35) (31486,)
1 4 (5, 35) (5,)
1 5 (103, 35) (103,)
1 6 (26, 35) (26,)
1 7 (54, 35) (54,)
1 8 (73851, 35) (73851,)
1 9 (2694, 35) (2694,)
1 10 (13107, 35) (13107,)
1 11 (22, 35) (22,)
Signal events: 829292
Background events: 128060
2
<class 'list'>


In [6]:
# convert tensors to dataset
# Creating a unified list of datasets without distinguishing between signal and background
print(type(tensors))
print(type(tensors[0]))
print(type(tensors[0][0]))

datasets = []
for tensors_sample in tensors:
    print(type(tensors_sample))
    print(len(tensors_sample))
    for tensor_file in tensors_sample:
        print(type(tensor_file))
        print(len(tensor_file))
        dataset = tf.data.Dataset.from_tensor_slices((tensor_file))
        datasets.append(dataset)
print(len(tensors))
#print(type(dataset))
#print(tensors[0][1])
# Print number of datasets
#print(len(dataset))
weights_list = []
for i in range(len(tensors)):
    weights = [ x.shape[0] / n_events[i] for x in tensors[i]]
    weights_list.extend(weights)

print(weights_list)
dataset = tf.data.Dataset.sample_from_datasets(datasets, weights=weights_list)
# Combine the datasets using sample_from_datasets method
# This will shuffle the events from all datasets into a single dataset


# Printout the shape of the first event
for x in dataset.take(5):
    print(x)

<class 'list'>
<class 'list'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'list'>
8
<class 'tensorflow.python.framework.ops.EagerTensor'>
195502
<class 'tensorflow.python.framework.ops.EagerTensor'>
7156
<class 'tensorflow.python.framework.ops.EagerTensor'>
209617
<class 'tensorflow.python.framework.ops.EagerTensor'>
7282
<class 'tensorflow.python.framework.ops.EagerTensor'>
210515
<class 'tensorflow.python.framework.ops.EagerTensor'>
1844
<class 'tensorflow.python.framework.ops.EagerTensor'>
195498
<class 'tensorflow.python.framework.ops.EagerTensor'>
1878
<class 'list'>
12
<class 'tensorflow.python.framework.ops.EagerTensor'>
1
<class 'tensorflow.python.framework.ops.EagerTensor'>
5619
<class 'tensorflow.python.framework.ops.EagerTensor'>
1092
<class 'tensorflow.python.framework.ops.EagerTensor'>
31486
<class 'tensorflow.python.framework.ops.EagerTensor'>
5
<class 'tensorflow.python.framework.ops.EagerTensor'>
103
<class 'tensorflow.python.framework.ops.EagerTensor'>