In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# test GPU
# Import tensorflow and test if GPU is available
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# load data and convert them to awkward arrays using uproot
import uproot


# path to the signal and background files
path_sig = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*VBFH*.root"
path_bkg = "/scratch/ucjf-atlas/njsf164/data_higgs_root/*Ztt*.root"

# glob the files
import glob
files_sig = glob.glob(path_sig)
files_bkg = glob.glob(path_bkg)

print("Found", files_sig)


# define list of variable that we want to read from the files
variables_higgs = [  
    "tau_0_p4",
    "tau_1_p4",
    "ditau_deta","ditau_dphi","ditau_dr","ditau_higgspt","ditau_scal_sum_pt", #"ditau_mmc_mlm_m",
    "jet_0_p4",
    "jet_1_p4",
    "dijet_p4", # fixme add dEta
    "met_p4", 
    "n_jets","n_jets_30","n_jets_40","n_electrons","n_muons","n_taus",
]


import vector
import awkward as ak

# use uproot to convert the root files to awkward arrays
arrays = []
for files in [files_sig, files_bkg]:
    arrays.append([])
    for file in files:
        print("Reading file", file)
        f = uproot.open(file)['NOMINAL']
        data = f.arrays(variables_higgs, library="ak")
        arr = []
        for var in variables_higgs:
            if 'p4' in var:
                # We need to extract the 4-vector pt, eta, phi, mass
                p4 = vector.zip({'x':data[var]['fP']['fX'], 
                                'y':data[var]['fP']['fY'], 
                                'z':data[var]['fP']['fZ'],
                                't':data[var]['fE']})
                
                arr.append(p4.rho) # pt
                arr.append(p4.eta) # eta
                arr.append(p4.phi) # phi
                arr.append(p4.tau) # mass
            
            else:
                arr.append(data[var])

        arrays[-1].append(arr)    

print(len(arrays[0]))
print(len(arrays[1]))

    


2024-11-12 15:41:42.877421: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-12 15:41:42.889548: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-12 15:41:42.893167: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: []
Found ['/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root', '/scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125_tth30h20.PHYS.e8559_s4159_r15224_p6

2024-11-12 15:41:45.702460: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603424.PhPy8_VBFH125_ttlm15hp20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603425.PhPy8_VBFH125_ttl13l7.PHYS.e8559_s4159_r15224_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603423.PhPy8_VBFH125_ttlp15hm20.PHYS.e8559_s4162_r14622_p6284.smPre_n_0_HS.NOMINAL.root
Reading file /scratch/ucjf-atlas/njsf164/data_higgs_root/user.kilie.Htt_lh_02.mc23_13p6TeV.603422.PhPy8_VBFH125

In [2]:
# convert the awkward arrays to tensorflow tensors
tensors = []
labels = []
n_events = []
for i,arr_sample in enumerate(arrays):
    n_evt = 0
    tensors.append([])
    labels.append([])
    for j, arr_file in enumerate(arr_sample):
        tensors_var = []
        for arr_var in arr_file:
            tensor = tf.constant(ak.to_numpy(arr_var), dtype=tf.float32)
            tensors_var.append(tensor)

        # stack the tensors along the first axis
        tensor_stack = tf.stack(tensors_var, axis=1)
        tensors[-1].append(tensor_stack)
        n_evt += tensor_stack.shape[0]

        # add the label
        labels[-1].append(tf.constant(i==0, shape=(tensor.shape[0]), dtype=tf.int32)) # signal files go first, so i==0 is signal
        print(i, j, tensors[-1][-1].shape, labels[-1][-1].shape)
    n_events.append(n_evt)

print("Signal events:", n_events[0])
print("Background events:", n_events[1])


0 0 (195502, 36) (195502,)
0 1 (7156, 36) (7156,)
0 2 (209617, 36) (209617,)
0 3 (7282, 36) (7282,)
0 4 (210515, 36) (210515,)
0 5 (1844, 36) (1844,)
0 6 (195498, 36) (195498,)
0 7 (1878, 36) (1878,)
1 0 (1, 36) (1,)
1 1 (5619, 36) (5619,)
1 2 (1092, 36) (1092,)
1 3 (31486, 36) (31486,)
1 4 (5, 36) (5,)
1 5 (103, 36) (103,)
1 6 (26, 36) (26,)
1 7 (54, 36) (54,)
1 8 (73851, 36) (73851,)
1 9 (2694, 36) (2694,)
1 10 (13107, 36) (13107,)
1 11 (22, 36) (22,)
Signal events: 829292
Background events: 128060


In [4]:
# convert tensors to dataset
d = []
for tensors_sample, labels_sample in zip(tensors, labels):
    d.append([])
    for tensor_file, label_file in zip(tensors_sample, labels_sample):
        dataset = tf.data.Dataset.from_tensor_slices((tensor_file, label_file))
        d[-1].append(dataset)



# combine the datasets using the sanple_from_datasets method
# this will shuffle the events from all files into a single dataset
datasets = []
for i,datasets_sample in enumerate(d):
    weights = [ x.shape[0] / n_events[i] for x in tensors[i] ]
    dataset = tf.data.Dataset.sample_from_datasets(datasets_sample, weights=weights)
    datasets.append(dataset)

# combine the signal and background datasets
dataset = tf.data.Dataset.sample_from_datasets(datasets, weights=[0.5, 0.5], stop_on_empty_dataset=True)

# printout the shape of the first event
for x in dataset.take(5):
    print(x)


(<tf.Tensor: shape=(36,), dtype=float32, numpy=
array([ 4.75435562e+01,  1.62719214e+00,  5.07154644e-01,  2.33601554e-06,
        2.86771793e+01,  1.62386715e+00, -1.35885978e+00,  1.05658375e-01,
        3.32498550e-03,  1.86601448e+00,  1.86601734e+00,  7.98840256e+01,
        7.62207336e+01,  9.79067383e+01,  7.68746872e+01, -5.88225007e-01,
       -2.71475554e+00,  8.21009636e+00,  6.61246338e+01,  3.41954112e+00,
        1.86800742e+00,  4.54609299e+00,  9.46995926e+01,  3.01311755e+00,
        2.80362153e+00,  5.30990417e+02,  4.75952301e+01,  0.00000000e+00,
       -1.26162064e+00,  7.29064504e-03,  2.00000000e+00,  2.00000000e+00,
        2.00000000e+00,  0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
      dtype=float32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
(<tf.Tensor: shape=(36,), dtype=float32, numpy=
array([ 5.9510761e+01,  6.7389560e-01,  4.7589517e-01, -9.5367432e-07,
        3.2608398e+01,  3.3506560e-01, -1.0794258e+00,  5.1099772e-04,
        3.3882999e