In [1]:
import os
import pandas as pd
import numpy as np
import awkward0
import uproot3_methods

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')

In [3]:
def _transform(dataframe, start=0, stop=-1, jet_size=0.8):
    from collections import OrderedDict
    v = OrderedDict()

    df = dataframe.iloc[start:stop]
    def _col_list(prefix, max_particles=200):
        return ['%s_%d'%(prefix,i) for i in range(max_particles)]
    
    _px = df[_col_list('PX')].values
    _py = df[_col_list('PY')].values
    _pz = df[_col_list('PZ')].values
    _e = df[_col_list('E')].values
    
    mask = _e>0
    n_particles = np.sum(mask, axis=1)

    px = awkward0.JaggedArray.fromcounts(n_particles, _px[mask])
    py = awkward0.JaggedArray.fromcounts(n_particles, _py[mask])
    pz = awkward0.JaggedArray.fromcounts(n_particles, _pz[mask])
    energy = awkward0.JaggedArray.fromcounts(n_particles, _e[mask])

    p4 = uproot3_methods.TLorentzVectorArray.from_cartesian(px, py, pz, energy)

    print(p4)
    pt = p4.pt

    jet_p4 = p4.sum()

    # outputs
    _label = df['is_signal_new'].values
    v['label'] = np.stack((_label, 1-_label), axis=-1)
    v['train_val_test'] = df['ttv'].values
    
    v['jet_pt'] = jet_p4.pt
    v['jet_eta'] = jet_p4.eta
    v['jet_phi'] = jet_p4.phi
    v['jet_mass'] = jet_p4.mass
    v['n_parts'] = n_particles

    v['part_px'] = px
    v['part_py'] = py
    v['part_pz'] = pz
    v['part_energy'] = energy

    v['part_pt_log'] = np.log(pt)
    v['part_ptrel'] = pt/v['jet_pt']
    v['part_logptrel'] = np.log(v['part_ptrel'])

    v['part_e_log'] = np.log(energy)
    v['part_erel'] = energy/jet_p4.energy
    v['part_logerel'] = np.log(v['part_erel'])

    v['part_raw_etarel'] = (p4.eta - v['jet_eta'])
    _jet_etasign = np.sign(v['jet_eta'])
    _jet_etasign[_jet_etasign==0] = 1
    v['part_etarel'] = v['part_raw_etarel'] * _jet_etasign

    v['part_phirel'] = p4.delta_phi(jet_p4)
    v['part_deltaR'] = np.hypot(v['part_etarel'], v['part_phirel'])

    def _make_image(var_img, rec, n_pixels = 64, img_ranges = [[-0.8, 0.8], [-0.8, 0.8]]):
        wgt = rec[var_img]
        x = rec['part_etarel']
        y = rec['part_phirel']
        img = np.zeros(shape=(len(wgt), n_pixels, n_pixels))
        for i in range(len(wgt)):
            hist2d, xedges, yedges = np.histogram2d(x[i], y[i], bins=[n_pixels, n_pixels], range=img_ranges, weights=wgt[i])
            img[i] = hist2d
        return img

#     v['img'] = _make_image('part_ptrel', v)

    return v

In [7]:
def convert(source, destdir, basename, step=None, limit=None):
    df = pd.read_hdf(source, key='table')
    logging.info('Total events: %s' % str(df.shape[0]))
    if limit is not None:
        df = df.iloc[0:limit]
        logging.info('Restricting to the first %s events:' % str(df.shape[0]))
    if step is None:
        step = df.shape[0]
    idx=-1
    while True:
        idx+=1
        start=idx*step
        if start>=df.shape[0]: break
        if not os.path.exists(destdir):
            os.makedirs(destdir)
        output = os.path.join(destdir, '%s_%d.awkd'%(basename, idx))
        logging.info(output)
        if os.path.exists(output):
            logging.warning('... file already exist: continue ...')
            continue
        v=_transform(df, start=start, stop=start+step)
        awkward0.save(output, v, mode='x')

In [4]:
cd ..

/home/rd804/ParticleNet


In [5]:
srcDir = 'data'
destDir = 'data/converted'

In [12]:
# conver training file
convert(os.path.join(srcDir, 'train.h5'), destdir=destDir, basename='train_file')

[2023-04-07 05:37:00,741] INFO: Total events: 1211000
[2023-04-07 05:37:00,744] INFO: data/converted/train_file_0.awkd


[[TLorentzVector(x=-250.35, y=-223.65, z=-334.74, t=474.07) TLorentzVector(x=-48.866, y=-56.791, z=-71.025, t=103.24) TLorentzVector(x=-55.415, y=-49.969, z=-74.236, t=105.26) ... TLorentzVector(x=-0.88906, y=-0.71787, z=-0.7002, t=1.3402) TLorentzVector(x=-0.11787, y=-0.4955, z=-0.89844, t=1.0328) TLorentzVector(x=-0.29766, y=-0.10618, z=-0.28663, t=0.42665)] [TLorentzVector(x=120.06, y=76.852, z=-48.274, t=150.5) TLorentzVector(x=63.802, y=42.755, z=-29.455, t=82.257) TLorentzVector(x=36.763, y=26.991, z=-16.714, t=48.574) ... TLorentzVector(x=0.39235, y=0.25878, z=-0.077853, t=0.47642) TLorentzVector(x=0.41934, y=0.2065, z=-0.25419, t=0.53207) TLorentzVector(x=0.38646, y=0.059892, z=0.10168, t=0.40407)] [TLorentzVector(x=10.428, y=-147.57, z=203.56, t=251.65) TLorentzVector(x=10.718, y=-54.498, z=88.101, t=104.15) TLorentzVector(x=5.7241, y=-43.421, z=64.595, t=78.043) ... TLorentzVector(x=0.24462, y=-0.47074, z=0.77402, t=0.93837) TLorentzVector(x=0.27355, y=-0.36748, z=1.2391, t=1

  return self._trymemo("mass", lambda self: self.awkward0.numpy.sqrt(self.mag2))


In [8]:
# conver validation file
convert(os.path.join(srcDir, 'val.h5'), destdir=destDir, basename='val_file')

[2023-04-07 05:32:56,706] INFO: Total events: 403000
[2023-04-07 05:32:56,709] INFO: data/converted/val_file_0.awkd


[[TLorentzVector(x=85.228, y=-227.71, z=109.54, t=266.68) TLorentzVector(x=46.856, y=-124.28, z=59.789, t=145.65) TLorentzVector(x=24.653, y=-64.709, z=31.602, t=76.117) ... TLorentzVector(x=0.037146, y=-0.69501, z=0.34333, t=0.77607) TLorentzVector(x=0.3087, y=-0.36873, z=0.60691, t=0.77434) TLorentzVector(x=0.10099, y=-0.3093, z=0.46224, t=0.56527)] [TLorentzVector(x=82.459, y=303.01, z=-224.13, t=385.81) TLorentzVector(x=15.37, y=85.793, z=-62.898, t=107.48) TLorentzVector(x=14.435, y=54.47, z=-40.078, t=69.149) ... TLorentzVector(x=0.095928, y=0.34755, z=-0.28316, t=0.45845) TLorentzVector(x=0.22452, y=0.26034, z=-0.081911, t=0.3534) TLorentzVector(x=0.2848, y=0.19123, z=-0.11261, t=0.36106)] [TLorentzVector(x=-108.09, y=93.248, z=-22.613, t=144.54) TLorentzVector(x=-21.67, y=46.335, z=-49.186, t=70.963) TLorentzVector(x=-19.737, y=36.744, z=-37.797, t=56.287) ... TLorentzVector(x=-0.11548, y=0.49445, z=-0.017129, t=0.50804) TLorentzVector(x=-0.14214, y=0.31359, z=-0.65712, t=0.741

In [11]:
# conver testing file
convert(os.path.join(srcDir, 'test.h5'), destdir=destDir, basename='test_file')

[2023-04-07 05:35:45,944] INFO: Total events: 404000
[2023-04-07 05:35:45,947] INFO: data/converted/test_file_0.awkd


[[TLorentzVector(x=-172.34, y=110.13, z=-76.504, t=218.36) TLorentzVector(x=-111.32, y=93.168, z=-50.391, t=153.66) TLorentzVector(x=-56.524, y=46.127, z=-23.695, t=76.708) ... TLorentzVector(x=-0.89466, y=0.45467, z=-0.8631, t=1.3237) TLorentzVector(x=-0.59297, y=0.69404, z=-0.96427, t=1.3278) TLorentzVector(x=-0.44267, y=0.068935, z=-0.21699, t=0.49779)] [TLorentzVector(x=26.738, y=-91.614, z=76.382, t=122.24) TLorentzVector(x=17.645, y=-93.015, z=75.715, t=121.23) TLorentzVector(x=21.377, y=-68.011, z=55.618, t=90.42) ... TLorentzVector(x=0.32453, y=-0.32586, z=0.31468, t=0.55725) TLorentzVector(x=-0.12391, y=-0.38507, z=0.40464, t=0.57216) TLorentzVector(x=0.072476, y=-0.39063, z=-0.027319, t=0.39824)] [TLorentzVector(x=-97.906, y=79.641, z=-362.43, t=383.77) TLorentzVector(x=-54.921, y=37.994, z=-189.18, t=200.63) TLorentzVector(x=-33.829, y=23.793, z=-116.1, t=123.25) ... TLorentzVector(x=-0.19783, y=0.15493, z=-0.92937, t=0.96274) TLorentzVector(x=-0.11066, y=0.11556, z=-0.7897,

  return self._trymemo("mass", lambda self: self.awkward0.numpy.sqrt(self.mag2))
