In [1]:
import numpy as np
import h5py
from tqdm import tqdm
from numba import jit

Reference dataset taken from https://zenodo.org/record/3981290#.YIgTCi9Q3xW.

Description:
* 13 TeV collision data simulated with pythia 8.183.
* wboson.txt contains events generated from a W' boson with a mass of 600 GeV, which decays 100% of the time to a W boson and a Z boson. The W boson is forced to decay hadronically and the Z boson decays into neutrinos.
* qstar.txt contains events generated from an excited quark q* with a mass of 600 GeV, which decays 100% of the time to a quark and a Z boson. The Z boson is forced to decay into neutrinos.
* events in the text format
* each line in the text file represents one event, which contains a variable number of detector-stable particles.
* each particle contains 7 features in order: [px, py, pz, E, pdgID, is-from-W, is-in-leading-jet]. The first four features are the four-momentum of the particle, and pdgID is the PDG number of the particle. is-from-W is 1 if the particle comes from the W boson and 0 otherwise. is-in-leading-jet is 1 if the particle is inside the leading jet reconstructed with the anti-kT jet algorithm (R=1.0).

In [2]:
# Convert the plain-text event files into a single HDF5 file.
#
# Layout written to `out_path`:
#   /<process>                      group, attrs copied from `procs[<process>]`
#                                   plus 'num_evts'
#   /<process>/event_<NNNNNN>       group, attr 'num_pcls'
#       pmu          (num_pcls, 4)  float32  columns 0-3 of the event
#       pdg          (num_pcls,)    int32    column 4
#       is_signal    (num_pcls,)    bool     column 5
#       is_lead_jet  (num_pcls,)    bool     column 6
in_dir = '../data/external/'
# Per-process metadata; every key/value pair is stored verbatim as an HDF5
# attribute on the process group. The dict key doubles as the input file stem.
procs = {
    'wboson': {
        'in_pcls': [2212, 2212],
        'out_pcls': [23, 24],
        'signal_pcl': 24,
        'com_energy': 13.0e+3,
        'unit': 'GeV',
    },
    'qstar': {
        'in_pcls': [2212, 2212],
        'com_energy': 13.0e+3,
        'unit': 'GeV',
    }
}
in_ext = '.txt'
out_path = '../data/processed/events.hdf5'
num_cols = 7 # features per particle, as specified in the description


def _write_column(group, name, values, dtype):
    """Store `values` in `group` as an LZF-compressed, byte-shuffled dataset.

    Parameters
    ----------
    group : h5py.Group
        Event group that receives the dataset.
    name : str
        Dataset name inside `group`.
    values : numpy.ndarray
        Data to store; the dataset is created with exactly this shape.
    dtype : str
        On-disk dtype; `values` is converted to it on assignment.

    Returns
    -------
    h5py.Dataset
        The newly created and populated dataset.
    """
    dset = group.create_dataset(
        name=name,
        shape=values.shape,
        compression='lzf',
        shuffle=True,
        dtype=dtype)
    dset[...] = values
    return dset


with h5py.File(out_path, 'w', libver='latest') as f_out:
    for in_fname, meta in procs.items():
        parton = f_out.create_group(in_fname)
        for key, val in meta.items():
            parton.attrs[key] = val
        # Counted per file. Reading the loop variable after the loop would
        # raise NameError for an empty first file, or silently reuse the
        # previous file's count for an empty later file.
        num_evts = 0
        with open(in_dir + in_fname + in_ext, 'r') as f:
            for evt_num, line in enumerate(tqdm(f)):
                data = np.fromstring(line, sep=' ') # flattened data for evt
                # Fail with a message that points at the offending line
                # instead of an anonymous reshape error.
                if data.size % num_cols != 0:
                    raise ValueError(
                        f'{in_fname}{in_ext}, line {evt_num + 1}: expected a '
                        f'multiple of {num_cols} values, got {data.size}')
                num_pcls = data.size // num_cols
                data = data.reshape((num_pcls, num_cols))
                evt_grp = parton.create_group(f'event_{evt_num:06}')
                evt_grp.attrs['num_pcls'] = num_pcls
                pmu = _write_column(evt_grp, 'pmu', data[:, :4], '<f')
                pdg = _write_column(evt_grp, 'pdg', data[:, 4], '<i4')
                is_signal = _write_column(
                    evt_grp, 'is_signal', data[:, 5].astype(np.bool_), '<?')
                is_lead = _write_column(
                    evt_grp, 'is_lead_jet', data[:, 6].astype(np.bool_), '<?')
                num_evts = evt_num + 1
        parton.attrs['num_evts'] = num_evts
        print(in_fname + ' conversion complete.')

100000it [04:57, 336.08it/s]


wboson conversion complete.


100000it [04:58, 334.48it/s]


qstar conversion complete.
