In [1]:
import numpy as np
import pandas as pd
import vaex as vpd

Reference dataset taken from https://zenodo.org/record/3981290#.YIgTCi9Q3xW.

Description:
* 13 TeV collision data simulated with pythia 8.183.
* wboson.txt contains events generated from a W' boson with a mass of 600 GeV, which decays 100% of the time to a W boson and a Z boson. The W boson is forced to decay hadronically and the Z boson decays into neutrinos.
* qstar.txt contains events generated from an excited quark q* with a mass of 600 GeV, which decays 100% of the time to a quark and a Z boson. The Z boson is forced to decay into neutrinos.
* Events are stored in plain-text format.
* Each line of a text file represents one event, which contains a variable number of detector-stable particles.
* Each particle contains 7 features in order: [px, py, pz, E, pdgID, is-from-W, is-in-leading-jet]. The first four features are the four-momentum of the particle, and pdgID is the PDG (Particle Data Group) ID number of the particle. is-from-W is 1 if the particle comes from the W boson and 0 otherwise. is-in-leading-jet is 1 if the particle is inside the leading jet reconstructed with the anti-kT jet algorithm (R=1.0).

In [2]:
# Parse the plain-text event file into `evt`.
# Layout: one event per line; each line is a flat, space-separated list of
# numbers, 7 consecutive values per particle (see the dataset description).
dir_path = '../data/external/'
fname = 'wboson'
fext = '.txt'
num_cols = 7  # features per particle, as specified in the description
col_names = ('px', 'py', 'pz', 'E', 'pdg', 'is_from_W', 'is_lead_jet')
evt = {}  # event number (0-based line index) -> {feature name: per-particle array}
with open(dir_path + fname + fext) as f:
    for i, line in enumerate(f):
        if not line.strip():
            # Skip blank lines (e.g. a stray empty line at the end of the file).
            # An event with no particles would turn into an all-NaN row once the
            # per-event arrays are exploded, and NaN cannot be cast to int32.
            # The event number still follows the line index, so numbering of
            # the remaining events is unaffected.
            continue
        data = np.fromstring(line, sep=' ')  # flattened data for this event
        if len(data) % num_cols != 0:
            # Fail with the offending line number instead of an opaque reshape error.
            raise ValueError(
                f'line {i + 1} of {fname + fext} holds {len(data)} values, '
                f'which is not a multiple of {num_cols}'
            )
        num_rows = len(data) // num_cols  # number of particles in this event
        data = data.reshape((num_rows, num_cols))
        # data.T has one row per feature, in the same order as col_names.
        evt[i] = dict(zip(col_names, data.T))

In [3]:
# One row per event (the keys of `evt` become the index); every cell holds the
# per-particle array of that event for the given feature.
df = pd.DataFrame.from_dict(data=evt, orient='index')

In [4]:
# Mirror the event number (the current row index) into a regular column.
df = df.assign(evt_num=df.index)

In [5]:
# Go from one row per event to one row per particle: explode every column's
# per-event array. The event number lives on the index, so it is repeated for
# each particle of the event.
df = df.set_index('evt_num').apply(pd.Series.explode)

In [6]:
# Give every column a compact, explicit dtype: 32-bit floats for the
# four-momentum, a 32-bit integer for the PDG ID, booleans for the two 0/1 flags.
df = df.astype({
    'px': np.float32,
    'py': np.float32,
    'pz': np.float32,
    'E': np.float32,
    'pdg': np.int32,
    'is_from_W': np.bool_,
    'is_lead_jet': np.bool_
})
# Name the vaex DataFrame after the file that was actually loaded (`fname`)
# rather than a hard-coded sample name, so the label cannot disagree with the data.
# The event-number index is copied into a regular 'evt_num' column.
vdf = vpd.from_pandas(df, name=fname, copy_index=True, index_name='evt_num')

In [None]:
# Write the per-particle table next to the source text file, reusing its base name.
hdf5_path = dir_path + fname + '.hdf5'
vdf.export_hdf5(hdf5_path)