In [1]:
import h5py
import numpy as np
import pandas as pd

In [2]:
beam_energy = 600
num_dp = 30

with h5py.File(f"data/{beam_energy}AMeV_{num_dp}dp.bars.h5", "r") as h5in:
    # Note: Data in HDF5 file is not in the native Pandas Dataframe format,
    # so it is materialised as a NumPy array before wrapping it.
    multiplicity = pd.DataFrame(np.array(h5in["multiplicity"]))
    multiplicity.columns = ["nPN", "nPP", "nPH"]

    print(len(h5in["multiplicity"]))

    # `np.int` was removed in NumPy 1.24; permutation(n) already returns a
    # shuffled integer index array covering range(n), so no arange is needed.
    idx = np.random.permutation(len(multiplicity))
    print(idx)
    # Sanity check: a single row can be fetched by a shuffled index.
    print(h5in["multiplicity"][idx[0]])

5000000
[1790648 4994167 1867668 ... 2638666  972129 4654991]
[2 2 2]


In [5]:
def shuffle_hdf5(infile, outfile, num_dp, seed=None):
    """Write a row-shuffled copy of the HDF5 file ``infile`` to ``outfile``.

    A single random permutation of the event indices is drawn and applied to
    the "flatfeatures", "consolidated", "multiplicity" and "primhitsbars"
    datasets, so row ``o`` of every output dataset comes from the same input
    row. That input row index is recorded in the extra "orgid" dataset.

    Parameters
    ----------
    infile
        Path of the HDF5 file to read. The number of events is taken from
        the length of its "multiplicity" dataset.
    outfile
        Path of the HDF5 file to create. It is opened in "w" mode, so an
        existing file at this path is overwritten.
    num_dp : int
        Determines the row widths of the output: ``num_bars = num_dp * 100``;
        "flatfeatures" rows have ``2 * num_bars`` columns and "primhitsbars"
        rows have ``num_bars`` columns.
    seed : optional
        When given, the permutation is drawn from
        ``np.random.default_rng(seed)`` so the shuffle is reproducible.
        When ``None`` (default) the global NumPy random state is used.

    Returns
    -------
    None
    """
    num_bars = num_dp * 100
    chunksize = 1  # each HDF5 chunk holds exactly one event row

    print(f"Reading from {infile}")
    with h5py.File(infile, "r") as h5in:
        num_events = len(h5in["multiplicity"])
        # `np.int` was removed in NumPy 1.24; permutation(n) directly yields
        # a shuffled integer index array over range(n).
        if seed is None:
            idx = np.random.permutation(num_events)
        else:
            idx = np.random.default_rng(seed).permutation(num_events)

        print(f"> Writing to {outfile}")
        with h5py.File(outfile, "w") as h5out:
            flatfeatures = h5out.create_dataset(
                "flatfeatures",
                shape=(num_events, num_bars * 2),
                dtype=np.float32,
                chunks=(chunksize, num_bars * 2),
                compression="gzip",
                compression_opts=9,
            )

            consolidated = h5out.create_dataset(
                "consolidated", shape=(num_events, 3), dtype=np.int16
            )

            multiplicity = h5out.create_dataset(
                "multiplicity", (num_events, 3), np.int8
            )

            primhitsbars = h5out.create_dataset(
                "primhitsbars",
                shape=(num_events, num_bars),
                dtype=np.int8,
                chunks=(chunksize, num_bars),
                compression="gzip",
                compression_opts=9,
            )

            orgid = h5out.create_dataset(
                "orgid", (num_events, 1), np.int32
            )

            # Resolve the source datasets once instead of doing four name
            # lookups per event inside the copy loop.
            src_flatfeatures = h5in["flatfeatures"]
            src_consolidated = h5in["consolidated"]
            src_multiplicity = h5in["multiplicity"]
            src_primhitsbars = h5in["primhitsbars"]

            # Copy row by row: output row `o` receives input row `i`.
            for o, i in enumerate(idx):
                flatfeatures[o] = src_flatfeatures[i]
                consolidated[o] = src_consolidated[i]
                multiplicity[o] = src_multiplicity[i]
                primhitsbars[o] = src_primhitsbars[i]
                orgid[o] = i

In [6]:
# Produce a shuffled copy of the input file for each num_dp setting.
for num_dp in (15, 30):
    inpfile = f"data/{beam_energy}AMeV_{num_dp}dp.bars.h5"
    outfile = f"data/{beam_energy}AMeV_{num_dp}dp.bars-shuffled.h5"
    shuffle_hdf5(infile=inpfile, outfile=outfile, num_dp=num_dp)

Reading from data/600AMeV_15dp.bars.h5
> Writing to data/600AMeV_15dp.bars-shuffled.h5
Reading from data/600AMeV_30dp.bars.h5
> Writing to data/600AMeV_30dp.bars-shuffled.h5
