# Convert Bars HDF5 files to pickled Pandas DataFrames and Apache Parquet files

This conversion isn't strictly necessary, but it comes in handy for quickly reading the data for scikit-learn, plotting, etc.

In [1]:
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def create_pandas(h5file, max_neutrons=5, random_state=None):
    """Build one shuffled DataFrame from the datasets of an open HDF5 file.

    The datasets ``"multiplicity"``, ``"consolidated"`` and ``"flatfeatures"``
    are each read completely into memory, converted to DataFrames and joined
    column-wise on their row index. Rows are then filtered and shuffled.

    Note: because every dataset is materialised as a NumPy array, very large
    inputs can raise ``MemoryError``.

    Args:
        h5file: Mapping-like object (e.g. an open ``h5py.File``) providing
            the three datasets listed above.
        max_neutrons: Inclusive upper bound on the ``nPN`` column; rows with
            a larger value are dropped. Useful to test predictions for
            scenarios with a limited number of neutrons.
        random_state: Forwarded to ``DataFrame.sample`` so the shuffle can be
            made reproducible. ``None`` keeps the non-deterministic default.

    Returns:
        A ``pandas.DataFrame`` with columns ``nPN, nPP, nPH, nHits, nClus,
        Edep`` followed by the (integer-labelled) flat feature columns, in
        shuffled row order. The original row index labels are preserved.
    """
    # Note: Data in HDF5 file is not in the native Pandas Dataframe format
    multiplicity = pd.DataFrame(
        np.array(h5file["multiplicity"]), columns=["nPN", "nPP", "nPH"]
    )
    consolidated = pd.DataFrame(
        np.array(h5file["consolidated"]), columns=["nHits", "nClus", "Edep"]
    )
    flatfeatures = pd.DataFrame(np.array(h5file["flatfeatures"]))
    data = multiplicity.join(consolidated).join(flatfeatures)

    # Limit the number of neutrons via max_neutrons.
    # FIXME: the nPH <= nPN cut shouldn't be necessary, few cases (431/5M)
    keep = (data["nPN"] <= max_neutrons) & (data["nPH"] <= data["nPN"])
    data = data[keep]

    # sample(frac=1) == shuffle
    return data.sample(frac=1, random_state=random_state)

In [3]:
# Convert the bars HDF5 file of each configuration to a pickle and a
# snappy-compressed Parquet file stored next to it.
beam_energy = 600
for num_dp in (15, 30):
    # All three files share the same path stem and differ only in extension.
    stem = f"data/{beam_energy}AMeV_{num_dp}dp.bars"
    h5file, pdfile, pafile = f"{stem}.h5", f"{stem}.pkl", f"{stem}.parquet"
    print(f"{h5file} -> {pdfile} + {pafile}")

    with h5py.File(h5file, mode="r") as h5in:
        df = create_pandas(h5in)
        # Pickle first, then write the same frame as a Parquet table.
        df.to_pickle(pdfile)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, pafile, compression="snappy")

data/600AMeV_15dp.bars.h5 -> data/600AMeV_15dp.bars.pkl + data/600AMeV_15dp.bars.parquet


  # This is added back by InteractiveShellApp.init_path()


data/600AMeV_30dp.bars.h5 -> data/600AMeV_30dp.bars.pkl + data/600AMeV_30dp.bars.parquet


MemoryError: Unable to allocate 112. GiB for an array with shape (5000000, 6000) and data type float32