# Convert Trifeature HDF5 files to pickled Pandas DataFrames, CSV, and Apache Parquet files

This conversion isn't strictly necessary, but it comes in handy for quickly reading the data for scikit-learn, plotting, etc.

In [1]:
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def create_pandas(h5file, max_neutrons=5, random_state=None):
    """Load the ``"data"`` dataset of a Trifeature HDF5 file as a shuffled DataFrame.

    Args:
        h5file: Open ``h5py.File`` (or any mapping) with a ``"data"`` entry
            that converts to an array with exactly six columns; the column
            assignment below raises otherwise.
        max_neutrons: Rows whose ``nPN`` exceeds this value are dropped.
            Lower it to test predictions for scenarios with a limited
            number of neutrons. Defaults to 5, the previously hard-coded
            limit.
        random_state: Forwarded to ``DataFrame.sample`` to make the shuffle
            reproducible. ``None`` (default) keeps the shuffle unseeded.

    Returns:
        pandas.DataFrame with columns ``nPN, nPP, nPH, nHits, nClus, Edep``,
        filtered and in shuffled row order. Rows keep the index labels they
        had before filtering and shuffling.
    """
    # Note: Data in HDF5 file is not in the native Pandas Dataframe format
    data = pd.DataFrame(np.array(h5file["data"]))
    # Assigned after construction (not via ``columns=``) so the six columns
    # are renamed positionally whatever labels the array conversion produced.
    data.columns = ["nPN", "nPP", "nPH", "nHits", "nClus", "Edep"]
    # FIXME: the nPH <= nPN requirement shouldn't be necessary; only a few
    # rows violate it (431/1M)
    idx = (data["nPN"] <= max_neutrons) & (data["nPH"] <= data["nPN"])
    data = data[idx]
    # sample(frac=1) == shuffle
    return data.sample(frac=1, random_state=random_state)

In [3]:
beam_energy = 600
for num_dp in (15, 30):
    # One HDF5 input per num_dp value; the pickle, Parquet and CSV outputs
    # sit next to it under the same base name.
    h5file = f"data/{beam_energy}AMeV_{num_dp}dp.trifeature.h5"
    pdfile = f"data/{beam_energy}AMeV_{num_dp}dp.trifeature.pkl"
    pafile = f"data/{beam_energy}AMeV_{num_dp}dp.trifeature.parquet"
    csfile = f"data/{beam_energy}AMeV_{num_dp}dp.trifeature.csv"
    print(f"{h5file} -> {pdfile} + {pafile}")

    # create_pandas copies the dataset into memory, so the HDF5 handle is
    # only needed while the frame is being built.
    with h5py.File(h5file, "r") as h5in:
        df = create_pandas(h5in)

    # Write the same frame in all three formats.
    df.to_pickle(pdfile)
    df.to_csv(csfile)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, pafile, compression="gzip")

data/600AMeV_15dp.trifeature.h5 -> data/600AMeV_15dp.trifeature.pkl + data/600AMeV_15dp.trifeature.parquet
data/600AMeV_30dp.trifeature.h5 -> data/600AMeV_30dp.trifeature.pkl + data/600AMeV_30dp.trifeature.parquet
