# Convert cluster HDF5 files to pickled pandas DataFrames and Apache Parquet files

This conversion isn't strictly necessary, but it comes in handy for quickly reading the data for scikit-learn, plotting, etc.

In [1]:
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
def create_pandas(h5file):
    """Load the ``"clusters"`` entry of an HDF5 file into a DataFrame.

    The data in the HDF5 file is not in the native pandas DataFrame format,
    so it is first converted to a NumPy array and the column labels are
    attached afterwards.

    Parameters
    ----------
    h5file
        Object that supports ``h5file["clusters"]`` (for example an open
        ``h5py.File``); the looked-up value must be convertible by
        ``np.array``.

    Returns
    -------
    pandas.DataFrame
        The cluster data, labelled with the twelve column names below.
    """
    column_names = [
        "i_event",
        "prim",
        "T",
        "E",
        "Size",
        "EToF",
        "EnergyMoment",
        "TSpawn",
        "MaxEHit",
        "X",
        "Y",
        "Z",
    ]
    raw = np.array(h5file["clusters"])
    frame = pd.DataFrame(raw)
    # Labels are assigned after construction so they always rename the
    # existing columns positionally.
    frame.columns = column_names
    return frame

In [6]:
beam_energy = 600
for num_dp in (15, 30):
    # One input file and two output files, all sharing the same base name.
    h5file = f"data/{beam_energy}AMeV_{num_dp}dp.clusters.h5"
    pdfile = f"data/{beam_energy}AMeV_{num_dp}dp.clusters.pkl"
    pafile = f"data/{beam_energy}AMeV_{num_dp}dp.clusters.parquet"
    print(f"{h5file} -> {pdfile} + {pafile}")

    # Keep the HDF5 file open only while reading: create_pandas builds the
    # DataFrame from an in-memory NumPy array, so nothing below needs the
    # file handle.
    with h5py.File(h5file, "r") as h5in:
        df = create_pandas(h5in)

    # Write the same DataFrame twice: as a pickle and as a snappy-compressed
    # Parquet file.
    df.to_pickle(pdfile)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, pafile, compression="snappy")

data/600AMeV_15dp.clusters.h5 -> data/600AMeV_15dp.clusters.pkl + data/600AMeV_15dp.clusters.parquet
data/600AMeV_30dp.clusters.h5 -> data/600AMeV_30dp.clusters.pkl + data/600AMeV_30dp.clusters.parquet
