# Convert Trifeature HDF5 files to pickled pandas DataFrames, CSV files and Apache Parquet files

This conversion isn't strictly necessary, but it comes in handy for quickly reading the data for scikit-learn, plotting, etc.

In [1]:
# Parameter grid for the conversion: the parallel cell further down iterates
# over every combination of these values and hands each one to `hfd52others`.
# NOTE(review): the units/meaning of the values are not visible in this
# notebook; they are forwarded as-is to `filename_for` — confirm in `helpers`.
distances = [15]
doubleplanes = [12, 30]
energies = [600]
erels = [500]
neutrons = [1, 2, 3, 4, 5, 6]
physicss = ["inclxx"]
subruns = range(20)  # subrun indices 0..19

In [2]:
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import sys
import joblib

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


In [3]:
def hfd52others(distance, doubleplane, energy, erel, neutron, physics, subrun):
    """Re-export one trifeature HDF5 file as pickle, CSV and gzip-compressed Parquet.

    All seven arguments are forwarded, unchanged and in this order, to
    ``filename_for`` to build the input path (``trifeature.h5``) and the
    three output paths (``.pkl``, ``.parquet``, ``.csv``).

    The ``"data"`` dataset of the input file is loaded into a DataFrame,
    labelled with six fixed column names and written out three times.
    Returns ``None``; the function is used purely for its file side effects.
    """
    run_key = (distance, doubleplane, energy, erel, neutron, physics, subrun)
    source_path, pickle_path, parquet_path, csv_path = (
        filename_for(*run_key, suffix)
        for suffix in (
            "trifeature.h5",
            "trifeature.pkl",
            "trifeature.parquet",
            "trifeature.csv",
        )
    )

    with h5py.File(source_path, "r") as h5in:
        # The dataset is stored as a bare array rather than a pandas-native
        # layout, so it is wrapped in a DataFrame and labelled by hand.
        frame = pd.DataFrame(np.array(h5in["data"]))
        frame.columns = ["nPN", "nPP", "nPH", "nHits", "nClus", "Edep"]

        # Same content, three on-disk formats (pandas defaults for pickle/CSV).
        frame.to_pickle(pickle_path)
        frame.to_csv(csv_path)
        pq.write_table(pa.Table.from_pandas(frame), parquet_path, compression="gzip")

In [4]:
# Lazily describe one conversion job per point of the parameter grid.
# Arguments are passed positionally, in the order of the `hfd52others` signature.
conversion_jobs = (
    joblib.delayed(hfd52others)(distance, doubleplane, energy, erel, neutron, physics, subrun)
    for distance in distances
    for energy in energies
    for doubleplane in doubleplanes
    for neutron in neutrons
    for erel in erels
    for physics in physicss
    for subrun in subruns
)

# n_jobs=-1 lets joblib use every available CPU. `hfd52others` returns nothing,
# so the trailing semicolon just hides the resulting list of None values.
run_in_parallel = joblib.Parallel(n_jobs=-1, backend="multiprocessing")
run_in_parallel(conversion_jobs);