In [None]:
# Parameter grid for this notebook. Every list below is iterated when the
# scalers are fitted and/or when the individual files are rescaled.
# NOTE(review): units/meaning of the numeric values are defined by
# helpers.filename_for and the simulation setup — confirm there.
distances = [15]
doubleplanes = [30]
energies = [600]
erels = [500]
# Also read as a global inside create_scalers when collecting training files.
neutrons = list(range(1, 7))
physicss = ["inclxx"]
# Only consumed by the final scaling loop; create_scalers uses its own fixed range.
subruns = range(0, 20)

In [2]:
import sys

import joblib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import sklearn.preprocessing

sys.path.append("..")
from helpers import filename_for

In [3]:
def create_scalers(distance, doubleplane, energy, erel, physics):
    """Fit the three MaxAbsScalers used for one simulation configuration.

    The training sample is the concatenation of the "bars.parquet" files of
    subruns 0-4 for every entry of the module-level ``neutrons`` list.

    Returns:
        tuple: ``(scaler_tri, scaler_e, scaler_t)`` where
            * ``scaler_tri`` is fitted per column on "nHits", "nClus", "Edep";
            * ``scaler_e`` is fitted on all even-labelled columns
              ("0", "2", ...) flattened into one single feature;
            * ``scaler_t`` is fitted on all odd-labelled columns
              ("1", "3", ...) flattened into one single feature.
    """
    # Column labels run from "0" to str(doubleplane * 100 * 2 - 1).
    # NOTE(review): presumably 100 bars per double plane with two values
    # per bar — confirm against the producer of bars.parquet.
    n_labels = doubleplane * 100 * 2
    even_cols = [str(label) for label in range(0, n_labels, 2)]
    odd_cols = [str(label) for label in range(1, n_labels, 2)]

    frames = []
    for n in neutrons:
        for subrun in range(5):
            path = filename_for(distance, doubleplane, energy, erel, n, physics, subrun, "bars.parquet")
            frames.append(pd.read_parquet(path))
    training = pd.concat(frames, ignore_index=True)

    # fit() returns the estimator itself, so construction and fitting can be chained.
    scaler_tri = sklearn.preprocessing.MaxAbsScaler().fit(training[["nHits", "nClus", "Edep"]])
    # reshape(-1, 1) pools every selected column into one feature, i.e. one
    # common scale factor is learned for the whole group of columns.
    scaler_e = sklearn.preprocessing.MaxAbsScaler().fit(training[even_cols].values.reshape(-1, 1))
    scaler_t = sklearn.preprocessing.MaxAbsScaler().fit(training[odd_cols].values.reshape(-1, 1))

    return (scaler_tri, scaler_e, scaler_t)


# One fitted scaler triple per configuration, keyed by the dash-joined
# "distance-doubleplane-energy-erel-physics" string.
scalers = dict(
    (
        "-".join(map(str, (distance, doubleplane, energy, erel, physics))),
        create_scalers(distance, doubleplane, energy, erel, physics),
    )
    for distance in distances
    for doubleplane in doubleplanes
    for energy in energies
    for erel in erels
    for physics in physicss
)

In [4]:
# Show the mapping from "distance-doubleplane-energy-erel-physics" keys to the
# fitted (scaler_tri, scaler_e, scaler_t) triples as a quick sanity check.
display(scalers)

{'15-30-600-500-inclxx': (MaxAbsScaler(), MaxAbsScaler(), MaxAbsScaler())}

In [5]:
def scalebar(distance, doubleplane, energy, erel, neutron, physics, subrun):
    """Scale one "bars.parquet" file and write it as "bars-scaled.parquet".

    The scalers are looked up in the module-level ``scalers`` dict under the
    key "distance-doubleplane-energy-erel-physics"; ``neutron`` and ``subrun``
    only select which file is processed.

    Steps:
        1. Rows with ``nHits == 0`` get "nPN", "nPP" and "nPH" set to 0.
        2. "nHits", "nClus", "Edep" are scaled column-wise with ``scaler_tri``.
        3. Even-labelled columns ("0", "2", ...) are scaled with ``scaler_e``
           and odd-labelled columns ("1", "3", ...) with ``scaler_t``.
        4. The result is written gzip-compressed to the output path.

    Returns:
        None. The only effect is the written parquet file.

    Raises:
        KeyError: if no scalers were created for this configuration.
    """
    inpfile = filename_for(distance, doubleplane, energy, erel, neutron, physics, subrun, "bars.parquet")
    outfile = filename_for(distance, doubleplane, energy, erel, neutron, physics, subrun, "bars-scaled.parquet")

    cols_tri = ["nHits", "nClus", "Edep"]
    cols_e = [str(i) for i in range(0, doubleplane * 100 * 2, 2)]
    cols_t = [str(i + 1) for i in range(0, doubleplane * 100 * 2, 2)]

    scaler_tri, scaler_e, scaler_t = scalers["-".join(str(k) for k in [distance, doubleplane, energy, erel, physics])]

    data = pd.read_parquet(inpfile)
    data.loc[data["nHits"] == 0, ["nPN", "nPP", "nPH"]] = 0
    data[cols_tri] = scaler_tri.transform(data[cols_tri])

    # scaler_e and scaler_t were fitted on a single flattened feature
    # (reshape(-1, 1)). Handing them the wide multi-column frame only worked
    # through broadcasting in old scikit-learn; current releases check the
    # feature count in transform() and raise ValueError. Flatten the same way
    # as during fitting, then restore the original 2-D layout.
    values_e = data[cols_e].values
    data[cols_e] = scaler_e.transform(values_e.reshape(-1, 1)).reshape(values_e.shape)
    values_t = data[cols_t].values
    data[cols_t] = scaler_t.transform(values_t.reshape(-1, 1)).reshape(values_t.shape)

    table = pa.Table.from_pandas(data)
    pq.write_table(table, outfile, compression="gzip")

In [6]:
# Lazily describe one scalebar call per parameter combination; nothing is
# executed until the generator is handed to the joblib runner below.
scale_tasks = (
    joblib.delayed(scalebar)(
        distance=distance,
        doubleplane=doubleplane,
        energy=energy,
        erel=erel,
        neutron=neutron,
        physics=physics,
        subrun=subrun,
    )
    for distance in distances
    for energy in energies
    for doubleplane in doubleplanes
    for neutron in neutrons
    for erel in erels
    for physics in physicss
    for subrun in subruns
)

# n_jobs=1 processes the files one after another.
scale_runner = joblib.Parallel(n_jobs=1, backend="multiprocessing")
scale_runner(scale_tasks)

print("Done")

Done
