In [1]:
import json
import numpy as np
import os
import pandas as pd
import re

In [2]:
path_to_cellnumber_data = '../data/full_with_cellnumber/RFQopt_model_01_Dataset_05_217k-samples.txt'

# Read through a context manager so the file handle is closed as soon as the
# lines are in memory (the handle was previously left open for the whole session).
with open(path_to_cellnumber_data, 'r') as cellnumber_data_file:
    lines = cellnumber_data_file.readlines()

# Tokenize every line on runs of whitespace. The pattern is a raw string so that
# `\s` reaches the regex engine verbatim instead of being an invalid string escape.
# The final line of the file is dropped by the [:-1] slice
# (presumably a trailing non-data line -- TODO confirm against the data file).
lines_split = [re.split(r'\s+', s) for s in lines][:-1]

# Every remaining line, header included, must split into exactly 23 tokens.
assert all(len(tokens) == 23 for tokens in lines_split), \
    "expected every line to split into exactly 23 tokens"

In [3]:
# The first tokenized line is the header row; all following lines are data rows.
columns, data_raw = lines_split[0], lines_split[1:]

In [4]:
# Collect the values column by column; header tokens equal to '|' or ''
# are not data columns and are skipped.
data_dict = {
    name: [row[pos] for row in data_raw]
    for pos, name in enumerate(columns)
    if name != '|' and name != ''
}

# The tokens are still strings here, so convert every column to a numeric dtype.
data_df = pd.DataFrame(data_dict).apply(pd.to_numeric)

# Design variables (model inputs).
x_df = data_df[[
    'Bmax',
    'mX1', 'mX2',
    'mY1', 'mY2',
    'mtau1', 'mtau2',
    'PhiY1', 'PhiY2',
    'Phitau1', 'Phitau2',
    'mY3ref', 'PhiY3ref', 'Eref',
]]

# Objectives plus the cell count (model outputs).
y_df = data_df[[
    'Transmission',
    'Length',
    'Energy',
    '#Cells',
    'E_Long',
    'E_X',
    'E_Y',
]]

Reminder of column mappings:
```
DVAR1: Bmax  [ 8.5, 12.0 ]
DVAR2: mX1  [ 5, 140 ]
DVAR3: mX2  [ 15, 160 ]
DVAR4: mY1  [ 1.005, 1.7 ]
DVAR5: mY2  [ 1.055, 1.85 ]
DVAR6: mtau1  [ 1, 500 ]
DVAR7: mtau2  [ 1, 500 ]
DVAR8: PhiY1  [ -89.95, -30 ]
DVAR9: PhiY2  [ -87.45, -25 ]
DVAR10: Phitau1  [ 1, 500 ]
DVAR11: Phitau2  [ 1, 500 ]
DVAR12: mY3ref  [ 1.105, 2.0 ]
DVAR13: PhiY3ref  [ -84.95, -20 ]
DVAR14: Eref  [ 0.055, 0.075 ]


OBJ definitions:
----------------
OBJ1: transmission [%]
OBJ2: output energy [MeV]
OBJ3: RFQ length [cm]
OBJ4: longitudinal emittance [MeV*deg]
OBJ5: x-emittance [cm*mrad]
OBJ6: y-emittance [cm*mrad]
```

In [5]:
# Design variables are labelled by position: the column order of x_df matches
# the DVAR1..DVAR14 order listed in the reminder above.
x_column_mapping = {x_col: f'DVAR{i+1}' for i, x_col in enumerate(x_df.columns)}

# Objectives are labelled by column NAME rather than by position. The OBJ
# definitions in this notebook state OBJ2 = output energy and OBJ3 = RFQ length,
# but y_df lists 'Length' before 'Energy', so numbering the columns positionally
# would attach OBJ2 to the length and OBJ3 to the energy.
# '#Cells' is not an objective and keeps its name.
objective_labels = {
    'Transmission': 'OBJ1',
    'Energy': 'OBJ2',
    'Length': 'OBJ3',
    'E_Long': 'OBJ4',
    'E_X': 'OBJ5',
    'E_Y': 'OBJ6',
    '#Cells': '#Cells',
}

# Indexing (instead of .get) makes an unexpected column fail loudly with a KeyError.
y_column_mapping = {y_col: objective_labels[y_col] for y_col in y_df.columns}

In [6]:
# Replace the descriptive column names with the labels from the two mappings.
x_df = x_df.rename(x_column_mapping, axis='columns')
y_df = y_df.rename(y_column_mapping, axis='columns')

In [10]:
# save as json file
# Build one record (dict) per sample. The orient has to be spelled out as
# 'records': the abbreviated form 'r' was deprecated and later removed from pandas.
x_dict = x_df.to_dict(orient='records')
y_dict = y_df[[f"OBJ{i}" for i in range(1, 7)]].to_dict(orient='records')
numcells = y_df['#Cells']

# All three sequences must describe the same samples.
assert len(x_dict) == len(y_dict) == len(numcells)

# Pair the three sequences positionally with zip, so the result does not depend
# on the DataFrame index labels being 0..n-1. Keys are the sample positions as strings.
out_dict = {
    "samples": {
        f"{i}": {"dvar": dvar, "obj": obj, "numcells": cells}
        for i, (dvar, obj, cells) in enumerate(zip(x_dict, y_dict, numcells))
    }
}

In [17]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Handles NumPy integer, floating-point and boolean scalars as well as
    ``np.ndarray``; any other object is passed on to ``json.JSONEncoder``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Raises:
            TypeError: raised by the base class when ``obj`` is not a
                supported NumPy value.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # np.bool_ is not a subclass of Python's bool, so the json module
        # cannot serialize it without this conversion.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Serialize the assembled samples to disk; NpEncoder takes care of the
# NumPy values that the standard encoder does not accept.
output_path = '../data/full_with_cellnumber/dataset.json'
with open(output_path, mode='w') as json_file:
    json.dump(out_dict, json_file, indent=4, cls=NpEncoder)