In [4]:
import os

import h5py
import numpy as np
import pandas as pd

Load the AMS02 .dat files and compare them with test.h5. The goal is to prepare test .dat files that can be used for HMC sampling.

In [5]:
# Hardcoded model choices.
# Names of the 8 model inputs. X_MIN, X_MAX and X_RANGE below hold one entry per name, in this order.
INPUTS = ['alpha', 'cmf', 'cpa', 'pwr1par', 'pwr1perr', 'pwr2par', 'pwr2perr', 'vspoles']
# These are hardcoded for the min-max transforms. Used for both polarities.
X_MIN = np.array([0.,  2.5, 100., 0.4, 0.4, 0.4, 0.4, 400.]) 
X_MAX = np.array([85., 9.5, 870., 1.7, 1.7, 2.3, 2.3, 700.])
# X_RANGE equals X_MAX - X_MIN up to float rounding (hence 1.3000001).
X_RANGE = np.array([ 85., 7., 770., 1.3000001, 1.3000001, 1.9, 1.9, 300.])
# X_MIN,X_MAX,X_RANGE = get_minmax_params(get_attributes(infile))
# These are selected from above and hardcoded.
# NOTE(review): PARAMETERS lists pwr2par before pwr1perr, but PARAMETERS_MAX
# ([870, 1.7, 1.7, 2.3, 2.3]) matches the INPUTS order
# (cpa, pwr1par, pwr1perr, pwr2par, pwr2perr) according to X_MAX above.
# Confirm which ordering the 5-parameter vectors actually use.
PARAMETERS = ['cpa', 'pwr1par', 'pwr2par', 'pwr1perr', 'pwr2perr'] 
PARAMETERS_MIN = np.array([100., 0.4, 0.4, 0.4, 0.4]) 
PARAMETERS_MAX = np.array([870., 1.7, 1.7, 2.3, 2.3]) 
# These parameters don't include (alpha, cmf, vspoles), which we specify separately.
PARAMETERS_SPECIFIED = ['alpha', 'cmf', 'vspoles']
PARAMETERS_SPECIFIED_MIN = np.array([0.,  2.5, 400.])
PARAMETERS_SPECIFIED_MAX = np.array([85., 9.5, 700.])
# Calculated from positive file, used for both.
Y_LOG_MAX = 8.815241

NN_SPLIT_SEED = 36 # Random seed for splitting data into train/test sets for reproducibility. Buffer size must be the same as num_samples.

# Now there are only 32 rigidity values.
# They could be read from the HDF5 file instead (kept here for reference):
# filename = f'{path}/{polarity}/model_collection_1AU_90deg_0deg_fixed.h5'
# with h5py.File(filename,'r') as f:
#     RIGIDITY_VALS = f['/info/rigidity'][:]
# Hardcoded copy of the 32 rigidity values, spanning 0.2 to 200.
RIGIDITY_VALS = np.array(
      [  0.2       ,   0.20217378,   0.20659248,   0.21340226,
         0.22525435,   0.24034894,   0.25924241,   0.28573246,
         0.31835225,   0.35855115,   0.41265203,   0.48007787,
         0.56459135,   0.67849465,   0.82423959,   1.0121744 ,
         1.27012627,   1.61114001,   2.06592477,   2.70698217,
         3.54695997,   4.80077869,   6.49781115,   8.98694648,
        12.70126265,  17.95071033,  25.92424111,  37.84646324,
        56.45913512,  85.14084907, 129.78860616, 200.])

In [6]:
def _get_transform_params(X):
    """
    Helper function for calculating min max.

    Selects the (MIN, MAX) bounds whose length matches the number of features
    in X. The choice is made purely from the size of the last axis:

    * len(INPUTS)               -> (X_MIN, X_MAX)
    * len(PARAMETERS)           -> (PARAMETERS_MIN, PARAMETERS_MAX)
    * len(PARAMETERS_SPECIFIED) -> (PARAMETERS_SPECIFIED_MIN, PARAMETERS_SPECIFIED_MAX)

    Args:
        X: 1-D (a single sample) or 2-D (samples x features) array-like
           object exposing `.ndim` and `.shape`.

    Returns:
        Tuple (MIN, MAX) of 1-D numpy arrays.

    Raises:
        ValueError: if X is not 1-D or 2-D, or its feature count matches none
            of the known parameter sets.
    """
    n_inputs = len(INPUTS)
    n_parameters = len(PARAMETERS)
    n_specified = len(PARAMETERS_SPECIFIED)
    # Dispatch is by feature count only, so the three counts must all differ.
    assert len({n_inputs, n_parameters, n_specified}) == 3, \
        'INPUTS, PARAMETERS and PARAMETERS_SPECIFIED must have distinct lengths'

    if X.ndim not in (1, 2):
        raise ValueError(f'Expected a 1-D or 2-D array, got ndim={X.ndim}')

    # For 1-D input this is len(X); for 2-D input it is X.shape[1].
    n_features = X.shape[-1]
    if n_features == n_inputs:
        # Full set of inputs.
        return (X_MIN, X_MAX)
    if n_features == n_parameters:
        # Assume specified parameters have already been specified separately.
        return (PARAMETERS_MIN, PARAMETERS_MAX)
    if n_features == n_specified:
        # Assume other parameters have already been specified separately.
        return (PARAMETERS_SPECIFIED_MIN, PARAMETERS_SPECIFIED_MAX)
    raise ValueError(
        f'Cannot infer parameter set from {n_features} features; '
        f'expected {n_inputs}, {n_parameters} or {n_specified}'
    )
    

def transform_input(X):
    '''
    Map parameters from physical units into min-max scaled space.

    The bounds are looked up from the number of features in X, so the same
    call works for the full input vector or for a subset of parameters that
    is handled separately.
    '''
    lower, upper = _get_transform_params(X)
    span = upper - lower
    return (X - lower) / span


def untransform_input(X):
    '''
    Map parameters from min-max scaled space back to physical units.

    Inverse of transform_input: the bounds are looked up from the number of
    features in X, so the same call works for the full input vector or for a
    subset of parameters that is handled separately.
    '''
    lower, upper = _get_transform_params(X)
    span = upper - lower
    return X * span + lower

def load_data_ams(filename, integrate=False):
    """ Load AMS data from Claudio. Each file contains measurements over a certain time interval.

    Args:
        filename = Filename of observations (str or path-like).
                   Original dataset was '../data/BR2461.dat'
                   New datasets are in '../data/oct2022/'
                   New yearly datasets are in '../data/2024/yearly'
        integrate = If True, integrate over bin regions, so return r1, r2
                Otherwise, interpolate flux at the geometric mean of the bin and return bin_midpoints

    Returns:
        Tuple (bins, x, observed, uncertainty) where
        bins        = lower edge of every row followed by the upper edge of the last row
        x           = list of (r1, r2) pairs if integrate, else the geometric-mean bin midpoints
        observed    = column 2 of the file (flux)
        uncertainty = column 3 of the file (error)
    """
    # Columns: Rigidity1, Rigidity2, Flux, Error (yearly files carry an extra dataset column, ignored here).
    # ndmin=2 keeps a single-row file two-dimensional so the column indexing below still works.
    dataset_ams = np.loadtxt(filename, usecols=(0, 1, 2, 3), ndmin=2)

    # str() so that pathlib.Path arguments work as well as plain strings.
    if 'yearly' in str(filename):
        # Need to sort yearly datasets by r1
        sort_indices = np.argsort(dataset_ams[:, 0])
        dataset_ams = dataset_ams[sort_indices, :]

    r1, r2 = dataset_ams[:, 0], dataset_ams[:, 1]
    # Bin edges: every lower edge plus the final upper edge.
    bins = np.concatenate([r1, r2[-1:]])
    observed = dataset_ams[:, 2]   # Observed Flux
    uncertainty = dataset_ams[:, 3]
    assert len(bins) == len(observed) + 1, 'expected one more bin edge than flux values'

    if integrate:
        # Materialize the pairs: a bare zip() is exhausted after a single pass.
        return bins, list(zip(r1, r2)), observed, uncertainty

    # bin_midpoints = (r1 + r2)/2  # Arithmetic mean
    bin_midpoints = (r1 * r2) ** 0.5  # Geometric mean seemed to work better in exp.
    return bins, bin_midpoints, observed, uncertainty

def load_dataset(polarity, data_path='/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/shuffled_may2025'):
    """Load the test split stored in `{data_path}/{polarity}/test.h5`.

    8 input parameters for the NN: alpha, cmf, vspoles, cpa, pwr1par, pwr2par, pwr1perr, and pwr2perr.
    features = ['alpha', 'cmf', 'cpa', 'pwr1par', 'pwr1perr', 'pwr2par', 'pwr2perr', 'vspoles']

    Args:
        polarity: Sub-directory name under data_path (the notebook uses 'neg').
        data_path: Directory holding one sub-directory per polarity.

    Returns:
        Tuple (x_test, y_test): the 'X_minmax' and 'Y_log_scaled' datasets as
        2-D numpy arrays. Earlier versions loaded these arrays but returned
        nothing.

    Raises:
        ValueError: if either dataset is not 2-D.
    """
    test_file = f'{data_path}/{polarity}/test.h5'

    # Load test data. Read both arrays inside the context manager so the
    # HDF5 handle is closed as soon as the data is in memory.
    with h5py.File(test_file, 'r') as h5:
        x_test = h5['X_minmax'][:]
        y_test = h5['Y_log_scaled'][:]

    # Both datasets are expected to be (samples, features); unpacking enforces that.
    num_test_samples, num_inputs = x_test.shape
    _, num_flux = y_test.shape
    x_test = x_test.reshape(num_test_samples, num_inputs)
    y_test = y_test.reshape(num_test_samples, num_flux)
    return x_test, y_test

Check .dat file

In [7]:
# Inspect one AMS02 observation file to see the column layout the .dat format uses.
dat_file = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/GalacticCosmicRays/data/oct2022/AMS02_H-PRL2018/AMS02_H-PRL2018_20110611-20110707.dat'
dataset_ams = np.loadtxt(dat_file, usecols=(0,1,2,3)) # Columns: Rigidity1, Rigidity2, Flux, Error
# Note: [:-5] prints every row except the last five.
print(dataset_ams[:-5])

[[1.000000e+00 1.160000e+00 8.981000e+02 4.324591e+01]
 [1.160000e+00 1.330000e+00 8.547000e+02 3.231748e+01]
 [1.330000e+00 1.510000e+00 8.042000e+02 2.444688e+01]
 [1.510000e+00 1.710000e+00 7.392000e+02 2.018267e+01]
 [1.710000e+00 1.920000e+00 6.659000e+02 1.640853e+01]
 [1.920000e+00 2.150000e+00 5.835000e+02 1.308014e+01]
 [2.150000e+00 2.400000e+00 5.052000e+02 1.059859e+01]
 [2.400000e+00 2.670000e+00 4.320000e+02 8.527602e+00]
 [2.670000e+00 2.970000e+00 3.669000e+02 6.780855e+00]
 [2.970000e+00 3.290000e+00 3.100000e+02 5.445181e+00]
 [3.290000e+00 3.640000e+00 2.596000e+02 4.424930e+00]
 [3.640000e+00 4.020000e+00 2.157000e+02 3.690528e+00]
 [4.020000e+00 4.430000e+00 1.786000e+02 2.956349e+00]
 [4.430000e+00 4.880000e+00 1.473000e+02 2.412468e+00]
 [4.880000e+00 5.370000e+00 1.201000e+02 1.813836e+00]
 [5.370000e+00 5.900000e+00 9.757000e+01 1.474381e+00]
 [5.900000e+00 6.470000e+00 7.934000e+01 1.178813e+00]
 [6.470000e+00 7.090000e+00 6.429000e+01 9.460445e-01]
 [7.090000

Create y test .dat file and test_specified_parameters.csv file

In [8]:
# 8 input parameters for the NN: alpha, cmf, vspoles, cpa, pwr1par, pwr2par, pwr1perr, and pwr2perr.
# features = ['alpha', 'cmf', 'cpa', 'pwr1par', 'pwr1perr', 'pwr2par', 'pwr2perr', 'vspoles']
h5_file = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/shuffled_may2025/neg/test.h5'

# Load test data. Both arrays are read inside the context manager so the
# HDF5 handle is closed once the data is in memory.
with h5py.File(h5_file, 'r') as h5:
    x_test = h5['X_minmax'][:]
    y_test = h5['Y_log_scaled'][:]
# Both datasets are expected to be (samples, features); unpacking enforces that.
num_test_samples, num_inputs = x_test.shape
_, num_flux = y_test.shape

# get alpha, cmf, and vspoles (0, 1, and 7 index)
specified_parameters = x_test[:, [0, 1, 7]]

print(f'specified_parameters shape: {specified_parameters.shape}, x_test shape: {x_test.shape}, y_test shape: {y_test.shape}')
print('---')
# These are still the min-max scaled values, so label them accordingly.
print(f'First 10 samples of specified_parameters:\n{specified_parameters[:10]}\n')
print(f'First 10 samples of x_test:\n{x_test[:10]}\n')
print(f'First 10 samples of y_test:\n{y_test[:10]}')
print('---')

# Undo scaling and log transforms
# Calculated from positive file, used for both.
Y_LOG_MAX = 8.815241
# From here on y_test holds the unscaled values; later cells rely on that.
y_test = y_test * Y_LOG_MAX # Undo max scaling.
y_test = np.exp(y_test) # Undo log transform of target output.

specified_parameters_unscaled = untransform_input(specified_parameters)
x_test_unscaled = untransform_input(x_test)

print(f'specified_parameters_unscaled shape: {specified_parameters_unscaled.shape}, y_test shape: {y_test.shape}')
print('---')
print(f'First 10 samples of specified_parameters_unscaled:\n{specified_parameters_unscaled[:10]}\n')
print(f'First 10 samples of x_test_unscaled:\n{x_test_unscaled[:10]}\n')
print(f'First 10 samples of y_test_unscaled:\n{y_test[:10]}')
print('---')

specified_parameters shape: (198766, 3), x_test shape: (198766, 8), y_test shape: (198766, 32)
---
First 10 samples of specified_parameters_unscaled:
[[0.         0.85714287 0.33333334]
 [0.88235295 0.5714286  0.        ]
 [0.3529412  0.5714286  0.6666667 ]
 [0.29411766 0.5714286  0.6666667 ]
 [0.         1.         0.        ]
 [0.         0.5714286  1.        ]
 [0.29411766 0.85714287 1.        ]
 [0.47058824 0.5        0.        ]
 [0.         0.2857143  0.        ]
 [0.29411766 0.71428573 0.6666667 ]]

First 10 samples of x_test:
[[0.         0.85714287 0.37662336 0.69230765 0.         0.
  0.4736842  0.33333334]
 [0.88235295 0.5714286  1.         0.46153846 0.2307692  0.
  0.3157895  0.        ]
 [0.3529412  0.5714286  0.06493507 0.46153846 0.2307692  0.15789473
  0.6842106  0.6666667 ]
 [0.29411766 0.5714286  1.         0.2307692  0.46153846 0.15789473
  0.15789473 0.6666667 ]
 [0.         1.         0.         0.69230765 0.46153846 0.6842106
  0.4736842  0.        ]
 [0.        

In [39]:
# Each row of y_test holds 32 flux values.
# Need to create a .dat version of y_test with 3 columns: Rigidity1, Rigidity2, Flux
# The bins are built from 33 edges evenly spaced in log space between 0.2 and 200 GV.
# NOTE(review): RIGIDITY_VALS in the config cell is not evenly spaced in log space
# (0.2, 0.2022, 0.2066, ...), so these bins may not line up with the rigidities
# the fluxes belong to. Confirm before using the files.
rigidity_list = np.logspace(np.log10(0.2), np.log10(200), 33)
print(f'Rigidity list: {rigidity_list}')
r1 = rigidity_list[:-1]
r2 = rigidity_list[1:]
print(len(r1), len(r2), len(y_test[0]))  # All three should be 32
assert len(r1) == len(r2) == len(y_test[0]), 'one (r1, r2) bin is needed per flux value'
print(r1)
print(r2)
print(y_test[2][:])

# Create .dat files: one file per test sample, one "r1 r2 flux" line per bin.
output_dir = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/shuffled_may2025/neg/dat_files'
# open(..., 'w') does not create missing directories.
os.makedirs(output_dir, exist_ok=True)
for i in range(len(y_test)):
    # output_dat_file stays bound to the last file written; a later cell reads it back.
    output_dat_file = f'{output_dir}/test_neg_r1r2flux_sample{i}.dat'
    with open(output_dat_file, 'w') as f:
        for j in range(len(r1)):
            f.write(f'{r1[j]} {r2[j]} {y_test[i][j]}\n')

Rigidity list: [2.00000000e-01 2.48187552e-01 3.07985305e-01 3.82190595e-01
 4.74274741e-01 5.88545435e-01 7.30348255e-01 9.06316728e-01
 1.12468265e+00 1.39566117e+00 1.73192865e+00 2.14921566e+00
 2.66704286e+00 3.30963420e+00 4.10705005e+00 5.09659350e+00
 6.32455532e+00 7.84837952e+00 9.73935050e+00 1.20859278e+01
 1.49978842e+01 1.86114408e+01 2.30956397e+01 2.86602514e+01
 3.55655882e+01 4.41346814e+01 5.47683927e+01 6.79641666e+01
 8.43393007e+01 1.04659823e+02 1.29876326e+02 1.61168438e+02
 2.00000000e+02]
32 32 32
[  0.2          0.24818755   0.30798531   0.38219059   0.47427474
   0.58854544   0.73034825   0.90631673   1.12468265   1.39566117
   1.73192865   2.14921566   2.66704286   3.3096342    4.10705005
   5.0965935    6.32455532   7.84837952   9.7393505   12.0859278
  14.99788419  18.61144082  23.09563969  28.6602514   35.5655882
  44.13468138  54.76839269  67.96416658  84.33930069 104.65982294
 129.87632632 161.16843755]
[  0.24818755   0.30798531   0.38219059   0.47427

KeyboardInterrupt: 

In [26]:
# Create .csv with specified_parameters_unscaled
# One row per test sample, one column per name in PARAMETERS_SPECIFIED.
# pandas is imported in the notebook's import cell.
output_csv_file = '/home/linneamw/sadow_koastore/personal/linneamw/research/gcr/data/shuffled_may2025/neg/test_neg_specparams.csv'
df = pd.DataFrame(specified_parameters_unscaled, columns=PARAMETERS_SPECIFIED)
print(df.head())
# index=False: keep only the parameter columns in the file.
df.to_csv(output_csv_file, index=False)

       alpha  cmf     vspoles
0   0.000000  8.5  500.000003
1  75.000001  6.5  400.000000
2  30.000001  6.5  600.000006
3  25.000001  6.5  600.000006
4   0.000000  9.5  400.000000


In [31]:
# Read the CSV back to check the round trip.
# Note: this rebinds specified_parameters to the values stored in the CSV,
# which was written from specified_parameters_unscaled.
specified_parameters = pd.read_csv(output_csv_file).values
print(f'specified_parameters from csv: {specified_parameters[0]}')
print(f'specified_parameters from csv: {specified_parameters[:5]}')

specified_parameters from csv: [  0.           8.50000006 500.00000298]
specified_parameters from csv: [[  0.           8.50000006 500.00000298]
 [ 75.0000006    6.50000018 400.        ]
 [ 30.00000075   6.50000018 600.00000596]
 [ 25.00000104   6.50000018 600.00000596]
 [  0.           9.5        400.        ]]


In [38]:
# Read back a generated .dat file as a sanity check. output_dat_file is whatever
# path the write loop assigned last.
dataset_ams = np.loadtxt(output_dat_file, usecols=(0,1,2)) # Rigidity1, Rigidity2, Flux
print(dataset_ams[:40])

[[2.00000000e-01 2.48187552e-01 7.22679825e+01]
 [2.48187552e-01 3.07985305e-01 7.44473343e+01]
 [3.07985305e-01 3.82190595e-01 7.89810410e+01]
 [3.82190595e-01 4.74274741e-01 8.62369690e+01]
 [4.74274741e-01 5.88545435e-01 9.96225433e+01]
 [5.88545435e-01 7.30348255e-01 1.17997749e+02]
 [7.30348255e-01 9.06316728e-01 1.42931381e+02]
 [9.06316728e-01 1.12468265e+00 1.81048721e+02]
 [1.12468265e+00 1.39566117e+00 2.31991699e+02]
 [1.39566117e+00 1.73192865e+00 2.98709595e+02]
 [1.73192865e+00 2.14921566e+00 3.90785461e+02]
 [2.14921566e+00 2.66704286e+00 5.01272675e+02]
 [2.66704286e+00 3.30963420e+00 6.22025757e+02]
 [3.30963420e+00 4.10705005e+00 7.41789124e+02]
 [4.10705005e+00 5.09659350e+00 8.22874939e+02]
 [5.09659350e+00 6.32455532e+00 8.37724670e+02]
 [6.32455532e+00 7.84837952e+00 7.71973022e+02]
 [7.84837952e+00 9.73935050e+00 6.42598083e+02]
 [9.73935050e+00 1.20859278e+01 4.87213318e+02]
 [1.20859278e+01 1.49978842e+01 3.34340363e+02]
 [1.49978842e+01 1.86114408e+01 2.154285