# Notebook Dedicated to Build the Observables of Interest and Save a Dataset for ML

makes use of the data extracted from real_data_analysis notebook

In [1]:
import numpy as np
import pandas as pd

import seaborn as sn

import glob
import re

In [2]:
def alpha_num_sort(list_route):
    # Función auxiliar para extraer número de la ruta
    def extract_num(route):
        # Busca el primer número en el nombre del archivo
        match = re.search(r'(\d+)', route)
        return int(match.group(1)) if match else -1  # Si no hay número, devuelve -1

    # Ordena la lista usando el número extraído
    return sorted(list_route, key=extract_num)

# Load Data

Load the Data of each fragemented observables

### !! Before reading the Data and Saving the Dataset, define the cuts used to construct the pd.dataframe

In [32]:
en_inf_cut = 2.5
en_sup_cut = 12
posr_cut = 5000

In [33]:
main_dir = 'analysis_out/en_up_2_pt_5_MeV/'

observable_name = ['evtid', 'energy', 'posr', 'cos_alpha', 'hit_residual', 'clockCount50']

for var_i in observable_name:
    print(f'preparing observable {var_i}')
    #Prepare the empty arrays of the observables
    locals()[var_i] = np.array([])
    #create the file lists of the  fragemented observable in loop
    flist_var_i = alpha_num_sort(glob.glob(main_dir + var_i + '_*.npy'))

    for file_i in flist_var_i:
        locals()[var_i] = np.append(locals()[var_i], np.load(file_i))

#Prepare the empty arrays of the observables

preparing observable evtid
preparing observable energy
preparing observable posr
preparing observable cos_alpha
preparing observable hit_residual
preparing observable clockCount50


In [34]:
energy.shape

(41174,)

In [35]:
np.unique(energy).shape

(421,)

In [36]:
posr.shape

(41174,)

# Organize Data by events

1) Find where the evid breaks and extract the data within this interval as a single events;
2) Construct the normalized Δα distribution 10x10 and flat the image;
3) Extract only one energy and radial pos per events;

In [37]:
#evID index break list

data_break_i = [0]  #array whose elements are the index from where an event start and end.

N_data = len(evtid)

for i_dx in range(N_data - 1):
    if evtid[i_dx] != evtid[i_dx+1]:
        data_break_i.append(i_dx+1)

#add the last index to complete the data_break_i list
data_break_i.append(N_data - 1)

In [38]:
#empty list which will contain de non-repeated values of the observables for singles events
energy_ev = []
posr_ev = []
#Empty array to be filled with the 100 pixel values of the normalized Δα Images
pixels_ev = np.empty((0, 100))

N_terms = len(data_break_i)

for i_dx in range(N_terms - 2):
    init_i = data_break_i[i_dx]
    final_i = data_break_i[i_dx+1]

    #energy
    energy_ev_i = energy[init_i : final_i]
    energy_ev.append(energy_ev_i[0])

    #position
    posr_ev_i = posr[init_i : final_i]
    posr_ev.append(posr_ev_i[0])

    #Normalized Δα pixel values:
    #extract the time residuals and cos(α)
    time_residual_ev_i = hit_residual[init_i : final_i]
    cos_alpha_ev_i = cos_alpha[init_i : final_i]

    #Construct the 2D binned histogram
    bins = 10 #Image dim
    H_ev_i, _, _ = np.histogram2d(x = cos_alpha_ev_i, y = time_residual_ev_i, bins = [bins, bins])
    #print(H_ev_i)
    #Normalize the Histogram by NCounts in a slice of cos(α) along all the time residual
    sum_hit_ev_i = np.sum(H_ev_i, axis = 0)
    H_norm_ev_i = H_ev_i/sum_hit_ev_i
    H_norm_ev_i = np.nan_to_num(H_norm_ev_i, nan = 0.0)

    H_norm_flaten = H_norm_ev_i.reshape(1,100) #here we should have an array with 100 entries
    #print(H_norm_flaten0.shape)
    pixels_ev = np.append(pixels_ev, H_norm_flaten, axis = 0)

energy_ev = np.array(energy_ev)
posr_ev = np.array(posr_ev)

  H_norm_ev_i = H_ev_i/sum_hit_ev_i


In [40]:
energy_ev.shape

(421,)

In [41]:
posr_ev.shape

(421,)

In [42]:
pixels_ev.shape

(421, 100)

# Construct the Pandas Dataframe of Real Data

In [39]:
save = True
N_pixels = pixels_ev.shape[1]

for i in range(N_pixels):
    locals()['pixel_' + str(i)] = pixels_ev[:,i]

#Construct Data Frame with the pixel values
data = {}
df = pd.DataFrame(data)

for i in range(N_pixels):
    df['pixel_'+str(i)] = locals()['pixel_' +str(i)]

df['energy (MeV)'] = energy_ev
df['radius (mm)'] = posr_ev

if save:
    #save the pandas df
    df.to_csv(f'real_dataset_ML/real_dataset_E_{en_inf_cut}_{en_sup_cut}_MeV_R_{posr_cut}_mm.csv')

  df['energy (MeV)'] = energy_ev
  df['radius (mm)'] = posr_ev


In [31]:
df.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_92,pixel_93,pixel_94,pixel_95,pixel_96,pixel_97,pixel_98,pixel_99,energy (MeV),radius (mm)
0,0.0,0.0,0.0,0.2,0.0,0.1,0.058824,0.131579,0.148148,0.1,...,0.0,0.2,0.111111,0.0,0.0,0.105263,0.037037,0.133333,4.044874,2425.253906
1,0.0,0.25,0.0,0.0,0.117647,0.0,0.0,0.076923,0.107143,0.137931,...,0.125,0.0,0.117647,0.153846,0.16,0.076923,0.178571,0.068966,3.546911,3926.669922
2,0.0,0.0,0.0,0.2,0.083333,0.25,0.0,0.0,0.0,0.103448,...,0.142857,0.2,0.166667,0.25,0.166667,0.090909,0.0,0.034483,3.584208,1686.395874
3,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.08,0.0,0.032258,...,0.142857,0.111111,0.230769,0.238095,0.058824,0.24,0.178571,0.193548,4.000314,3871.861816
4,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.023256,0.054054,...,0.090909,0.125,0.277778,0.15,0.178571,0.111111,0.232558,0.189189,4.135279,3998.809082
