# Notebook to Build a Filtered file.dat for Selectively Downloading RATDS Files from the Grid Server

In [1]:
import numpy as np
import re
import uproot
import os

# Create a Reduced File List to Download from the Grid

The idea is to read every entry of the file.dat (a plain-text file) and compare its runID and subrunID with those of the candidates. Only the lines of file.dat whose run and subrun match a candidate are saved.

## Filtering Function

In [2]:
def filter_filelist(in_file_dir, out_file_dir, candidate_list):
    """
    Copy to a new file only the lines of a file list that belong to candidates.

    Each line of the input file is expected to start with a file name that
    contains the run and subrun encoded as ``_r<digits>_s<digits>_``. A line is
    written to the output file, unchanged, when its (run, subrun) pair is one of
    the candidates. Lines whose first column does not match that pattern are
    skipped.

    Parameters:
    - in_file_dir: Directory and name of the file.dat
    - out_file_dir: Directory and name of the output file. Its parent directory
      is created when it does not exist yet.
    - candidate_list: Iterable of hashable (runID, subrunID) pairs of the
      candidates.

    Returns:
    - None. The number of matching lines is printed.

    Raises:
    - ValueError: if in_file_dir and out_file_dir are the same path, because
      opening the output for writing would erase the input before it is read.
      (Paths are compared after os.path.abspath; symlinks are not resolved.)
    """

    # A set gives constant-time membership tests for the (run, subrun) pairs.
    candidate_set = set(candidate_list)
    lines_found = 0

    # Captures the run ('_r' + digits) and the subrun ('_s' + digits); the
    # subrun digits must be followed by another underscore.
    pattern = re.compile(r'_r(\d+)_s(\d+)_')

    # Guard against truncating the input file through the 'w' open below.
    if os.path.abspath(in_file_dir) == os.path.abspath(out_file_dir):
        raise ValueError("in_file_dir and out_file_dir must be different files")

    # Make sure the destination folder exists before opening the output file.
    out_parent = os.path.dirname(out_file_dir)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    with open(in_file_dir, 'r') as f_in, open(out_file_dir, 'w') as f_out:
        for line in f_in:
            # The file name is the first tab-separated column of the line.
            filename = line.split('\t')[0]

            match = pattern.search(filename)
            if not match:
                continue

            # int() drops the zero padding (0000366261 -> 366261) so the pair
            # compares equal to the numeric IDs of the candidates.
            run_id = int(match.group(1))
            subrun_id = int(match.group(2))

            if (run_id, subrun_id) in candidate_set:
                f_out.write(line)
                lines_found += 1

    print(f"Proceso finalizado. Se encontraron {lines_found} archivos coincidentes.")

# Load and Select Data
Load the runID and subrunID of the candidates.
Load the energy and posr of the candidates to apply the energy and posr cuts.

In [4]:
# ------------ Load Data ------------
# NOTE(review): absolute, user-specific path; the cell only runs on a machine
# where this directory exists.
main_dir = '/home/joankl/data/solars/real_data/bisMSB/first_candidates/'

# For every analysis the four arrays are indexed together below, so they are
# assumed to be parallel (same length, one entry per candidate) - TODO confirm.
runID_analysis15 = np.load(main_dir + 'analysis15/resume_files/runID.npy')
subrunID_analysis15 = np.load(main_dir + 'analysis15/resume_files/subrunID.npy')
energy_analysis15 = np.load(main_dir + 'analysis15/resume_files/energy_corrected.npy')
posr_analysis15 = np.load(main_dir + 'analysis15/resume_files/posr_av.npy')


runID_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/runID.npy')
subrunID_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/subrunID.npy')
energy_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/energy_corrected.npy')
posr_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/posr_av.npy')


runID_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/runID.npy')
subrunID_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/subrunID.npy')
energy_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/energy_corrected.npy')
posr_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/posr_av.npy')

# ------------ Perform cuts ------------
# Keep candidates with energy >= en_cut and posr <= posr_cut.
# NOTE(review): units are not stated in this notebook (presumably MeV and mm) - confirm.
en_cut = 5
posr_cut = 5500

# The masks use en_cut (instead of a repeated literal 5) so that changing the
# threshold above really changes the selection.
condition_analysis15 = (energy_analysis15 >= en_cut) & (posr_analysis15 <= posr_cut)
condition_analysis15bMR = (energy_analysis15_bMR >= en_cut) & (posr_analysis15_bMR <= posr_cut)
condition_analysis20bMR = (energy_analysis20_bMR >= en_cut) & (posr_analysis20_bMR <= posr_cut)

runID_analysis15 = runID_analysis15[condition_analysis15]
subrunID_analysis15 = subrunID_analysis15[condition_analysis15]

runID_analysis15_bMR = runID_analysis15_bMR[condition_analysis15bMR]
subrunID_analysis15_bMR = subrunID_analysis15_bMR[condition_analysis15bMR]

runID_analysis20_bMR = runID_analysis20_bMR[condition_analysis20bMR]
subrunID_analysis20_bMR = subrunID_analysis20_bMR[condition_analysis20bMR]

# Create the list of tuples of (runID, subrunID) of the candidates that pass
# the cuts, built directly from the already-selected ID arrays.
analysis15_candidates = list(zip(runID_analysis15, subrunID_analysis15))
analysis15bMR_candidates = list(zip(runID_analysis15_bMR, subrunID_analysis15_bMR))
analysis20bMR_candidates = list(zip(runID_analysis20_bMR, subrunID_analysis20_bMR))

In [5]:
# Number of (runID, subrunID) candidate entries of analysis15 kept after the cuts
len(analysis15_candidates)

68

In [6]:
# Number of (runID, subrunID) candidate entries of analysis15_bMR kept after the cuts
len(analysis15bMR_candidates)

77

In [7]:
# Number of (runID, subrunID) candidate entries of analysis20_bMR kept after the cuts
len(analysis20bMR_candidates)

47

## Start Filtering File Names

In [8]:
# Filter every raw file list with the candidates of its analysis.
# The output name is derived from the input name
# (raw/<stem>.dat -> filtered/<stem>_solar_filtered.dat) so the two cannot drift apart.
raw_dir = 'dat_files_for_RATDS/raw/'
filtered_dir = 'dat_files_for_RATDS/filtered/'

# (label printed before the analysis, candidate pairs, stems of the raw file lists)
filter_jobs = [
    # =========== Analysis15 ===========
    ('Analysis15', analysis15_candidates, [
        'Analysis15_802_Bronze_364311_371216_July2025_ratds_RAL_updatedSept25',
        'Analysis15_801_802_803_Bronze_post_364311_updated_Sept25_v2_from_ex',
    ]),
    # =========== Analysis15_bMR ===========
    # NOTE(review): the recorded output of this cell reports 0 matches for the
    # first Analysis15_bMR list and 77 for the second - confirm this is expected.
    ('Analysis15_bMR', analysis15bMR_candidates, [
        'Analysis15_bMR_801_802_Bronze_358080_364310_Apr2025_ratds_RAL',
        'Analysis15_bMR_801_802_Bronze_358084_364310_Sept2025_from_ex',
    ]),
    # =========== Analysis20_bMR ===========
    ('Analysis20_bMR', analysis20bMR_candidates, [
        'Analysis20_bMR_801_802_Bronze_354099_358080_June2025_ratds_RAL',
        'Analysis20_bMR_801_803_Bronze_354099_358080_Apr2025_updatedSept25_from_ex',
    ]),
]

for label, candidates, file_stems in filter_jobs:
    print(label)
    for stem in file_stems:
        in_file_dir = raw_dir + stem + '.dat'
        out_file_dir = filtered_dir + stem + '_solar_filtered.dat'
        filter_filelist(in_file_dir, out_file_dir, candidates)

Analysis15
Proceso finalizado. Se encontraron 41 archivos coincidentes.
Proceso finalizado. Se encontraron 27 archivos coincidentes.
Analysis15_bMR
Proceso finalizado. Se encontraron 0 archivos coincidentes.
Proceso finalizado. Se encontraron 77 archivos coincidentes.
Analysis20_bMR
Proceso finalizado. Se encontraron 36 archivos coincidentes.
Proceso finalizado. Se encontraron 11 archivos coincidentes.
