# Notebook to Build the Filtered file.dat for Selectively Downloading RATDS Files from the Grid Server

In [1]:
import numpy as np
import re
import uproot
import os

# Create Resumed Data List to Download from Grid

The idea is to take the file.dat (in txt format), read all of its entries, and compare the runID and subrunID of each entry with those of the candidates. Then, only the lines of file.dat that contain the run and subrun of a candidate are saved.

## Filtering Function

In [53]:
def filter_filelist(in_file_dir, out_file_dir, candidate_list):
    """
    Read the txt version of a file.dat listing and copy to a new file only
    the lines whose file name matches the run and subrun of a candidate.

    Parameters:
    - in_file_dir: Directory and name of the file.dat.txt
    - out_file_dir: Directory and name of the output file (overwritten if it exists)
    - candidate_list: Iterable of tuples (runID, subrunID) of the candidates.

    Lines whose first tab-separated column does not contain the
    '_r<digits>_s<digits>_' pattern are skipped. The number of lines written
    is printed at the end; nothing is returned.
    """

    # A set gives constant-time membership tests. Candidates stored as floats
    # (e.g. 366261.0) still match the integers parsed below, because equal
    # numbers hash and compare equal in Python.
    candidate_set = set(candidate_list)
    lines_found = 0

    # Captures the run ('_r' + digits) and the subrun ('_s' + digits) of a file name
    patron = re.compile(r'_r(\d+)_s(\d+)_')

    # The output must be opened through the `out_file_dir` parameter. The previous
    # code referenced an undefined name (`out_file_name`), which raises NameError
    # on a fresh kernel or silently writes to a stale global.
    with open(in_file_dir, 'r') as f_in, open(out_file_dir, 'w') as f_out:
        for linea in f_in:
            # The file name is the first tab-separated column
            filename = linea.split('\t')[0]

            match = patron.search(filename)
            if match:
                # int() drops the zero padding (0000366261 -> 366261)
                run_id = int(match.group(1))
                subrun_id = int(match.group(2))

                # Keep the line only if this (run, subrun) pair is a candidate
                if (run_id, subrun_id) in candidate_set:
                    f_out.write(linea)
                    lines_found += 1

    print(f"Proceso finalizado. Se encontraron {lines_found} archivos coincidentes.")

# Load and Select Data
Load runID and subrunID of Candidates
Load energy and posr to perform the energy and position cuts

In [86]:
# ------------ Load Data ------------
# Per-candidate arrays (runID, subrunID, energy_corrected, posr_av) of the three data sets
main_dir = '/home/joankl/data/solars/real_data/bisMSB/first_candidates/'

runID_analysis15 = np.load(main_dir + 'analysis15/resume_files/runID.npy')
subrunID_analysis15 = np.load(main_dir + 'analysis15/resume_files/subrunID.npy')
energy_analysis15 = np.load(main_dir + 'analysis15/resume_files/energy_corrected.npy')
posr_analysis15 = np.load(main_dir + 'analysis15/resume_files/posr_av.npy')


runID_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/runID.npy')
subrunID_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/subrunID.npy')
energy_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/energy_corrected.npy')
posr_analysis15_bMR = np.load(main_dir + 'analysis15_bMR/resume_files/posr_av.npy')


runID_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/runID.npy')
subrunID_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/subrunID.npy')
energy_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/energy_corrected.npy')
posr_analysis20_bMR = np.load(main_dir + 'analysis20_bMR/resume_files/posr_av.npy')

# ------------ Perform cuts ------------
en_cut = 5        # lower bound on energy_corrected (units not stated here - TODO confirm)
posr_cut = 5500   # upper bound on posr_av (units not stated here - TODO confirm)

# Both thresholds come from the constants above. The energy cut used to be a
# hard-coded 5, so editing en_cut had no effect on the selection.
condition_analysis15 = (energy_analysis15 >= en_cut) & (posr_analysis15 <= posr_cut)
condition_analysis15bMR = (energy_analysis15_bMR >= en_cut) & (posr_analysis15_bMR <= posr_cut)
condition_analysis20bMR = (energy_analysis20_bMR >= en_cut) & (posr_analysis20_bMR <= posr_cut)

# Keep only the entries that pass both cuts. The energy/posr arrays are left unfiltered.
runID_analysis15 = runID_analysis15[condition_analysis15]
subrunID_analysis15 = subrunID_analysis15[condition_analysis15]

runID_analysis15_bMR = runID_analysis15_bMR[condition_analysis15bMR]
subrunID_analysis15_bMR = subrunID_analysis15_bMR[condition_analysis15bMR]

runID_analysis20_bMR = runID_analysis20_bMR[condition_analysis20bMR]
subrunID_analysis20_bMR = subrunID_analysis20_bMR[condition_analysis20bMR]

# List of (runID, subrunID) tuples of the selected candidates, built from the
# already-filtered arrays
analysis15_candidates = list(zip(runID_analysis15, subrunID_analysis15))
analysis15bMR_candidates = list(zip(runID_analysis15_bMR, subrunID_analysis15_bMR))
analysis20bMR_candidates = list(zip(runID_analysis20_bMR, subrunID_analysis20_bMR))

In [55]:
# Number of (runID, subrunID) candidates in the Analysis15 list
len(analysis15_candidates)

129

In [56]:
# Number of (runID, subrunID) candidates in the Analysis15_bMR list
len(analysis15bMR_candidates)

77

In [57]:
# Number of (runID, subrunID) candidates in the Analysis20_bMR list
len(analysis20bMR_candidates)

47

In [95]:
# Largest run ID among the Analysis15 candidates (the IDs are stored as floats)
max(runID_analysis15)

np.float64(371216.0)

## Start Filtering File Names

In [58]:
# One entry per data set: (input file.dat listing, filtered output file, candidates)
filter_jobs = [
    # Analysis15
    ('dat_files_for_RATDS/Analysis15_802_Bronze_364311_371216_July2025_ratds_RAL_updatedSept25.dat.txt',
     'dat_files_for_RATDS/Analysis15_802_Bronze_364311_371216_July2025_ratds_RAL_updatedSept25_solar_filtered.dat.txt',
     analysis15_candidates),
    # Analysis15_bMR
    ('dat_files_for_RATDS/Analysis15_bMR_801_802_Bronze_358080_364310_Apr2025_ratds_RAL.dat.txt',
     'dat_files_for_RATDS/Analysis15_bMR_801_802_Bronze_358080_364310_Apr2025_ratds_RAL_solar_filtered.dat.txt',
     analysis15bMR_candidates),
    # Analysis20_bMR
    ('dat_files_for_RATDS/Analysis20_bMR_801_802_Bronze_354099_358080_June2025_ratds_RAL.dat.txt',
     'dat_files_for_RATDS/Analysis20_bMR_801_802_Bronze_354099_358080_June2025_ratds_RAL_solar_filtered.dat.txt',
     analysis20bMR_candidates),
]

# Filter each listing in turn. in_file_dir / out_file_dir end up holding the
# paths of the last data set, as they did with the explicit assignments.
for in_file_dir, out_file_dir, job_candidates in filter_jobs:
    filter_filelist(in_file_dir, out_file_dir, job_candidates)

Proceso finalizado. Se encontraron 41 archivos coincidentes.
Proceso finalizado. Se encontraron 0 archivos coincidentes.
Proceso finalizado. Se encontraron 36 archivos coincidentes.


In [45]:
# Number of Analysis15 candidates, to compare with the number of matching lines printed above
len(analysis15_candidates)

129

In [124]:
# Total Analysis15 candidates (129) minus the candidates whose runID was found in file.dat (80)
129-80

49

# Construct the Rest Files for Download
Strange... We should have the same number of matching lines as the number of candidate runIDs. Let's see which candidates are missing from the full list of file names.

Answer: Not every run is in the .dat files! They are, however, contained in the file.exactly; the problem is that the grabber used for the grid download does not accept that format.

We should take the file.exactly, take the entries of the runs of interest, and generate the file.dat using the GridTools from rat tools as:

./processing_list -e file.exactly -f ratds -o file.dat

Then select the entries in file.dat based on runID and subrunID, and download the data using the grabber.

## Check which runIDs are not in the file.dat

In [121]:
# Build the full list of (run, subrun) found in file.dat and compare it with the candidates

f_dat_dir = 'dat_files_for_RATDS/Analysis15_802_Bronze_364311_371216_July2025_ratds_RAL_updatedSept25.dat.txt' # Directory of file.dat
#f_dat_dir = 'dat_files_for_RATDS/Analysis20_bMR_801_802_Bronze_354099_358080_June2025_ratds_RAL.dat.txt'
# (to check Analysis20_bMR instead, also swap runID_analysis15 / subrunID_analysis15
#  for runID_analysis20_bMR / subrunID_analysis20_bMR below)

# Same pattern used for filtering: '_r' + digits (run) and '_s' + digits (subrun)
patron = re.compile(r'_r(\d+)_s(\d+)_')

full_run_list = []
full_subrun_list = []

with open(f_dat_dir, 'r') as f:
    for line in f:
        # The file name is the first tab-separated column
        filename = line.split('\t')[0]
        match = patron.search(filename)
        if match:
            full_run_list.append(int(match.group(1)))
            full_subrun_list.append(int(match.group(2)))

full_run_list = np.array(full_run_list)
full_subrun_list = np.array(full_subrun_list)

# Run-level masks over the candidates:
#   is_in_condition_run    -> True where the candidate runID appears in file.dat
#   isnot_in_condition_run -> True where the candidate runID does NOT appear in file.dat
is_in_condition_run = np.isin(runID_analysis15, full_run_list)
isnot_in_condition_run = ~is_in_condition_run

# Subrun-level mask. NOTE: this only tells whether the subrun number appears anywhere
# in file.dat (for any run), so it does not identify missing (run, subrun) pairs.
isnot_in_condition_subrun = np.isin(subrunID_analysis15, full_subrun_list, invert = True)

candidate_runID_not_in_dataset = runID_analysis15[isnot_in_condition_run]
candidate_subrunID_not_in_dataset = subrunID_analysis15[isnot_in_condition_subrun]

# Pair-level check: candidates whose exact (run, subrun) combination has no line in
# file.dat. This is the comparison that matches what the line filtering does.
full_pair_set = set(zip(full_run_list.tolist(), full_subrun_list.tolist()))
candidate_pairs_not_in_dataset = [
    (int(run), int(subrun))
    for run, subrun in zip(runID_analysis15, subrunID_analysis15)
    if (int(run), int(subrun)) not in full_pair_set
]

In [123]:
# Number of candidates whose runID appears in the file.dat listing
len(np.where(is_in_condition_run)[0])

80

In [118]:
# Number of candidates whose runID does not appear in the file.dat listing
len(candidate_runID_not_in_dataset)

49

In [125]:
# Run IDs of those candidates (one entry per candidate, so a run ID can appear more than once)
candidate_runID_not_in_dataset

array([366870., 366896., 366902., 366870., 370940., 370949., 370963.,
       370975., 370983., 368752., 368758., 370720., 366896., 370975.,
       367252., 368758., 367080., 367182., 367197., 367199., 367220.,
       370949., 367146., 370940., 366795., 367146., 370655., 371028.,
       367252., 368752., 370655., 371028., 371069., 367220., 368598.,
       371069., 370963., 366291., 366303., 367197., 367199., 367182.,
       366902., 368598., 366334., 366369., 366795., 367080., 370720.])

In [136]:
f_exc_dir = 'dat_files_for_RATDS/Analysis15_801_802_803_Bronze_post_364311_updated_Sept25_v2.exactly.txt'      # Directory of file.exactly
out_file_dir = 'dat_files_for_RATDS/Analysis15_801_802_803_Bronze_post_364311_updated_Sept25_v2_solar_filtered.exactly.txt'

# Set of the run IDs to look for. Testing `runID in <numpy array>` scans the whole
# array for every line of the file; a set gives the same matches (equal ints and
# floats hash and compare equal) with constant-time lookups.
missing_run_set = set(np.asarray(candidate_runID_not_in_dataset).ravel().tolist())

lines_found = 0  # Note: this value can be smaller than the number of remaining candidates because of repeated runIDs

full_runID = []

with open(f_exc_dir, 'r') as f_in, open(out_file_dir, 'w') as f_out:

    for line in f_in:
        # Skip blank lines (e.g. a trailing newline) instead of failing on the column access
        if not line.strip():
            continue

        # The run ID is read from the second tab-separated column
        runID = int(line.split('\t')[1])
        full_runID.append(runID)

        # Copy the line to the filtered file when its run is one of the missing candidate runs
        if runID in missing_run_set:
            f_out.write(line)
            lines_found += 1

full_runID = np.array(full_runID)

# True where a missing candidate runID is present in file.exactly
# (this reuses, and overwrites, the mask name from the file.dat check)
is_in_condition_run = np.isin(candidate_runID_not_in_dataset, full_runID)

In [142]:
# Number of file.exactly lines written to the filtered file
lines_found

27