In [6]:
import sys,os,os.path
sys.path.append("../../../") # if you move files around, you need to adjust this!
sys.path.append(os.path.expanduser('~/code/eol_hsrl_python'))
os.environ['ICTDIR']='/home/e78368jw/Documents/NEXT_CODE/IC'

import matplotlib.pyplot as plt
import pandas as pd
import numpy  as np
import tables as tb
import IC.invisible_cities.io.dst_io                           as     dstio
import IC.invisible_cities.io.mcinfo_io as mcio
from    IC.invisible_cities.core.core_functions   import shift_to_bin_centers

import scipy.special as special
from scipy.stats import skewnorm
from scipy.optimize import curve_fit
import matplotlib.ticker as ticker

# timekeeping
from tqdm import tqdm
import time

from scipy.integrate import quad

# the functions
import core.functions as func

In [13]:
def load_data(folder_path):
    '''
    Load every .h5 file in a folder and combine the per-file tables.

    For each file this reads the 'Tracks' node of the 'Tracking' group via
    dstio.load_dst, the 'MC/particles' table via pd.read_hdf, and the event
    number map via mcio.load_eventnumbermap (re-indexed by 'nexus_evt').

    Args:
        folder_path     :       path to folder of h5 files (a trailing
                                separator is optional)
    Returns:
        (tracks,        :       tracks dataframe, rows renumbered from 0
        particles,      :       MC particle dataframe, rows renumbered from 0,
                                with 'event_id' multiplied by 2
        eventmap)       :       event number map indexed by 'nexus_evt'
    Raises:
        ValueError      :       if the folder contains no .h5 files
    '''
    # keep only regular files whose name ends in .h5
    file_names = [
        f for f in os.listdir(folder_path)
        if f.endswith('.h5') and os.path.isfile(os.path.join(folder_path, f))
    ]
    if not file_names:
        # without this guard pd.concat fails with an opaque
        # "No objects to concatenate" ValueError
        raise ValueError(f"No .h5 files found in {folder_path!r}")

    tracks_parts = []
    particles_parts = []
    eventmap_parts = []
    for count, file_name in enumerate(file_names, start=1):
        # os.path.join (not string concatenation) so folder_path works with
        # or without a trailing separator, matching the filter above
        file_path = os.path.join(folder_path, file_name)

        tracks_parts.append(dstio.load_dst(file_path, 'Tracking', 'Tracks'))
        # MC particles (the slow part of the read)
        particles_parts.append(pd.read_hdf(file_path, 'MC/particles'))
        # event number map, keyed by nexus_evt
        eventmap_parts.append(mcio.load_eventnumbermap(file_path).set_index('nexus_evt'))

        # progress marker every 50 files
        if count % 50 == 0:
            print(count)

    # one concat per table instead of growing frames inside the loop
    tracks = pd.concat(tracks_parts, axis=0, ignore_index=True)

    particles = pd.concat(particles_parts, ignore_index=True)
    # event ids are doubled — NOTE(review): the reason is not visible in this
    # notebook; confirm against the event-number convention in use
    particles['event_id'] = particles['event_id'] * 2

    # the nexus_evt index is kept (no ignore_index here)
    eventmap = pd.concat(eventmap_parts)

    return (tracks, particles, eventmap)

In [41]:
def test_speed(fun):

    # collect file path
    initi = 'isaura_sample/'
    folder_path = ['isaura_sample_5/', 'isaura_sample_20/', 'isaura_sample_50/', 'isaura_sample_100/']
    file_no = ['5', '20', '50', '100']


    print(f"Function: {fun.__name__}")

    for i in range(len(folder_path)):
        print(f"{file_no[i]} files:")
        start = time.time()
        data = fun(initi + folder_path[i])
        end = time.time()
        print(f'{end-start:.4f} s')
        print("")

    return data

def compare_data(data_1, data_2):
    '''
    Print whether two dataframes hold the same values.

    Two cells count as equal when they compare equal with ``==`` or when both
    are missing (NaN), so frames with missing values in the same places are
    reported as equivalent. Frames whose index or column labels differ are
    reported as not equivalent rather than raising.

    Args:
        data_1, data_2  :       the dataframes to compare
    Returns:
        None (the verdict is printed)
    '''
    # DataFrame == DataFrame raises unless both frames are identically labelled
    same_labels = data_1.index.equals(data_2.index) and data_1.columns.equals(data_2.columns)

    if same_labels:
        # NaN != NaN elementwise, so treat "missing in both" as a match
        both_missing = data_1.isna() & data_2.isna()
        equivalent = bool(((data_1 == data_2) | both_missing).values.all())
    else:
        equivalent = False

    print(f'Are dataframes equivalent?\n{equivalent}')


In [22]:
# Time the baseline serial loader on each sample folder; test_speed returns
# the result of its last call, so `data` is the output for the final folder.
data = test_speed(load_data)

Function: load_data
5 files:
0.7353 s

20 files:
2.9814 s

50 files:
50
7.0594 s

100 files:
50
100
14.8477 s



#### Now the new function: loading files in parallel

In [45]:
from concurrent.futures import ProcessPoolExecutor

def load_single_file(file_path):
    """Read the three tables needed from one h5 file.

    Returns a tuple ``(tracks, particles, eventmap)``: the 'Tracking'/'Tracks'
    table from dstio.load_dst, the 'MC/particles' table with its 'event_id'
    column multiplied by 2, and the event number map indexed by 'nexus_evt'.
    """
    tracks = dstio.load_dst(file_path, 'Tracking', 'Tracks')
    mc_particles = pd.read_hdf(file_path, 'MC/particles')

    raw_map = mcio.load_eventnumbermap(file_path)
    event_map = raw_map.set_index('nexus_evt')

    # double the particle event ids
    doubled_ids = mc_particles['event_id'] * 2
    mc_particles['event_id'] = doubled_ids

    return (tracks, mc_particles, event_map)

def load_data_new(folder_path, return_eventmap=False):
    '''
    Load multiple h5 files in parallel and combine /Tracking/Tracks and
    /MC/particles (and, on request, the event number map) into dataframes.

    Each file is read by load_single_file in a worker process; the per-file
    frames are then concatenated once per table.

    Args:
        folder_path     :       path to folder of h5 files
        return_eventmap :       if True, also return the combined eventmap.
                                Defaults to False so existing callers keep
                                receiving a 2-tuple.
    Returns:
        (tracks, particles)             when return_eventmap is False
        (tracks, particles, eventmap)   when return_eventmap is True; the
                                        eventmap keeps its 'nexus_evt' index
    Raises:
        ValueError      :       if the folder contains no .h5 files
    '''
    file_paths = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith('.h5')
    ]
    if not file_paths:
        # fail before starting worker processes; otherwise the unpacking
        # below raises an unrelated "not enough values to unpack" ValueError
        raise ValueError(f"No .h5 files found in {folder_path!r}")

    # executor.map yields results in the same order as file_paths
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(load_single_file, file_paths))

    # transpose [(tracks, particles, eventmap), ...] into three sequences
    tracks_list, particles_list, eventmap_list = zip(*results)

    tracks = pd.concat(tracks_list, axis=0, ignore_index=True)
    particles = pd.concat(particles_list, ignore_index=True)

    if not return_eventmap:
        return tracks, particles

    # no ignore_index here: that would discard the nexus_evt index that
    # load_single_file sets on each per-file eventmap
    eventmap = pd.concat(eventmap_list)
    return tracks, particles, eventmap

In [46]:
# Time the ProcessPoolExecutor-based loader on the same sample folders;
# `data_new` is the result of the last call made by test_speed.
data_new = test_speed(load_data_new)

Function: load_data_new
5 files:
1.5540 s

20 files:
2.6461 s

50 files:
5.1486 s

100 files:
9.5462 s



In [47]:
# Check that both loaders agree on tracks (element 0) and particles (element 1).
# The eventmap (data[2]) is not compared in this cell.
compare_data(data[0], data_new[0])
compare_data(data[1], data_new[1])


Are dataframes equivalent?
True
Are dataframes equivalent?
True


### Trying a different approach: a serial loop that accumulates the per-file dataframes

In [48]:
def load_single_file_loop(file_path):
    """Read tracks, MC particles and the event number map from one h5 file.

    Returns ``(tracks, particles, eventmap)`` where particles has its
    'event_id' column multiplied by 2 and eventmap is indexed by 'nexus_evt'.
    """
    tracks = dstio.load_dst(file_path, 'Tracking', 'Tracks')
    mc_particles = pd.read_hdf(file_path, 'MC/particles')

    # index every per-file event map the same way, by nexus_evt
    event_map = mcio.load_eventnumbermap(file_path).set_index('nexus_evt')

    # double the particle event ids (example operation)
    mc_particles['event_id'] = mc_particles['event_id'] * 2

    return (tracks, mc_particles, event_map)

def load_data_loop(folder_path):
    '''
    Load multiple h5 files one after another and combine /Tracking/Tracks,
    /MC/particles and the event number map into three dataframes.

    Per-file frames are collected in lists and concatenated once at the end.
    DataFrame.append is not used: it copies the whole accumulated frame on
    every call and was removed in pandas 2.0.

    Args:
        folder_path     :       path to folder of h5 files
    Returns:
        (tracks,        :       tracks dataframe, rows renumbered from 0
        particles,      :       MC particle dataframe, rows renumbered from 0
        eventmap)       :       event map with 'nexus_evt' as an ordinary
                                column, rows renumbered from 0
        Three empty dataframes are returned if the folder has no .h5 files.
    '''
    file_paths = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith('.h5')
    ]

    tracks_parts = []
    particles_parts = []
    eventmap_parts = []

    for count, file_path in enumerate(file_paths, start=1):
        # use the helper defined alongside this function rather than the
        # one from the earlier parallel-loading cell
        tracks_df, particles_df, eventmap_df = load_single_file_loop(file_path)

        tracks_parts.append(tracks_df)
        particles_parts.append(particles_df)
        # move nexus_evt from the index back to a column before stacking
        eventmap_parts.append(eventmap_df.reset_index())

        if count % 50 == 0:
            print(f"{count} files processed")

    if not file_paths:
        # pd.concat rejects an empty list; keep the empty-folder result
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    tracks = pd.concat(tracks_parts, ignore_index=True)
    particles = pd.concat(particles_parts, ignore_index=True)
    eventmap = pd.concat(eventmap_parts, ignore_index=True)

    return tracks, particles, eventmap

In [None]:
# Time the serial-loop loader variant on the same sample folders.
data_newest = test_speed(load_data_loop)