# Benchmark different file transform approaches

This document serves the purpose of improving certain time critical parts of the file creat_cube.

### Setup
Importing the necessary modules (run poetry install to use the environment for this)

In [None]:
#python basemodules
import time
import os
import multiprocessing as mp

# data handling
import h5py
import xarray as xr
import numpy as np

Reading the folders and filenames

In [2]:
base="/work/le837wmue-Rhone_download/DAS_2020"
os.chdir(base)
folders=os.listdir()
#folders

In [3]:
os.chdir(f"{base}/{folders[0]}")
files=os.listdir()
print("Number of files in folder 1:", len(files))
files[0:10]

Number of files in folder 1: 1306


['rhone1khz_UTC_20200707_205138.931.h5',
 'rhone1khz_UTC_20200707_154908.931.h5',
 'rhone1khz_UTC_20200707_164408.931.h5',
 'rhone1khz_UTC_20200707_154608.931.h5',
 'rhone1khz_UTC_20200707_235338.931.h5',
 'rhone1khz_UTC_20200707_135038.931.h5',
 'rhone1khz_UTC_20200707_190438.931.h5',
 'rhone1khz_UTC_20200707_182508.931.h5',
 'rhone1khz_UTC_20200707_183838.931.h5',
 'rhone1khz_UTC_20200707_191908.931.h5']

### Benchmarking the file read and conversion to numpy array

1. Using h5py as originally

In [73]:
start=time.time()
f=h5py.File(files[0],  'r')
dset=f['Acoustic']
print(np.array(dset))
end=time.time()
print("Time elapsed:", end-start)

[[  2979  -4623  -8582 ...   3592   2945   3725]
 [  7908  -2019  -6477 ...  -2881  -5727    647]
 [-10515  -3669   2995 ...   2528   1463  -5090]
 ...
 [  6664  15762  12302 ...  -4615  -8789  -5392]
 [   804    389   1578 ...   -908 -12766 -12201]
 [ -4772 -10940  -1687 ...   5943  -1315  -1256]]
Time elapsed: 0.08681368827819824


2. Using xarray 

In [74]:
start=time.time()
xrH5=xr.open_dataset(files[0], engine='h5netcdf', backend_kwargs={'phony_dims': 'access'}) # we need to pass phony_dims as the file has no xarray readable dimensions
print(xrH5["Acoustic"].compute().values)
end=time.time()
print("Time elapsed:", end-start)

[[  2979  -4623  -8582 ...   3592   2945   3725]
 [  7908  -2019  -6477 ...  -2881  -5727    647]
 [-10515  -3669   2995 ...   2528   1463  -5090]
 ...
 [  6664  15762  12302 ...  -4615  -8789  -5392]
 [   804    389   1578 ...   -908 -12766 -12201]
 [ -4772 -10940  -1687 ...   5943  -1315  -1256]]
Time elapsed: 0.0596165657043457


Reading a single file, xarray is about 3 hundredths faster than h5py. Let's see if this scales:

In [70]:
start=time.time()
for index,file in enumerate(files[0:40]):
    f=h5py.File(files[index],  'r')
    dset=f['Acoustic']
    np.array(dset)
end=time.time()
print("Time elapsed:", end-start)

Time elapsed: 17.899274826049805


In [6]:
start=time.time()
for index,file in enumerate(files[0:40]):
    xrH5=xr.open_dataset(files[index], engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xrH5["Acoustic"].compute().values
end=time.time()
print("Time elapsed:", end-start)

Time elapsed: 3.1821184158325195


It does! Reading 40 files with h5py takes 17.9 seconds, with xarray it takes 2.9 seconds.
Can we use multiprocessing to speed up the process?

In [5]:
cpu_count=mp.cpu_count()*2//3 # we tae two thirds so the open file limit is not exceeded
cpu_count

85

In [15]:
def read_file(file):
    with h5py.File(file, 'r') as f: # we need with so it actually closes
        dset = f['Acoustic']
        np.array(dset)

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:40])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

Time elapsed: 2.542973756790161


In [12]:
def read_file(file):
    xrH5=xr.open_dataset(file, engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xrH5["Acoustic"].compute().values

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:40])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

Time elapsed: 1.7802202701568604


Xarray with the underlying dask is already using distributed computing. Still, we see that we can improve the reading and conversion to 1.78 seconds. 
However, h5py's conversion time is also highly reduced to only 2.54 seconds. Let's try it with more files:

In [None]:
def read_file(file):
    with h5py.File(file, 'r') as f: # we need with so it actually closes
        dset = f['Acoustic']
        np.array(dset)

start=time.time()
pool=mp.Pool(mp.cpu_count())
pool.map(read_file, files[0:200])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

In [6]:
def read_file(file):
    xrH5=xr.open_dataset(file, engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xrH5["Acoustic"].compute().values

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:200])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

Time elapsed: 17.176568031311035


As we can see, this scales:
200 files with xarray still take only 17.2 seconds, but 200 files with h5py on 85 cpus never finish and the kernel crashes, which might be due to files not being closed properly and using too much memory space.

### Fourier transfrom

In [2]:
AttributeError##########Base settings#########
#granularity of spectrogram
d_f = 1 # frequency resolution in Hz
d_t = 0.1 # time res in seconds

# section
loc_a, loc_e = 0, 9200 # cable section to be processed (in meters) - 0==start
ind_a, ind_e= loc_a//4, loc_e//4 # channel distances (4m each)
nFiles = 5 # number of h5 files processed
nCores = 8 # cpu cores


day= 22
month=7
sec=0
minut=0
hours=0

# Additional parameters:
file_length = 30 # Length of a single h5 file in seconds.
NU = 1000 #Sampling frequency in Hz of the recorded data.
freq_max = 100 # maximum frequency cut off value for the analysis
seg_length=1/d_f #calculate window length corresponding to d_f
N = file_length*NU #number of samples in one file
ind_f = int(seg_length*freq_max+1)
seg_len=int(seg_length*NU) #how many time points should be in one processing window
nseg=int(2*(file_length/seg_length)) #amount of segments for the desired window length
location_coords = np.arange(loc_a, loc_e, 4)
freq_coords=scipy.fft.rfftfreq(int(NU/d_f), 1/NU)[:ind_f]
hop = int(d_t*NU)

#fft input arguments
args = {
    "ind_f" : ind_f,
    "ind_a" : ind_a,
    "ind_e" : ind_e,
    "seg_len" : seg_len,
    "hop" : hop,
    "N" : N
}


#path and name of resulting zarr-formatted data cube.
ZARR_NAME = "cryo_cube.zarr"

1. The original approach 

In [None]:

def channel_fourier(data, args, taper, positions):
    """
    Applies Fourier Transformation to segments of DAS records to compute spectrograms.

    Args:
        data (ndarray): The raw data from DAS channels.
        args (dict): Contains parameters for Fourier Transform such as segment length and indices.
        taper (ndarray): The taper function to apply before the Fourier transform.
        positions (ndarray): The positions of the segments.

    Returns:
        ndarray: A 3D array containing the Fourier transform for each segment and channel.
    """
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]



    # data transformation
    segs = ([data[pos:pos+seg_len] for pos in positions]) #dividing the data into segments each consisting of desired amount of data points
    segs = [seg.T[ind_a:ind_e] for seg in segs] #transposing the segments individually to gain time series for each channel
    nseg = positions.shape[0]
    
    # the first loop iterates over all segments (each corresponding to a time point)
    # in the second loop, the fourier transform gets applied on each channel
    Fsegs=np.zeros((nseg, ind_e-ind_a, ind_f))
    for i in range(nseg):
        for channel_number, channel in enumerate(segs[i]):

            # note that modified_log(x)=10*log(x) (conversion to

            fourier_transformed = np.fft.rfft(taper*channel, n=seg_len)
            fourier_transformed = ((10*np.log(np.abs(fourier_transformed)**2)))[0:ind_f]
            fourier_transformed[0]=0
            Fsegs[i][channel_number]=fourier_transformed

    return Fsegs




f=h5py.File(files[0],  'r')
dset=f['Acoustic']
seg_len=args["seg_len"]
hop=args["hop"]
N=args["N"]

print(np.array(dset))

start=time.time()

end=time.time()
print("Time elapsed:", end-start) 