# Benchmark different file transform approaches

This document serves the purpose of improving certain time critical parts of the file creat_cube.

### Setup
Importing the necessary modules (run poetry install to use the environment for this)

In [1]:
#python basemodules
import time
import os
import multiprocessing as mp

# data handling
import h5py
import xarray as xr
import numpy as np
import scipy
from scipy import signal, fft
import dask.array as da
import pyfftw
import pyfftw.interfaces.dask_fft as dafft
import pickle

repo_dir = os.popen('git rev-parse --show-toplevel').read().strip()


Reading the folders and filenames

In [2]:
base="/work/le837wmue-Rhone_download/DAS_2020"
os.chdir(base)
folders=os.listdir()
#folders

In [3]:
os.chdir(f"{base}/{folders[0]}")
files=os.listdir()
print("Number of files in folder 1:", len(files))
files[0:10]

Number of files in folder 1: 1307


['rhone1khz_UTC_20200707_205138.931.h5',
 'rhone1khz_UTC_20200707_154908.931.h5',
 'rhone1khz_UTC_20200707_164408.931.h5',
 'rhone1khz_UTC_20200707_154608.931.h5',
 'rhone1khz_UTC_20200707_235338.931.h5',
 'rhone1khz_UTC_20200707_135038.931.h5',
 'rhone1khz_UTC_20200707_190438.931.h5',
 'rhone1khz_UTC_20200707_182508.931.h5',
 'rhone1khz_UTC_20200707_183838.931.h5',
 'rhone1khz_UTC_20200707_191908.931.h5']

### Benchmarking the file read and conversion to numpy array

1. Using h5py as originally

In [4]:
start=time.time()
f=h5py.File(files[0],  'r')
dset=f['Acoustic']
print(np.array(dset))
end=time.time()
print("Time elapsed:", end-start)

[[  2979  -4623  -8582 ...   3592   2945   3725]
 [  7908  -2019  -6477 ...  -2881  -5727    647]
 [-10515  -3669   2995 ...   2528   1463  -5090]
 ...
 [  6664  15762  12302 ...  -4615  -8789  -5392]
 [   804    389   1578 ...   -908 -12766 -12201]
 [ -4772 -10940  -1687 ...   5943  -1315  -1256]]
Time elapsed: 0.4123396873474121


2. Using xarray 

In [14]:
start=time.time()
xr_h5=xr.open_dataset(files[0], engine='h5netcdf', backend_kwargs={'phony_dims': 'access'}) # we need to pass phony_dims as the file has no xarray readable dimensions
print(xr_h5["Acoustic"].compute().values)
end=time.time()
print("Time elapsed:", end-start)

[[  2979  -4623  -8582 ...   3592   2945   3725]
 [  7908  -2019  -6477 ...  -2881  -5727    647]
 [-10515  -3669   2995 ...   2528   1463  -5090]
 ...
 [  6664  15762  12302 ...  -4615  -8789  -5392]
 [   804    389   1578 ...   -908 -12766 -12201]
 [ -4772 -10940  -1687 ...   5943  -1315  -1256]]
Time elapsed: 0.0845937728881836


Reading a single file, xarray is about 3 hundredths faster than h5py. Let's see if this scales:

In [15]:
start=time.time()
for index,file in enumerate(files[0:40]):
    f=h5py.File(files[index],  'r')
    dset=f['Acoustic']
    np.array(dset)
end=time.time()
print("Time elapsed:", end-start)

Time elapsed: 20.112152338027954


In [16]:
start=time.time()
for index,file in enumerate(files[0:40]):
    xr_h5=xr.open_dataset(files[index], engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xr_h5["Acoustic"].compute().values
end=time.time()
print("Time elapsed:", end-start)

Time elapsed: 3.881666660308838


It does! Reading 40 files with h5py takes 17.9 seconds, with xarray it takes 2.9 seconds.
Can we use multiprocessing to speed up the process?

In [6]:
cpu_count=mp.cpu_count()*2//3 # we tae two thirds so the open file limit is not exceeded
cpu_count

85

In [18]:
def read_file(file):
    with h5py.File(file, 'r') as f: # we need with so it actually closes
        dset = f['Acoustic']
        np.array(dset)

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:40])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

Time elapsed: 3.781881093978882


In [20]:
def read_file(file):
    xr_h5=xr.open_dataset(file, engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xr_h5["Acoustic"].compute().values

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:40])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

Time elapsed: 2.9076411724090576


Xarray with the underlying dask is already using distributed computing. Still, we see that we can improve the reading and conversion to 1.78 seconds. 
However, h5py's conversion time is also highly reduced to only 2.54 seconds. Let's try it with more files:

In [None]:
def read_file(file):
    with h5py.File(file, 'r') as f: # we need with so it actually closes
        dset = f['Acoustic']
        np.array(dset)

start=time.time()
pool=mp.Pool(mp.cpu_count())
pool.map(read_file, files[0:200])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

In [None]:
def read_file(file):
    xr_h5=xr.open_dataset(file, engine='h5netcdf', backend_kwargs={'phony_dims': 'access'})
    xr_h5["Acoustic"].compute().values

start=time.time()
pool=mp.Pool(cpu_count)
pool.map(read_file, files[0:200])
pool.close()
pool.join()
end=time.time()
print("Time elapsed:", end-start) 

As we can see, this scales:
200 files with xarray still take only 17.2 seconds, but 200 files with h5py on 85 cpus never finish and the kernel crashes, which might be due to files not being closed properly and using too much memory space.

### Fourier transfrom

In [6]:
##########Base settings#########
#granularity of spectrogram
d_f = 1 # frequency resolution in Hz
d_t = 0.1 # time res in seconds

# section
loc_a, loc_e = 0, 9200 # cable section to be processed (in meters) - 0==start
ind_a, ind_e= loc_a//4, loc_e//4 # channel distances (4m each)
nFiles = 5 # number of h5 files processed
nCores = 8 # cpu cores


day= 22
month=7
sec=0
minut=0
hours=0

# Additional parameters:
file_length = 30 # Length of a single h5 file in seconds.
NU = 1000 #Sampling frequency in Hz of the recorded data.
freq_max = 100 # maximum frequency cut off value for the analysis
seg_length=1/d_f #calculate window length corresponding to d_f
N = file_length*NU #number of samples in one file
ind_f = int(seg_length*freq_max+1)
seg_len=int(seg_length*NU) #how many time points should be in one processing window
nseg=int(2*(file_length/seg_length)) #amount of segments for the desired window length
location_coords = np.arange(loc_a, loc_e, 4)
freq_coords=scipy.fft.rfftfreq(int(NU/d_f), 1/NU)[:ind_f]
hop = int(d_t*NU)

#fft input arguments
args = {
    "ind_f" : ind_f,
    "ind_a" : ind_a,
    "ind_e" : ind_e,
    "seg_len" : seg_len,
    "hop" : hop,
    "N" : N
}


#path and name of resulting zarr-formatted data cube.
ZARR_NAME = "cryo_cube.zarr"

1. The original approach 

In [None]:
def channel_fourier_numpy(data, args, taper, positions):
    """
    Applies Fourier Transformation to segments of DAS records to compute spectrograms.

    Args:
        data (ndarray): The raw data from DAS channels.
        args (dict): Contains parameters for Fourier Transform such as segment length and indices.
        taper (ndarray): The taper function to apply before the Fourier transform.
        positions (ndarray): The positions of the segments.

    Returns:
        ndarray: A 3D array containing the Fourier transform for each segment and channel.
    """
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]



    # data transformation
    segs = ([data[pos:pos+seg_len] for pos in positions]) #dividing the data into segments each consisting of desired amount of data points
    segs = [seg.T[ind_a:ind_e] for seg in segs] #transposing the segments individually to gain time series for each channel
    nseg = positions.shape[0]
    
    # the first loop iterates over all segments (each corresponding to a time point)
    # in the second loop, the fourier transform gets applied on each channel
    Fsegs=np.zeros((nseg, ind_e-ind_a, ind_f))
    for i in range(nseg):
        for channel_number, channel in enumerate(segs[i]):

            # note that modified_log(x)=10*log(x) (conversion to

            fourier_transformed = np.fft.rfft(taper*channel, n=seg_len)
            fourier_transformed = ((10*np.log(np.abs(fourier_transformed)**2)))[0:ind_f]
            fourier_transformed[0]=0
            Fsegs[i][channel_number]=fourier_transformed

    return Fsegs


####### running for only 
file_index=0
f=h5py.File(files[file_index],  'r')
dset=f['Acoustic']
seg_len=args["seg_len"]
hop=args["hop"]
N=args["N"]
data = np.array(dset) # DAS data
taper = signal.windows.tukey(seg_len, 0.25) #taper function - reduce the amplitude of the discontinuities at the boundaries, thereby reducing spectral leakage.


# the windowing function (Tukey window in this case) tapers at the ends, 
#so to avoid losing data at the ends of each file, 
# the end of one file is overlapped with the beginning of the next file.
if file_index!=nFiles-1:

    g = h5py.File(files[file_index+1],'r')
    dset2=g['Acoustic']
    data2= np.array(dset2)
    
    start=time.time()
    data = np.concatenate((data, data2[0:seg_len]), axis=0)
    end=time.time()
    print("Time elapsed to concatenate:", end-start) 

j = file_index+1
file_pos = file_index * N

start=time.time()

# If the current file is not the last one
if file_index != nFiles-1:
    # Calculate the starting positions of each segment in the data
    # first segment: (j-1)*N/hop, rounded up
    # last segment: (j*N-1)/hop, rounded down
    positions = np.arange(np.ceil((j-1)*N/hop), np.floor((j*N-1)/hop)+1, dtype=int)*hop - file_pos # scaled by the hop size and offset by the file position
else:
    # If last one, start: (j*N-seg_len)/hop
    # to ensure that the last segment doesn't extend beyond the end of the data
    positions = np.arange(np.ceil((j-1)*N/hop), np.floor((j*N-seg_len)/hop)+1, dtype=int)*hop - file_pos
    
Fsegs = channel_fourier_numpy(data, args, taper, positions)

end=time.time()
print("Time elapsed for numpy fft:", end-start) 

Time elapsed to concatenate: 0.04962778091430664
Time elapsed for numpy fft: 19.632588624954224


2. Benchmarking SciPy NumPy pyFFTW

In [21]:
def channel_fourier(data, args, taper, positions, method='numpy'):
    """
    Applies Fourier Transformation to segments of DAS records using specified method.

    Args:
        data (ndarray): The raw data from DAS channels.
        args (dict): Contains parameters for Fourier Transform such as segment length and indices.
        taper (ndarray): The taper function to apply before the Fourier transform.
        positions (ndarray): The positions of the segments.
        method (str): Method for FFT computation ('numpy', 'scipy', 'pyfftw'). Default is 'numpy'.

    Returns:
        ndarray: A 3D array containing the Fourier transform for each segment and channel.
    """
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]

    segs = ([data[pos:pos+seg_len] for pos in positions])
    segs = [seg.T[ind_a:ind_e] for seg in segs]

    nseg = positions.shape[0]
    Fsegs = np.zeros((nseg, ind_e-ind_a, ind_f))

    if method == 'numpy':
        for i in range(nseg):
            for channel_number, channel in enumerate(segs[i]):
                fourier_transformed = np.fft.rfft(taper * channel, n=seg_len)
                fourier_transformed = ((10 * np.log(np.abs(fourier_transformed) ** 2)))[0:ind_f]
                fourier_transformed[0] = 0
                Fsegs[i][channel_number] = fourier_transformed

    elif method == 'scipy':
        for i in range(nseg):
            for channel_number, channel in enumerate(segs[i]):
                fourier_transformed = fft.fft(taper * channel, n=seg_len)
                fourier_transformed = ((10 * np.log(np.abs(fourier_transformed) ** 2)))[0:ind_f]
                fourier_transformed[0] = 0
                Fsegs[i][channel_number] = fourier_transformed


    elif method == 'pyfftw':
        
        try:
            with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'rb') as f:
                wisdom = pickle.load(f)
                pyfftw.import_wisdom(wisdom)
                print("Found a wisdom file.")
        except FileNotFoundError:
            print("No wisdom file found. Starting without wisdom.")

        # Pre-allocate the input and output arrays for FFTW
        fft_input = pyfftw.empty_aligned(seg_len, dtype='complex128')
        fft_output = pyfftw.empty_aligned(seg_len, dtype='complex128')

        # Create FFTW object
        fft_object = pyfftw.FFTW(fft_input, fft_output)

        for i in range(nseg):
            for channel_number, channel in enumerate(segs[i]):
                fft_input[:] = taper * channel  # Apply taper
                fft_object()  # Execute FFT
                fourier_transformed = ((10 * np.log(np.abs(fft_output) ** 2)))[0:ind_f]
                fourier_transformed[0] = 0
                Fsegs[i][channel_number] = fourier_transformed

        with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'wb') as f:
            pickle.dump(pyfftw.export_wisdom(), f)

    else:
        raise ValueError("Invalid method specified. Choose from 'numpy', 'scipy', or 'pyfftw'.")

    return Fsegs

In [None]:
# Example usage for benchmarking
file_index = 0
seg_len = args["seg_len"]
hop = args["hop"]
N = args["N"]

f = h5py.File(files[file_index], 'r')
dset = f['Acoustic']
data = np.array(dset)

taper = signal.windows.tukey(seg_len, 0.25)

if file_index != nFiles - 1:
    g = h5py.File(files[file_index + 1], 'r')
    dset2 = g['Acoustic']
    data2 = np.array(dset2)
    data = np.concatenate((data, data2[0:seg_len]), axis=0)

j = file_index + 1
file_pos = file_index * N

if file_index != nFiles - 1:
    positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - 1) / hop) + 1, dtype=int) * hop - file_pos
else:
    positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - seg_len) / hop) + 1, dtype=int) * hop - file_pos

# Benchmarking each method
methods = ['numpy', 'scipy', 'pyfftw']
#methods = ['pyfftw']
#methods = ['scipy']
# methods = ['numpy']
for method in methods:
    start = time.time()
    Fsegs = channel_fourier(data, args, taper, positions, method=method)
    end = time.time()
    print(f"Time elapsed for {method} fft:", end - start)


In [22]:
# Assuming 'files' is a list of file paths and 'nFiles' is the total number of files
nFiles = 5  # Set this to the actual number of files you want to process
#methods = ['numpy', 'scipy', 'pyfftw']  # FFT methods to benchmark
methods = ['pyfftw']
#methods = ['scipy']
# methods = ['numpy']

# Loop over each method first
for method in methods:
    print(f"Benchmarking method: {method}")
    # Then loop over each file for the current method
    for file_index in range(nFiles):
        seg_len = args["seg_len"]
        hop = args["hop"]
        N = args["N"]

        f = h5py.File(files[file_index], 'r')
        dset = f['Acoustic']
        data = np.array(dset)

        taper = signal.windows.tukey(seg_len, 0.25)

        if file_index != nFiles - 1:
            g = h5py.File(files[file_index + 1], 'r')
            dset2 = g['Acoustic']
            data2 = np.array(dset2)
            data = np.concatenate((data, data2[0:seg_len]), axis=0)

        j = file_index + 1
        file_pos = file_index * N

        if file_index != nFiles - 1:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - 1) / hop) + 1, dtype=int) * hop - file_pos
        else:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - seg_len) / hop) + 1, dtype=int) * hop - file_pos

        # Benchmark the current method for the current file
        start = time.time()
        Fsegs = channel_fourier(data, args, taper, positions, method=method)
        end = time.time()
        print(f"Time elapsed for {method} fft in file {file_index}:", end - start)

Benchmarking method: pyfftw
No wisdom file found. Starting without wisdom.
Time elapsed for pyfftw fft in file 0: 29.353445529937744
Found a wisdom file.
Time elapsed for pyfftw fft in file 1: 29.174957036972046
Found a wisdom file.
Time elapsed for pyfftw fft in file 2: 29.14138913154602
Found a wisdom file.
Time elapsed for pyfftw fft in file 3: 29.096455097198486
Found a wisdom file.
Time elapsed for pyfftw fft in file 4: 28.372843265533447


### Optimizing pyFFTW

In [5]:
def channel_fourier(data, args, taper, positions, method='numpy'):
    """
    Applies Fourier Transformation to segments of DAS records using specified method.

    Args:
        data (ndarray): The raw data from DAS channels.
        args (dict): Contains parameters for Fourier Transform such as segment length and indices.
        taper (ndarray): The taper function to apply before the Fourier transform.
        positions (ndarray): The positions of the segments.
        method (str): Method for FFT computation ('numpy', 'scipy', 'pyfftw'). Default is 'numpy'.

    Returns:
        ndarray: A 3D array containing the Fourier transform for each segment and channel.
    """
    start=time.time()
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]

    segs = ([data[pos:pos+seg_len] for pos in positions])
    segs = [seg.T[ind_a:ind_e] for seg in segs]

    nseg = positions.shape[0]
    Fsegs = np.zeros((nseg, ind_e-ind_a, ind_f))
    step1=time.time()
    print("Time to create segments: ", step1-start)
        
    try:
        with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'rb') as f:
            wisdom = pickle.load(f)
            pyfftw.import_wisdom(wisdom)
            print("Found a wisdom file.")
    except FileNotFoundError:
        print("No wisdom file found. Starting without wisdom.")
        
    step2=time.time()
    print("Time to load wisdom: ", step2-step1)

    # Pre-allocate the input and output arrays for FFTW
    fft_input = pyfftw.empty_aligned(seg_len, dtype='complex128')
    fft_output = pyfftw.empty_aligned(seg_len, dtype='complex128')

    step3=time.time()
    print("Time to pre-allocate arrays: ", step3-step2)


    # Create FFTW object
    fft_object = pyfftw.FFTW(fft_input, fft_output, flags=['FFTW_ESTIMATE'], threads=mp.cpu_count()//2)
    step4=time.time()
    print("Time to create FFTW object: ", step4-step3)

    for i in range(nseg):
        for channel_number, channel in enumerate(segs[i]):
            #step5=time.time()
            fft_input[:] = taper * channel  # Apply taper
            #step6=time.time()
            #print("Time to apply taper: ", step6-step5)
            fft_object()  # Execute FFT
            fourier_transformed = ((10 * np.log(np.abs(fft_output) ** 2)))[0:ind_f] # Compute power spectrum
            fourier_transformed[0] = 0 # Remove DC component (avarage value of the signal)
            Fsegs[i][channel_number] = fourier_transformed

    with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'wb') as f:
        pickle.dump(pyfftw.export_wisdom(), f)


    return Fsegs


In [6]:
# Assuming 'files' is a list of file paths and 'nFiles' is the total number of files
nFiles = 2  # Set this to the actual number of files you want to process
#methods = ['numpy', 'scipy', 'pyfftw']  # FFT methods to benchmark
methods = ['pyfftw']
#methods = ['scipy']
# methods = ['numpy']

# Loop over each method first
for method in methods:
    print(f"Benchmarking method: {method}")
    # Then loop over each file for the current method
    for file_index in range(nFiles):
        seg_len = args["seg_len"]
        hop = args["hop"]
        N = args["N"]

        f = h5py.File(files[file_index], 'r')
        dset = f['Acoustic']
        data = np.array(dset)

        taper = signal.windows.tukey(seg_len, 0.25)

        if file_index != nFiles - 1:
            g = h5py.File(files[file_index + 1], 'r')
            dset2 = g['Acoustic']
            data2 = np.array(dset2)
            data = np.concatenate((data, data2[0:seg_len]), axis=0)

        j = file_index + 1
        file_pos = file_index * N

        if file_index != nFiles - 1:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - 1) / hop) + 1, dtype=int) * hop - file_pos
        else:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - seg_len) / hop) + 1, dtype=int) * hop - file_pos

        # Benchmark the current method for the current file
        start = time.time()
        Fsegs = channel_fourier(data, args, taper, positions, method=method)
        end = time.time()
        print(f"Time elapsed for {method} fft in file {file_index}:", end - start)

Benchmarking method: pyfftw
Time to create segments:  0.00022363662719726562
Found a wisdom file.
Time to load wisdom:  0.0011110305786132812
Time to pre-allocate arrays:  6.127357482910156e-05
Time to create FFTW object:  0.00045418739318847656
Time elapsed for pyfftw fft in file 0: 29.494694471359253
Time to create segments:  0.00021982192993164062
Found a wisdom file.
Time to load wisdom:  0.0010890960693359375
Time to pre-allocate arrays:  4.3392181396484375e-05
Time to create FFTW object:  0.00045228004455566406
Time elapsed for pyfftw fft in file 1: 28.44540238380432


Using rfft from the builders module

In [9]:
def channel_fourier(data, args, taper, positions, method='numpy'):
    """
    Applies Fourier Transformation to segments of DAS records using specified method.

    Args:
        data (ndarray): The raw data from DAS channels.
        args (dict): Contains parameters for Fourier Transform such as segment length and indices.
        taper (ndarray): The taper function to apply before the Fourier transform.
        positions (ndarray): The positions of the segments.
        method (str): Method for FFT computation ('numpy', 'scipy', 'pyfftw'). Default is 'numpy'.

    Returns:
        ndarray: A 3D array containing the Fourier transform for each segment and channel.
    """
    start=time.time()
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]

    segs = ([data[pos:pos+seg_len] for pos in positions])
    segs = [seg.T[ind_a:ind_e] for seg in segs]

    nseg = positions.shape[0]
    Fsegs = np.zeros((nseg, ind_e-ind_a, ind_f))
    step1=time.time()
    print("Time to create segments: ", step1-start)

    # Pre-allocate the input array for FFTW
    fft_input = pyfftw.empty_aligned(seg_len, dtype='float64')

    # Create FFTW object
    fft_object = pyfftw.builders.rfft(fft_input, planner_effort='FFTW_ESTIMATE', threads=mp.cpu_count()//2)

    for i in range(nseg):
        for channel_number, channel in enumerate(segs[i]):
            fft_input[:] = taper * channel  # Apply taper
            fft_output = fft_object()  # Execute FFT
            fourier_transformed = ((10 * np.log(np.abs(fft_output) ** 2)))[0:ind_f] # Compute power spectrum
            fourier_transformed[0] = 0 # Remove DC component (average value of the signal)
            Fsegs[i][channel_number] = fourier_transformed

    return Fsegs


In [10]:
# Assuming 'files' is a list of file paths and 'nFiles' is the total number of files
nFiles = 2  # Set this to the actual number of files you want to process
#methods = ['numpy', 'scipy', 'pyfftw']  # FFT methods to benchmark
methods = ['pyfftw']
#methods = ['scipy']
# methods = ['numpy']

# Loop over each method first
for method in methods:
    print(f"Benchmarking method: {method}")
    # Then loop over each file for the current method
    for file_index in range(nFiles):
        seg_len = args["seg_len"]
        hop = args["hop"]
        N = args["N"]

        f = h5py.File(files[file_index], 'r')
        dset = f['Acoustic']
        data = np.array(dset)

        taper = signal.windows.tukey(seg_len, 0.25)

        if file_index != nFiles - 1:
            g = h5py.File(files[file_index + 1], 'r')
            dset2 = g['Acoustic']
            data2 = np.array(dset2)
            data = np.concatenate((data, data2[0:seg_len]), axis=0)

        j = file_index + 1
        file_pos = file_index * N

        if file_index != nFiles - 1:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - 1) / hop) + 1, dtype=int) * hop - file_pos
        else:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - seg_len) / hop) + 1, dtype=int) * hop - file_pos

        # Benchmark the current method for the current file
        start = time.time()
        Fsegs = channel_fourier(data, args, taper, positions, method=method)
        end = time.time()
        print(f"Time elapsed for {method} fft in file {file_index}:", end - start)

Benchmarking method: pyfftw
Time to create segments:  0.00027251243591308594
Time to load wisdom:  9.703636169433594e-05
Time elapsed for pyfftw fft in file 0: 18.18965983390808
Time to create segments:  0.0002493858337402344
Time to load wisdom:  4.76837158203125e-05
Time elapsed for pyfftw fft in file 1: 17.55069351196289


Using dask rfft instead of the direct FFTW object

In [7]:
def channel_fourier(data, args, taper, positions, method='numpy'):
    
    # Enable the pyfftw cache
    pyfftw.interfaces.cache.enable()
    start=time.time()
    seg_len = args["seg_len"]
    ind_e, ind_a = args["ind_e"], args["ind_a"]
    ind_f = args["ind_f"]

    segs = ([data[pos:pos+seg_len] for pos in positions])
    segs = [seg.T[ind_a:ind_e] for seg in segs]

    nseg = positions.shape[0]
    Fsegs = np.zeros((nseg, ind_e-ind_a, ind_f))
    step1=time.time()
    print("Time to create segments: ", step1-start)
        
    try:
        with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'rb') as f:
            wisdom = pickle.load(f)
            pyfftw.import_wisdom(wisdom)
            print("Found a wisdom file.")
    except FileNotFoundError:
        print("No wisdom file found. Starting without wisdom.")
        
    step2=time.time()
    print("Time to load wisdom: ", step2-step1)

    for i in range(nseg):
        for channel_number, channel in enumerate(segs[i]):
            # Convert the channel data to a Dask array
            channel_da = da.from_array(channel, chunks=seg_len)
            
            # Apply taper and compute FFT using pyFFTW
            fft_output = dafft.rfft(taper * channel_da)
            
            # Compute the result
            fourier_transformed = ((10 * np.log(np.abs(fft_output.compute()) ** 2)))[0:ind_f]
            fourier_transformed[0] = 0
            Fsegs[i][channel_number] = fourier_transformed

    with open(f'{repo_dir}/code/notebooks/fftw_wisdom.pkl', 'wb') as f:
        pickle.dump(pyfftw.export_wisdom(), f)

    return Fsegs

In [None]:
# Assuming 'files' is a list of file paths and 'nFiles' is the total number of files
nFiles = 2  # Set this to the actual number of files you want to process
#methods = ['numpy', 'scipy', 'pyfftw']  # FFT methods to benchmark
methods = ['pyfftw']
#methods = ['scipy']
# methods = ['numpy']

# Loop over each method first
for method in methods:
    print(f"Benchmarking method: {method}")
    # Then loop over each file for the current method
    for file_index in range(nFiles):
        seg_len = args["seg_len"]
        hop = args["hop"]
        N = args["N"]

        f = h5py.File(files[file_index], 'r')
        dset = f['Acoustic']
        data = np.array(dset)

        taper = signal.windows.tukey(seg_len, 0.25)

        if file_index != nFiles - 1:
            g = h5py.File(files[file_index + 1], 'r')
            dset2 = g['Acoustic']
            data2 = np.array(dset2)
            data = np.concatenate((data, data2[0:seg_len]), axis=0)

        j = file_index + 1
        file_pos = file_index * N

        if file_index != nFiles - 1:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - 1) / hop) + 1, dtype=int) * hop - file_pos
        else:
            positions = np.arange(np.ceil((j - 1) * N / hop), np.floor((j * N - seg_len) / hop) + 1, dtype=int) * hop - file_pos

        # Benchmark the current method for the current file
        start = time.time()
        Fsegs = channel_fourier(data, args, taper, positions, method=method)
        end = time.time()
        print(f"Time elapsed for {method} fft in file {file_index}:", end - start)