In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Apply the 'tableau-colorblind10' style sheet to all subsequent figures.
plt.style.use('tableau-colorblind10')
# Interactive widget backend, currently disabled; uncomment to enable it.
#%matplotlib widget

In [None]:
from scipy.signal import welch, get_window
from obspy.signal.filter import bandpass

from obspy.clients.filesystem.sds import Client
from obspy.clients.fdsn import RoutingClient
from obspy.core import UTCDateTime as UTC
from obspy.signal import util

In [None]:
import h5py

### Partial read of HDF5

In [None]:
# Paths of the two HDF5 archives (one per date range) used in this section.
fname1, fname2 = (
    'data/GR.BFO..HHZ_2020-12-25_2020-12-31.hdf5',
    'data/GR.BFO..HHZ_2021-01-01_2021-01-09.hdf5',
)

In [None]:
# Open the first archive read-only; `fin` stays open for the cells that follow.
# NOTE(review): no matching close is visible in this notebook — confirm.
fin = h5py.File(fname1, mode='r')

In [None]:
# Size in bytes that the Python file-handle object reports for itself.
# NOTE(review): presumably unrelated to the size of the file on disk — confirm.
fin.__sizeof__()

In [None]:
# Look up the 'amplitudes' entry by name; no slicing is applied here.
# NOTE(review): presumably a lazy handle, with data read only when indexed — confirm.
amps = fin['amplitudes']

In [None]:
# Display the shape of the 'amplitudes' entry.
amps.shape

In [None]:
# Ten-element list, deliberately built with a comprehension: the size printed
# below is the one the list object reports for itself.
l1 = [k for k in range(10)]
print(l1.__sizeof__())

In [None]:
# Same experiment with one hundred elements, for comparison with the
# ten-element list.
l2 = [k for k in range(100)]
print(l2.__sizeof__())

In [None]:
# Size in bytes that a small integer object reports for itself.
x = 10
int.__sizeof__(x)

Load the NumPy arrays

In [None]:
# Load the two precomputed arrays (PXX first, then AMP) from their .npy files.
PXX, AMP = (
    np.load(path)
    for path in (
        "data/GR.BFO..HHZ_2021-07-20T00:00:00.000000Z_2021-10-25T00:00:00.000000Z_PXX.npy",
        "data/GR.BFO..HHZ_2021-07-20T00:00:00.000000Z_2021-10-25T00:00:00.000000Z_AMP.npy",
    )
)

In [None]:
# Print shape and total element count, first for PXX and then for AMP.
for arr in (PXX, AMP):
    print(arr.shape, arr.size)

Check file sizes of numpy data

In [None]:
# IPython line magic: long listing with human-readable sizes of the .npy files in data/.
%ls -lh data/*.npy

#### HDF5

In [None]:
# Scratch HDF5 file in the working directory, opened in write mode ('w').
f = h5py.File('mmytest.hdf5', mode='w')

In [None]:
# Store the AMP array in the scratch file under the key 'amp'.
f.create_dataset(name='amp', data=AMP)

In [None]:
# Walk over the top-level keys of the file and print each key together with
# the `.name` of the object stored under it. `k` and `v` stay bound afterwards.
for k in f.keys():
    v = f[k]
    print(k, v.name)


In [None]:
# Show which Python type is returned for the stored 'amp' entry.
amp_entry = f['amp']
print(type(amp_entry))

To convert an HDF5 dataset into a NumPy array, simply call `np.array()` on it.

In [None]:
# Convert the stored dataset into an in-memory NumPy array. The dataset is
# looked up by name so that this cell does not depend on a loop variable left
# over from an earlier cell.
x = np.array(f['amp'])

print(type(x))

Put each numpy-array in hdf5 and check file size

In [None]:
# Store PXX gzip-compressed at level 1. The filter and its level are spelled
# out explicitly instead of the integer shorthand `compression=1`, which h5py
# interprets as a gzip level.
with h5py.File("data/GR.BFO..HHZ_PXX.hdf5", "w") as fout:
    fout.create_dataset('pxx', data=PXX, compression='gzip', compression_opts=1)

In [None]:
# Write AMP to its own file without any compression option, so that the file
# sizes can be compared.
with h5py.File("data/GR.BFO..HHZ_AMP.hdf5", mode="w") as fout:
    fout.create_dataset(name='amp', data=AMP)

In [None]:
# IPython line magic: long listing with human-readable sizes of the .hdf5 files in data/.
%ls -lh data/*.hdf5

Clear all items from FileObject

In [None]:
# Remove every item from the scratch file, then close the handle: the file is
# not used again below and the name `f` is rebound later, so leaving it open
# would leak the handle.
f.clear()
f.close()

### How to put data into HDF5 row by row?

In [None]:
# Display the shape of the array that is written slice by slice below.
PXX.shape

In [None]:
# Target file for the row-by-row writing experiment.
fname = 'data/GR.BFO..HHZ_PXX.hdf5'

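Initialize the file with a fixed initial shape (later this will be a full year) and a slightly larger maximum shape that leaves room for one extra row (later: a leap year).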

In [None]:
# Create the file in write mode with a 'pxx' dataset of 97 x 24 x 1025 that may
# later grow to 98 entries along the first axis.
# NOTE(review): no dtype is passed, so h5py's default applies — confirm that it
# matches the dtype of PXX.
with h5py.File(fname, mode="w") as fout:
    fout.create_dataset(
        name='pxx',
        shape=(97, 24, 1025),
        maxshape=(98, 24, 1025),
    )

In [None]:
# Write PXX one slice at a time, reopening the file in append mode for every
# slice. `i` and `row` remain bound to the last index and slice after the loop.
for i, row in enumerate(PXX):
    with h5py.File(fname, mode="a") as fout:
        fout['pxx'][i, :, :] = row

Now let's write the last row once more, into the next (not yet existing) row.

In [None]:
# Try to write one slot past the last filled row. If that slot does not exist
# yet (IndexError), grow the first axis to i + 2 entries and repeat the write.
with h5py.File(fname, mode="a") as fout:
    try:
        fout['pxx'][i + 1, :, :] = row
    except IndexError:
        fout['pxx'].resize(i + 2, axis=0)
        fout['pxx'][i + 1, :, :] = row

And again with `i` increased by one. Hopefully this does not work, because the row index would then exceed the dataset's maximum shape.

In [None]:
# Move one row further and try to append again. This is expected to fail: the
# retry after resizing targets a row that is still out of range.
# Note that `i` is advanced on every execution of this cell.
i = i+1
with h5py.File(fname, "a") as fout:
    try:
        fout['pxx'][i+1,:,:] = row
    except IndexError:
        # Resize the first axis to 98 entries, then retry the write.
        fout['pxx'].resize(98, axis=0)
        try:
            fout['pxx'][i+1,:,:] = row
        except IndexError as err:
            # The failure is the demonstration: report it instead of letting
            # the exception abort a "Restart & Run All".
            print(f"IndexError: {err}")

No, it does not: the write raises an `IndexError`.

In [None]:
# Explicitly close the handle most recently bound to `fout`.
# NOTE(review): every `fout` above is managed by a `with` block, so this call
# looks redundant — confirm before removing.
fout.close()

In [None]:
# From here on `fname` points to a different file than in the cells above.
fname = 'data/GR.BFO..HHZ.hdf5'

In [None]:
# Store both arrays in one file. The file is opened in append mode, so a
# dataset left over from a previous run of this cell is removed first;
# otherwise `create_dataset` would fail because the name already exists.
with h5py.File(fname, "a") as fout:
    for name, values in (('PXX', PXX), ('AMP', AMP)):
        if name in fout:
            del fout[name]
        fout.create_dataset(name, data=values)

### Runtime of IO

Which one is faster? 

- Open and close the file every time we add a row (simulates a daily update)
- Open the file once, add all the rows one by one, then close it (simulates batch processing of existing data)

Unsurprisingly, the latter is faster, although the difference is smaller than expected. Probably, looping over the array alone already costs a lot of time.

In [None]:
%%timeit -r 20 -n 20
# Variant 1: reopen the file in append mode for every single row.
# Every timed run first recreates the file in "w" mode with a fresh 'pxx'
# dataset, so whatever was stored under `fname` before is replaced.
with h5py.File(fname, "w") as fout:
    fout.create_dataset('pxx', shape=(97,24,1025), maxshape=(98, 24, 1025))
    
for i, row in enumerate(PXX):
    with h5py.File(fname, "a") as fout:
        fout['pxx'][i,:,:] = row

In [None]:
%%timeit -r 20 -n 20
# Variant 2: open the file once in append mode and write all rows inside that
# single `with` block.
# Every timed run first recreates the file in "w" mode with a fresh 'pxx'
# dataset, so whatever was stored under `fname` before is replaced.
with h5py.File(fname, "w") as fout:
    fout.create_dataset('pxx', shape=(97,24,1025), maxshape=(98, 24, 1025))
    
with h5py.File(fname, "a") as fout:
    for i, row in enumerate(PXX):
        fout['pxx'][i,:,:] = row

In [None]:
# IPython line magic: list the .hdf5 files in data/ again, with human-readable sizes.
%ls -lh data/*.hdf5

In [None]:
# Reopen the file read-only, print the shape of 'pxx' and copy its contents
# into the NumPy array `data`. `dataset` keeps referring to the on-disk entry
# obtained inside this block.
with h5py.File(fname, mode="r") as f:
    dataset = f['pxx']
    print(dataset.shape)
    data = np.array(dataset)

In [None]:
# NOTE(review): `dataset` was obtained inside a `with` block that has already
# exited (from a file opened with "r"), and `resize` is called here without a
# size. Presumably this cell raises as written — confirm the intended size and
# axis, and run it on a file that is open for writing.
dataset.resize()

In [None]:
# IPython line magic that lists the available magic commands (exploration aid).
%lsmagic

In [None]:

# Size in bytes that the PXX array object reports for itself.
PXX.__sizeof__()