In [3]:
%%script false --no-raise-error
# Scratch cell (disabled via `%%script false`): create a NetCDF4 file and copy
# its HDF5 structure into a Zarr group.
import h5py
import zarr
from netCDF4 import Dataset
if __name__ == '__main__':
    # TODO: read up on hdf5, zarr, netcdf, xarray, mpi, lustre, ceph
    # and check back with DKRZ about their structure, code conventions, datasets

    # TODO: set up test cases proactively to ensure proper comparability and best practice

    # TODO: use a netcdf dataset for testing (for now); it should already be hdf5
    # The context manager closes the file once the data model has been printed.
    with Dataset("data/source.nc", "w", format="NETCDF4") as dest_hdf:
        print(dest_hdf.data_model)

    # TODO: import netcdf data through hdf5 to zarr (for now)
    # The source must be an *open* group when it is copied, so the file written
    # above is reopened read-only through h5py instead of reusing the closed
    # netCDF4 handle.
    with h5py.File("data/source.nc", "r") as source_hdf:
        dest_zarr = zarr.open_group('data/example2.zarr', mode='w')
        zarr.copy_all(source_hdf, dest_zarr)
    # Inside an `if` body the value is not auto-displayed, so print it explicitly.
    print(dest_zarr.tree())

    # TODO: conversion to netcdf

    # for hdf5 nothing needs to be done

    # for zarr (wip): netcdf is developing its own implementation,
    # but that isn't available yet, so I will work on something myself in the meantime

    # TODO: set up Lustre and Ceph (need info on that)

    # TODO: set up benchmark (probably compression, file size, access time, r/w time) for sequential access

    # TODO: set up benchmark for parallel access

    # TODO: set up benchmark for random access

    # TODO: set up benchmark for parallel access with subfiling and async I/O


In [4]:
%%script false --no-raise-error
# Scratch cell (disabled via `%%script false`): write five gzip-compressed
# random datasets to one HDF5 file through the MPI-IO driver.
from mpi4py import MPI
import numpy as np
import h5py

comm = MPI.COMM_WORLD
rank = comm.rank

DATASET_SHAPE = (1_000_000, 10, 10, 1)

# Split the first axis into one contiguous row range per rank, so each rank
# generates and writes only its own rows instead of the whole array.
row_bounds = np.linspace(0, DATASET_SHAPE[0], comm.size + 1, dtype=int)
row_start, row_stop = int(row_bounds[rank]), int(row_bounds[rank + 1])

# The `with` block closes the file on exit; no explicit close() is needed.
with h5py.File('data/test_ds.hdf5', 'w', driver="mpio", comm=comm) as hf:
    for name in ("u", "v", "w", "x", "y"):
        # Dataset creation changes the file structure, so every rank makes the
        # identical call.
        dset = hf.create_dataset(name, shape=DATASET_SHAPE, dtype="f8", compression="gzip", chunks=True)
        # NOTE(review): writes to compressed datasets under MPI are expected to
        # need collective mode - confirm against the installed HDF5/h5py build.
        with dset.collective:
            dset[row_start:row_stop] = np.random.rand(row_stop - row_start, *DATASET_SHAPE[1:])

In [4]:
from dask.distributed import Client

# Start a dask.distributed client with default settings.
client = Client()
# Only the last expression of a cell is rendered automatically, so the client
# summary has to be displayed explicitly here.
display(client)
# NOTE(review): shutting down right away means later cells do not run on this
# client - confirm that is intended.
client.shutdown()

In [20]:
%%time
import xarray as xr
import numpy as np

u = np.random.rand(1_000_000, 10, 10, 1)
v = np.random.rand(1_000_000, 10, 10, 1)
w = np.random.rand(1_000_000, 10, 10, 1)
x = np.random.rand(1_000_000, 10, 10, 1)
y = np.random.rand(1_000_000, 10, 10, 1)
z = np.random.rand(1_000_000, 10, 10, 1)

ds = xr.Dataset(data_vars=dict(
                            u=(["1","2","3","4"], u),
                            v=(["1","2","3","4"], v),
                            w=(["1","2","3","4"], w),
                            x=(["1","2","3","4"], x),
                            y=(["1","2","3","4"], y),
                            z=(["1","2","3","4"], z)
                            ))

ds.to_netcdf('data/test_dataset.nc', mode="w")

ds.to_netcdf('data/test_dataset.h5', mode="w", engine="h5netcdf")

ds.to_zarr("data/test_dataset.zarr", mode="w")



CPU times: user 6.55 s, sys: 8.33 s, total: 14.9 s
Wall time: 13.7 s


<xarray.backends.zarr.ZarrStore at 0x7fdc12d9fd40>

In [5]:
# Empty sample lists for the timing measurements; the benchmark cells append
# one duration per iteration and the final cell plots them.
import numpy as np

# Time to open each store.
(
    plotting_zarr_opening_times,
    plotting_hdf5_opening_times,
    plotting_netcdf4_opening_times,
) = [], [], []

# Time to compute a reduction on each opened store.
(
    plotting_zarr_calculating_times,
    plotting_hdf5_calculating_times,
    plotting_netcdf4_calculating_times,
) = [], [], []


In [6]:
%%time 
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    ds_zarr = xr.open_zarr('data/test_dataset.zarr', consolidated=True)
    plotting_zarr_opening_times.append(time.time() - start_time)

CPU times: user 16.4 s, sys: 226 ms, total: 16.7 s
Wall time: 18.2 s


In [7]:
%%time 
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    max_var = ds_zarr["x"].max().compute()
    plotting_zarr_calculating_times.append(time.time() - start_time)


KeyboardInterrupt: 

In [21]:
%%time
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    ds_netcdf4 = xr.open_dataset(filename_or_obj="data/test_dataset.nc", engine="h5netcdf", chunks={"u": 100, "v": 100, "w": 100, "x": 100, "y": 100, "z": 100, })
    plotting_netcdf4_opening_times.append(time.time() - start_time)


ValueError: unrecognized chunk manager dask - must be one of: []

In [22]:
%%time
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    max_var = ds_netcdf4["x"].max().compute()
    plotting_netcdf4_calculating_times.append(time.time() - start_time)

KeyboardInterrupt: 

In [7]:
%%time
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    ds_hdf5 = xr.open_dataset(filename_or_obj="data/test_dataset.h5", engine="h5netcdf", chunks='auto')
    plotting_hdf5_opening_times.append(time.time() - start_time)

CPU times: user 1min 9s, sys: 79.4 ms, total: 1min 9s
Wall time: 1min 9s


In [8]:
%%time
import xarray as xr
import time

for i in range(10000):
    start_time = time.time()
    max_var = ds_hdf5["x"].max().compute()
    plotting_hdf5_calculating_times.append(time.time() - start_time)

CPU times: user 3min 19s, sys: 142 ms, total: 3min 19s
Wall time: 3min 20s


In [None]:
import plotly.express as px

# Timing samples collected by the benchmark cells, keyed by the figure title.
# Order: open timings first, then compute timings (zarr, hdf5, netcdf4 each).
timing_series = {
    "zarr open time [s]": plotting_zarr_opening_times,
    "hdf5 open time [s]": plotting_hdf5_opening_times,
    "netcdf4 open time [s]": plotting_netcdf4_opening_times,
    "zarr max() time [s]": plotting_zarr_calculating_times,
    "hdf5 max() time [s]": plotting_hdf5_calculating_times,
    "netcdf4 max() time [s]": plotting_netcdf4_calculating_times,
}

for label, samples in timing_series.items():
    # A benchmark cell that failed or was never run leaves its list empty;
    # report that instead of drawing an empty, unlabeled figure.
    if not samples:
        print(f"{label}: no samples recorded, skipping plots")
        continue
    # One box plot and one histogram per series, both with a log-scaled y axis.
    for make_figure in (px.box, px.histogram):
        fig = make_figure(samples, log_y=True, title=label)
        fig.show()
