In [3]:
%%script false --no-raise-error
import zarr
from netCDF4 import Dataset
if __name__ == '__main__':
    # TODO: read up on HDF5, Zarr, netCDF, xarray, MPI, Lustre and Ceph,
    # and check back with DKRZ about their structure, code conventions and datasets.

    # TODO: set up test cases proactively to ensure proper comparability and best practice.

    # TODO: use a netCDF dataset for testing (for now); it should already be HDF5.
    # The file is kept open for the whole copy step: previously it was closed
    # before being handed to zarr.copy_all, so the copy operated on a closed handle.
    with Dataset("data/source.nc", "w", format="NETCDF4") as dest_hdf:
        print(dest_hdf.data_model)

        # TODO: import netCDF data through HDF5 to Zarr (for now).
        dest_zarr = zarr.open_group('data/example2.zarr', mode='w')
        # NOTE(review): zarr.copy_all is documented for h5py/zarr groups; confirm
        # that a netCDF4.Dataset is accepted here, or reopen the file with h5py.
        zarr.copy_all(dest_hdf, dest_zarr)

    # A bare expression inside an `if` body is not displayed by the notebook,
    # so the tree is printed explicitly.
    print(dest_zarr.tree())

    # TODO: conversion to netCDF.

    # For HDF5 nothing needs to be done.

    # For Zarr (WIP): netCDF is developing its own implementation,
    # but that isn't available yet, so I will work on something myself in the meantime.

    # TODO: set up Lustre and Ceph (need info on that).

    # TODO: set up benchmark (probably compression, file size, access time, r/w time) for sequential access.

    # TODO: set up benchmark for parallel access.

    # TODO: set up benchmark for random access.

    # TODO: set up benchmark for parallel access with subfiling and async I/O.


In [4]:
%%script false --no-raise-error
from mpi4py import MPI
import numpy as np
import h5py

comm = MPI.COMM_WORLD
rank = comm.rank

# Full shape of every dataset; the first axis is split across the MPI ranks.
shape = (1_000_000, 10, 10, 1)

# Contiguous slab of rows owned by this rank. Integer arithmetic spreads any
# remainder over the ranks without gaps or overlap.
row_start = rank * shape[0] // comm.size
row_stop = (rank + 1) * shape[0] // comm.size

# create HDF5 file
# The `with` block closes the file on exit, so no explicit close() is needed.
with h5py.File('data/test_ds.hdf5', 'w', driver="mpio", comm=comm) as hf:
    for name in ("u", "v", "w", "x", "y"):
        # Every rank issues the identical create_dataset call. dtype is given
        # explicitly because the data is no longer passed via `data=`;
        # float64 matches what np.random.rand produces.
        dset = hf.create_dataset(name, shape=shape, dtype=np.float64, compression="gzip", chunks=True)

        # Each rank generates and writes only its own slab instead of every
        # rank writing the whole array with different random values.
        # NOTE(review): writes to compressed datasets under the mpio driver are
        # expected to need collective mode; confirm against the installed
        # HDF5/h5py build.
        with dset.collective:
            dset[row_start:row_stop] = np.random.rand(row_stop - row_start, *shape[1:])

In [20]:
%%time
import xarray as xr
import numpy as np

# Six independent random fields of identical shape, drawn in the order u..z.
shape = (1_000_000, 10, 10, 1)
u, v, w, x, y, z = (np.random.rand(*shape) for _ in range(6))

# Every variable shares the same four dimensions, named "1" through "4".
dims = ["1", "2", "3", "4"]
fields = {"u": u, "v": v, "w": w, "x": x, "y": y, "z": z}
ds = xr.Dataset(data_vars={name: (dims, values) for name, values in fields.items()})

# Write the same dataset three times: netCDF with the default engine,
# netCDF through the h5netcdf engine, and a Zarr store.
ds.to_netcdf('data/test_dataset.nc', mode="w")
ds.to_netcdf('data/test_dataset.h5', mode="w", engine="h5netcdf")
ds.to_zarr("data/test_dataset.zarr", mode="w")



CPU times: user 6.55 s, sys: 8.33 s, total: 14.9 s
Wall time: 13.7 s


<xarray.backends.zarr.ZarrStore at 0x7fdc12d9fd40>

In [11]:
#dataset creation for plotting
import numpy as np

# One independent, initially empty list of timing samples per storage format.
plotting_zarr, plotting_hdf5, plotting_netcdf4 = [], [], []


In [16]:
%%time 
import xarray as xr
import time

# Time only the open call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short intervals; time.time() can jump when the
# system clock is adjusted.
start_time = time.perf_counter()
ds_zarr = xr.open_zarr('data/test_dataset.zarr', consolidated=True)
# Each execution of this cell appends one more sample to the list.
plotting_zarr.append(time.perf_counter() - start_time)

CPU times: user 1.18 ms, sys: 411 μs, total: 1.59 ms
Wall time: 1.22 ms


In [4]:
%%time 
import xarray as xr
import time

# Time the max reduction over variable "x" of the Zarr-backed dataset.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
max_var = ds_zarr["x"].max().compute()
# Each execution of this cell appends one more sample to the list.
plotting_zarr.append(time.perf_counter() - start_time)


CPU times: user 633 ms, sys: 372 ms, total: 1 s
Wall time: 1.14 s


In [5]:
%%time
import xarray as xr
import time

# Time only the open call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short intervals; time.time() can jump when the
# system clock is adjusted.
# NOTE(review): the .nc file is opened with the h5netcdf engine, the same
# engine used for the .h5 file; confirm whether engine="netcdf4" was intended
# for this measurement.
start_time = time.perf_counter()
ds_netcdf4 = xr.open_dataset(filename_or_obj="data/test_dataset.nc", engine="h5netcdf")
# Each execution of this cell appends one more sample to the list.
plotting_netcdf4.append(time.perf_counter() - start_time)


CPU times: user 26.3 ms, sys: 0 ns, total: 26.3 ms
Wall time: 68.6 ms


In [6]:
%%time
import xarray as xr
import time

# Time the max reduction over variable "x" of the dataset opened from the .nc file.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
max_var = ds_netcdf4["x"].max().compute()
# Each execution of this cell appends one more sample to the list.
plotting_netcdf4.append(time.perf_counter() - start_time)

CPU times: user 0 ns, sys: 209 ms, total: 209 ms
Wall time: 266 ms


In [7]:
%%time
import xarray as xr
import time

# Time only the open call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring short intervals; time.time() can jump when the
# system clock is adjusted.
start_time = time.perf_counter()
ds_hdf5 = xr.open_dataset(filename_or_obj="data/test_dataset.h5", engine="h5netcdf")
# Each execution of this cell appends one more sample to the list.
plotting_hdf5.append(time.perf_counter() - start_time)

CPU times: user 0 ns, sys: 10.3 ms, total: 10.3 ms
Wall time: 9.79 ms


In [8]:
%%time
import xarray as xr
import time

# Time the max reduction over variable "x" of the dataset opened from the .h5 file.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
max_var = ds_hdf5["x"].max().compute()
# Each execution of this cell appends one more sample to the list.
plotting_hdf5.append(time.perf_counter() - start_time)

CPU times: user 0 ns, sys: 199 ms, total: 199 ms
Wall time: 272 ms
