In [100]:
%%script false --no-raise-error
%%sh
export ZARR_V3_EXPERIMENTAL_API=1

In [101]:
%%time
# Create Testing Dataset, Change Parameters to vary size 
import xarray as xr
import numpy as np

u = np.random.rand(1_000_000, 10, 10, 1)
v = np.random.rand(1_000_000, 10, 10, 1)
w = np.random.rand(1_000_000, 10, 10, 1)
x = np.random.rand(1_000_000, 10, 10, 1)
y = np.random.rand(1_000_000, 10, 10, 1)
z = np.random.rand(1_000_000, 10, 10, 1)

ds = xr.Dataset(data_vars=dict(
                            u=(["1","2","3","4"], u),
                            v=(["1","2","3","4"], v),
                            w=(["1","2","3","4"], w),
                            x=(["1","2","3","4"], x),
                            y=(["1","2","3","4"], y),
                            z=(["1","2","3","4"], z)
                            ))

ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)

ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")

ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf")


CPU times: user 7.37 s, sys: 9.14 s, total: 16.5 s
Wall time: 14.9 s


In [102]:
#dataset creation for plotting

zarr_op_time = []
netcdf4_op_time = []
hdf5_op_time = []

zarr_read_time = []
netcdf4_read_time = []
hdf5_read_time = []

zarr_write_time = []
netcdf4_write_time = []
hdf5_write_time = []

In [103]:
%%time
#Open benchmark zarr
import zarr
import time

for i in range(10000):
    start_time = time.time()
    ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)
    zarr_op_time.append(time.time() - start_time) 
     


CPU times: user 258 ms, sys: 93.2 ms, total: 351 ms
Wall time: 316 ms


In [104]:
%%time 
#Read benchmark zarr
import time
import numpy as np
import zarr

ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)

for i in range(1000):
    #print(f"Currently in i: {i}")
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
    
    start_time = time.time()
    tmp = ds_zarr["x"][val_picked]
    zarr_read_time.append(time.time() - start_time) 
    
    #print(f"Current val read at {val_picked}: {tmp}")
    

CPU times: user 3.17 s, sys: 2.85 s, total: 6.02 s
Wall time: 5.64 s


In [105]:
%%time 
#Write benchmark zarr
import time
import numpy as np
import zarr

ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)

for i in range(1_000):
    #print(f"Currently in i: {i}")
    val_picked = np.random.randint(low=0, high=1_000_000-1)
    tmp=0
    prev = ds_zarr["x"][val_picked]
    curr = ds_zarr["x"][val_picked]

    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
    
    start_time = time.time()
    ds_zarr["x"][val_picked] = np.random.rand(10,10,1)
    zarr_write_time.append(time.time() - start_time) 
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")
    

CPU times: user 22.8 s, sys: 46.1 s, total: 1min 8s
Wall time: 1min 9s


In [106]:
%%time 
#Open Benchmark netcdf4
import time
import netCDF4

for i in range(1_0000):
    start_time = time.time()
    ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    netcdf4_op_time.append(time.time() - start_time)

CPU times: user 14 s, sys: 1.68 s, total: 15.7 s
Wall time: 17.7 s


In [107]:
%%time
#Read Benchmark netcdf4
import time
import numpy as np
import netCDF4

ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1_000_000-1)
    tmp=0
        
    start_time = time.time()
    tmp = ds_netcdf4.variables["x"][val_picked]
    netcdf4_read_time.append(time.time() - start_time)
        
    #print(f"Current val read at {val_picked}: {tmp}")

CPU times: user 79.8 ms, sys: 14.9 ms, total: 94.7 ms
Wall time: 84.7 ms


In [108]:
%%time
#Write Benchmark netcdf4
import time
import numpy as np
import netCDF4

ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1_000_000-1)
    tmp = 0
    prev = ds_netcdf4.variables["x"][val_picked]
    curr = ds_netcdf4.variables["x"][val_picked]

    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
        
    start_time = time.time()
    ds_netcdf4.variables["x"][val_picked] = np.random.rand(10,10,1)
    netcdf4_write_time.append(time.time() - start_time)
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")

CPU times: user 4.23 s, sys: 8.47 s, total: 12.7 s
Wall time: 11.4 s


In [109]:
%%time
#Open Benchmark hdf5
import h5py
import time

for i in range(10_000):
    start_time = time.time()
    ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    hdf5_op_time.append(time.time() - start_time)

CPU times: user 124 ms, sys: 15.3 ms, total: 139 ms
Wall time: 132 ms


In [110]:
%%time
#Read Benchmark hdf5
import time
import numpy as np
import h5py

ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    
for i in range(1_000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1_000_000-1)
    tmp=0
        
    start_time = time.time()
    tmp = ds_hdf5.get("x")[val_picked]
    hdf5_read_time.append(time.time() - start_time)
        
    #print(f"Current val read at {val_picked}: {tmp}")

CPU times: user 53.9 ms, sys: 0 ns, total: 53.9 ms
Wall time: 48.4 ms


In [111]:
%%time
#Write Benchmark hdf5
import time
import numpy as np
import h5py

ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    
for i in range(1_000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1_000_000-1)
    tmp=0
    prev = ds_hdf5.get("x")[val_picked]
    curr = ds_hdf5.get("x")[val_picked]
        
    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
    
    start_time = time.time()
    ds_hdf5.get("x")[val_picked] = np.random.rand(10,10,1)
    hdf5_write_time.append(time.time() - start_time)
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")

CPU times: user 3.79 s, sys: 9.3 s, total: 13.1 s
Wall time: 11.8 s


In [112]:
import pandas as pd

df_op = pd.DataFrame()
df_op.insert(0,"zarr_op_time", zarr_op_time)
df_op.insert(1,"netcdf4_op_time", netcdf4_op_time)
df_op.insert(2,"hdf5_op_time", hdf5_op_time)

df_read = pd.DataFrame()
df_read.insert(0,"zarr_read_time", zarr_read_time)
df_read.insert(1,"netcdf4_read_time", netcdf4_read_time)
df_read.insert(2,"hdf5_read_time", hdf5_read_time)

df_write = pd.DataFrame()
df_write.insert(0,"zarr_write_time", zarr_write_time)
df_write.insert(1,"netcdf4_write_time", netcdf4_write_time)
df_write.insert(2,"hdf5_write_time", hdf5_write_time)


In [113]:
#%%script false --no-raise-error
df_op.to_pickle("data/s_plotting_df_op.pk1")
df_read.to_pickle("data/s_plotting_df_read.pk1")
df_write.to_pickle("data/s_plotting_df_write.pk1")

In [114]:
import pandas as pd

df_op = pd.read_pickle("data/s_plotting_df_op.pk1")
df_read = pd.read_pickle("data/s_plotting_df_read.pk1")
df_write = pd.read_pickle("data/s_plotting_df_write.pk1")


In [115]:
import plotly.express as px

fig = px.box(data_frame=df_op, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_op, log_y=True, marginal="violin", barmode="group")
fig.show()

fig = px.box(data_frame=df_read, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_read, log_y=True, marginal="violin", barmode="group")
fig.show()

fig = px.box(data_frame=df_write, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_write, log_y=True, marginal="violin", barmode="group")
fig.show()
