In [59]:
%%script false --no-raise-error
%%sh
export ZARR_V3_EXPERIMENTAL_API=1

In [60]:
%%time
# Create Testing Dataset, Change Parameters to vary size 
import xarray as xr
import numpy as np

u = np.random.rand(1_000_000, 10, 10, 1)
v = np.random.rand(1_000_000, 10, 10, 1)
w = np.random.rand(1_000_000, 10, 10, 1)
x = np.random.rand(1_000_000, 10, 10, 1)
y = np.random.rand(1_000_000, 10, 10, 1)
z = np.random.rand(1_000_000, 10, 10, 1)

ds = xr.Dataset(data_vars=dict(
                            u=(["1","2","3","4"], u),
                            v=(["1","2","3","4"], v),
                            w=(["1","2","3","4"], w),
                            x=(["1","2","3","4"], x),
                            y=(["1","2","3","4"], y),
                            z=(["1","2","3","4"], z)
                            ))

ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)

ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")

ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf")


CPU times: user 7.04 s, sys: 8.57 s, total: 15.6 s
Wall time: 17.3 s


In [61]:
#dataset creation for plotting

zarr_op_time = []
netcdf4_op_time = []
hdf5_op_time = []

zarr_read_time = []
netcdf4_read_time = []
hdf5_read_time = []

zarr_write_time = []
netcdf4_write_time = []
hdf5_write_time = []

In [62]:
%%time
#Open benchmark zarr
import zarr
import time

for i in range(10000):
    start_time = time.time()
    ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)
    zarr_op_time.append(time.time() - start_time) 
     


CPU times: user 229 ms, sys: 130 ms, total: 359 ms
Wall time: 329 ms


In [63]:
%%time 
#Read benchmark zarr
import time
import numpy as np
import zarr

ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)

for i in range(1000):
    #print(f"Currently in i: {i}")
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
    
    start_time = time.time()
    tmp = ds_zarr["x"][val_picked]
    zarr_read_time.append(time.time() - start_time) 
    
    #print(f"Current val read at {val_picked}: {tmp}")
    

CPU times: user 2.69 s, sys: 3.03 s, total: 5.72 s
Wall time: 5.47 s


In [64]:
%%time 
#Write benchmark zarr
import time
import numpy as np
import zarr

ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)

for i in range(1000):
    #print(f"Currently in i: {i}")
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
    prev = ds_zarr["x"][val_picked]
    curr = ds_zarr["x"][val_picked]

    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
    
    start_time = time.time()
    ds_zarr["x"][val_picked] = np.random.rand(10,10,1)
    zarr_write_time.append(time.time() - start_time) 
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")
    

CPU times: user 24.8 s, sys: 47.1 s, total: 1min 11s
Wall time: 1min 11s


In [65]:
%%time 
#Open Benchmark netcdf4
import time
import netCDF4

for i in range(10000):
    start_time = time.time()
    ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    netcdf4_op_time.append(time.time() - start_time)

CPU times: user 12.6 s, sys: 1.4 s, total: 14 s
Wall time: 12.8 s


In [66]:
%%time
#Read Benchmark netcdf4
import time
import numpy as np
import netCDF4

ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
        
    start_time = time.time()
    tmp = ds_netcdf4.variables["x"][val_picked]
    netcdf4_read_time.append(time.time() - start_time)
        
    #print(f"Current val read at {val_picked}: {tmp}")

CPU times: user 95.7 ms, sys: 0 ns, total: 95.7 ms
Wall time: 86.8 ms


In [67]:
%%time
#Write Benchmark netcdf4
import time
import numpy as np
import netCDF4

ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp = 0
    prev = ds_netcdf4.variables["x"][val_picked]
    curr = ds_netcdf4.variables["x"][val_picked]

    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
        
    start_time = time.time()
    ds_netcdf4.variables["x"][val_picked] = np.random.rand(10,10,1)
    netcdf4_write_time.append(time.time() - start_time)
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")

CPU times: user 3.82 s, sys: 12.5 s, total: 16.4 s
Wall time: 17.9 s


In [68]:
%%time
#Open Benchmark hdf5
import h5py
import time

for i in range(10000):
    start_time = time.time()
    ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    hdf5_op_time.append(time.time() - start_time)

CPU times: user 125 ms, sys: 117 ms, total: 242 ms
Wall time: 221 ms


In [69]:
%%time
#Read Benchmark hdf5
import time
import numpy as np
import h5py

ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
        
    start_time = time.time()
    tmp = ds_hdf5.get("x")[val_picked]
    hdf5_read_time.append(time.time() - start_time)
        
    #print(f"Current val read at {val_picked}: {tmp}")

CPU times: user 42.7 ms, sys: 9.73 ms, total: 52.4 ms
Wall time: 47.7 ms


In [70]:
%%time
#Write Benchmark hdf5
import time
import numpy as np
import h5py

ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
    
for i in range(1000):
    #print(f"Currently in i: {i}")
        
    val_picked = np.random.randint(low=0, high=1000000-1)
    tmp=0
    prev = ds_hdf5.get("x")[val_picked]
    curr = ds_hdf5.get("x")[val_picked]
        
    #print(f"prev item at {val_picked} is {type(prev)} and curr item at {val_picked} is {type(curr)}, both are the same : {(prev == curr).all()}")
    
    start_time = time.time()
    ds_hdf5.get("x")[val_picked] = np.random.rand(10,10,1)
    hdf5_write_time.append(time.time() - start_time)
    
    curr = ds_zarr["x"][val_picked]
    #print(f"Current val read at {val_picked}: {(prev == curr).all()}")

CPU times: user 3.89 s, sys: 12.7 s, total: 16.6 s
Wall time: 15.2 s


In [71]:
import pandas as pd

df_op = pd.DataFrame()
df_op.insert(0,"zarr_op_time", zarr_op_time)
df_op.insert(1,"netcdf4_op_time", netcdf4_op_time)
df_op.insert(2,"hdf5_op_time", hdf5_op_time)

df_read = pd.DataFrame()
df_read.insert(0,"zarr_read_time", zarr_read_time)
df_read.insert(1,"netcdf4_read_time", netcdf4_read_time)
df_read.insert(2,"hdf5_read_time", hdf5_read_time)

df_write = pd.DataFrame()
df_write.insert(0,"zarr_write_time", zarr_write_time)
df_write.insert(1,"netcdf4_write_time", netcdf4_write_time)
df_write.insert(2,"hdf5_write_time", hdf5_write_time)


In [72]:
#%%script false --no-raise-error
df_op.to_pickle("data/s_plotting_df_op.pk1")
df_read.to_pickle("data/s_plotting_df_read.pk1")
df_write.to_pickle("data/s_plotting_df_write.pk1")

In [73]:
import pandas as pd

df_op = pd.read_pickle("data/s_plotting_df_op.pk1")
df_read = pd.read_pickle("data/s_plotting_df_read.pk1")
df_write = pd.read_pickle("data/s_plotting_df_write.pk1")


In [74]:
import plotly.express as px

fig = px.box(data_frame=df_op, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_op, log_y=True, marginal="violin", barmode="group")
fig.show()

fig = px.box(data_frame=df_read, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_read, log_y=True, marginal="violin", barmode="group")
fig.show()

fig = px.box(data_frame=df_write, log_y=True)
fig.show()

fig = px.histogram(data_frame=df_write, log_y=True, marginal="violin", barmode="group")
fig.show()
