In [1]:
%%script false --no-raise-error
%%sh
# This cell is disabled: the `%%script false --no-raise-error` line above hands
# the rest of the cell to the `false` program, so the export below never runs.
# NOTE(review): presumably kept as a reminder of how the Zarr v3 experimental
# API flag was set. A shell export would likely not reach the Python kernel
# anyway (confirm); `%env ZARR_V3_EXPERIMENTAL_API=1` may be what is wanted.
export ZARR_V3_EXPERIMENTAL_API=1

In [20]:
%%time
# Create the testing dataset. Change N_SAMPLES / FIELD_SHAPE to vary its size.
import xarray as xr
import numpy as np

N_SAMPLES = 1_000_000                      # length of the leading dimension
FIELD_SHAPE = (10, 10, 1)                  # shape of a single sample
DIM_NAMES = ["1", "2", "3", "4"]           # dimension labels shared by all variables
VAR_NAMES = ["u", "v", "w", "x", "y", "z"]
SEED = 0                                   # fixed seed so the files are reproducible

rng = np.random.default_rng(SEED)

# Every variable is an independent float64 array of shape (N_SAMPLES, *FIELD_SHAPE);
# with the defaults that is 0.8 GB per variable, all held in memory at once.
ds = xr.Dataset(
    data_vars={
        name: (DIM_NAMES, rng.random((N_SAMPLES, *FIELD_SHAPE)))
        for name in VAR_NAMES
    }
)

# Write the same data in the three formats that the benchmarks compare.
ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)

ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")

ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf")


CPU times: user 6.79 s, sys: 8.41 s, total: 15.2 s
Wall time: 14 s


In [23]:
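# Containers for the timing samples: the benchmark cells append to these lists, which are later turned into DataFrames for plotting.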

import zarr
import netCDF4
import h5py


# Timing samples in seconds, one list per (format, operation) pair.
# "op" = opening the store/file, "read" / "write" = single-sample access.
zarr_op_time, netcdf4_op_time, hdf5_op_time = [], [], []
zarr_read_time, netcdf4_read_time, hdf5_read_time = [], [], []
zarr_write_time, netcdf4_write_time, hdf5_write_time = [], [], []

In [24]:
%%time
#%%writefile func/open_zarr_benchmark.py
def open_zarr_benchmark():
    """Time 10,000 opens of the Zarr test store.

    Opens ``data/test_dataset.zarr`` in ``r+`` mode repeatedly and appends the
    elapsed time of each open (seconds) to the notebook-level list
    ``zarr_op_time``, which must already exist.
    """
    import time
    import zarr

    for _ in range(10_000):
        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() can be coarse and can jump.
        start_time = time.perf_counter()
        handle = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)
        zarr_op_time.append(time.perf_counter() - start_time)
        # Drop the handle outside the timed region.
        del handle
        
open_zarr_benchmark()

CPU times: user 213 ms, sys: 86.2 ms, total: 299 ms
Wall time: 274 ms


In [25]:
%%time
#%%writefile func/read_zarr_benchmark.py

#Read benchmark zarr

def read_zarr():
    """Time 1,000 random single-sample reads of variable ``x`` from the Zarr store.

    Opens ``data/test_dataset.zarr`` once, then reads randomly chosen entries
    along the first dimension of ``x``. The elapsed time of each read
    (seconds) is appended to the notebook-level list ``zarr_read_time``, which
    must already exist.
    """
    import time
    import numpy as np
    import zarr

    ds_zarr = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    # Take the index range from the stored array instead of hard-coding it, so
    # the benchmark follows the dataset size and can reach the last sample.
    n_samples = ds_zarr["x"].shape[0]

    for _ in range(1_000):
        # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
        val_picked = np.random.randint(low=0, high=n_samples)

        # NOTE(review): the timed expression includes the `ds_zarr["x"]` lookup
        # as well as the indexing itself, exactly as before; confirm that this
        # is the quantity that should be compared across formats.
        start_time = time.perf_counter()
        _ = ds_zarr["x"][val_picked]
        zarr_read_time.append(time.perf_counter() - start_time)
    
read_zarr()

CPU times: user 2.91 s, sys: 2.72 s, total: 5.63 s
Wall time: 5.39 s


In [None]:
%%time
#%%writefile func/write_zarr_benchmark.py

#Write benchmark zarr

def write_zarr():
    """Time 1,000 random single-sample writes to variable ``x`` of the Zarr store.

    Opens ``data/test_dataset.zarr`` once, then overwrites randomly chosen
    entries along the first dimension of ``x`` with fresh random data. The
    elapsed time of each write (seconds) is appended to the notebook-level
    list ``zarr_write_time``, which must already exist. The store is modified
    in place.
    """
    import time
    import numpy as np
    import zarr

    ds_zarr = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    # Index range and per-sample shape come from the stored array rather than
    # from hard-coded numbers, so the benchmark follows the dataset size.
    n_samples, *sample_shape = ds_zarr["x"].shape

    for _ in range(1_000):
        # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
        val_picked = np.random.randint(low=0, high=n_samples)
        # Generate the payload before starting the clock so only the write is timed.
        new_sample = np.random.rand(*sample_shape)

        # Untimed read before the write, kept from the original access pattern.
        _ = ds_zarr["x"][val_picked]

        start_time = time.perf_counter()
        ds_zarr["x"][val_picked] = new_sample
        zarr_write_time.append(time.perf_counter() - start_time)

        # Untimed read-back after the write, also kept from the original.
        _ = ds_zarr["x"][val_picked]
        
write_zarr()
    

In [2]:
%%time
#%%writefile func/open_netcdf4_benchmark.py
#Open Benchmark netcdf4

def open_netcdf4():
    """Time 10,000 opens of the NetCDF4 test file.

    Opens ``data/test_dataset.nc`` in ``r+`` mode repeatedly and appends the
    elapsed time of each open (seconds) to the notebook-level list
    ``netcdf4_op_time``, which must already exist.
    """
    import time
    import netCDF4

    for _ in range(10_000):
        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring short intervals.
        start_time = time.perf_counter()
        ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
        netcdf4_op_time.append(time.perf_counter() - start_time)
        # Close explicitly and outside the timed region, so no handle is left
        # to the garbage collector and closing never counts as open time.
        ds_netcdf4.close()

open_netcdf4()

NameError: name 'netcdf4_op_time' is not defined

In [14]:
%%time
#%%writefile func/read_netcdf4_benchmark.py
#Read Benchmark netcdf4

def read_netcdf4():
    """Time 1,000 random single-sample reads of variable ``x`` from the NetCDF4 file.

    Opens ``data/test_dataset.nc`` once, then reads randomly chosen entries
    along the first dimension of ``x``. The elapsed time of each read
    (seconds) is appended to the notebook-level list ``netcdf4_read_time``,
    which must already exist.
    """
    import time
    import numpy as np
    import netCDF4

    # The context manager closes the file when the benchmark ends or fails.
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        # Take the index range from the stored variable instead of hard-coding
        # it, so the benchmark follows the dataset size.
        n_samples = ds_netcdf4.variables["x"].shape[0]

        for _ in range(1_000):
            # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
            val_picked = np.random.randint(low=0, high=n_samples)

            start_time = time.perf_counter()
            _ = ds_netcdf4.variables["x"][val_picked]
            netcdf4_read_time.append(time.perf_counter() - start_time)

read_netcdf4()

CPU times: user 94.2 ms, sys: 0 ns, total: 94.2 ms
Wall time: 84.9 ms


In [None]:
%%time
#%%writefile func/write_netcdf4_benchmark.py
#Write Benchmark netcdf4


def write_netcdf4():
    """Time 1,000 random single-sample writes to variable ``x`` of the NetCDF4 file.

    Opens ``data/test_dataset.nc`` once, then overwrites randomly chosen
    entries along the first dimension of ``x`` with fresh random data. The
    elapsed time of each write (seconds) is appended to the notebook-level
    list ``netcdf4_write_time``, which must already exist. The file is
    modified in place.
    """
    import time
    import numpy as np
    import netCDF4

    # The context manager closes the file when the benchmark ends or fails.
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        # Index range and per-sample shape come from the stored variable rather
        # than from hard-coded numbers, so the benchmark follows the dataset size.
        n_samples, *sample_shape = ds_netcdf4.variables["x"].shape

        for _ in range(1_000):
            # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
            val_picked = np.random.randint(low=0, high=n_samples)
            # Generate the payload before starting the clock so only the write is timed.
            new_sample = np.random.rand(*sample_shape)

            # Untimed read before the write, kept from the original access pattern.
            _ = ds_netcdf4.variables["x"][val_picked]

            start_time = time.perf_counter()
            ds_netcdf4.variables["x"][val_picked] = new_sample
            netcdf4_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back from this same file. The previous code read from
            # `ds_zarr`, a name that does not exist in this function.
            _ = ds_netcdf4.variables["x"][val_picked]
        
write_netcdf4()

CPU times: user 4.13 s, sys: 13.4 s, total: 17.5 s
Wall time: 19.3 s


In [None]:
%%time
#%%writefile func/open_hdf5_benchmark.py
#Open Benchmark hdf5

def open_hdf5():
    """Time 10,000 opens of the HDF5 test file.

    Opens ``data/test_dataset.h5`` in ``r+`` mode repeatedly and appends the
    elapsed time of each open (seconds) to the notebook-level list
    ``hdf5_op_time``, which must already exist.
    """
    import h5py
    import time

    for _ in range(10_000):
        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring short intervals.
        start_time = time.perf_counter()
        ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
        hdf5_op_time.append(time.perf_counter() - start_time)
        # Close explicitly and outside the timed region, so no handle is left
        # to the garbage collector and closing never counts as open time.
        ds_hdf5.close()
        
open_hdf5()

CPU times: user 76.8 ms, sys: 53.9 ms, total: 131 ms
Wall time: 117 ms


In [None]:
%%time
#%%writefile func/read_hdf5_benchmark.py
#Read Benchmark hdf5

def read_hdf5():
    """Time 1,000 random single-sample reads of dataset ``x`` from the HDF5 file.

    Opens ``data/test_dataset.h5`` once, then reads randomly chosen entries
    along the first dimension of ``x``. The elapsed time of each read
    (seconds) is appended to the notebook-level list ``hdf5_read_time``, which
    must already exist.
    """
    import time
    import numpy as np
    import h5py

    # The context manager closes the file when the benchmark ends or fails.
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        # Take the index range from the stored dataset instead of hard-coding
        # it, so the benchmark follows the dataset size.
        n_samples = ds_hdf5.get("x").shape[0]

        for _ in range(1_000):
            # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
            val_picked = np.random.randint(low=0, high=n_samples)

            start_time = time.perf_counter()
            _ = ds_hdf5.get("x")[val_picked]
            hdf5_read_time.append(time.perf_counter() - start_time)
        
read_hdf5()

CPU times: user 44.1 ms, sys: 10.5 ms, total: 54.6 ms
Wall time: 49 ms


In [10]:
%%time
#%%writefile func/write_hdf5_benchmark.py
#Write Benchmark hdf5

def write_hdf5():
    """Time 1,000 random single-sample writes to dataset ``x`` of the HDF5 file.

    Opens ``data/test_dataset.h5`` once, then overwrites randomly chosen
    entries along the first dimension of ``x`` with fresh random data. The
    elapsed time of each write (seconds) is appended to the notebook-level
    list ``hdf5_write_time``, which must already exist. The file is modified
    in place.
    """
    import time
    import numpy as np
    import h5py

    # The context manager closes the file when the benchmark ends or fails.
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        # Index range and per-sample shape come from the stored dataset rather
        # than from hard-coded numbers, so the benchmark follows the dataset size.
        n_samples, *sample_shape = ds_hdf5.get("x").shape

        for _ in range(1_000):
            # randint's upper bound is exclusive: this draws 0 .. n_samples - 1.
            val_picked = np.random.randint(low=0, high=n_samples)
            # Generate the payload before starting the clock so only the write is timed.
            new_sample = np.random.rand(*sample_shape)

            # Untimed read before the write, kept from the original access pattern.
            _ = ds_hdf5.get("x")[val_picked]

            start_time = time.perf_counter()
            ds_hdf5.get("x")[val_picked] = new_sample
            hdf5_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back from this same file. The previous code read from
            # `ds_zarr`, a name that does not exist in this function.
            _ = ds_hdf5.get("x")[val_picked]
        
write_hdf5()

Overwriting func/write_hdf5_benchmark.py
CPU times: user 1.04 ms, sys: 189 μs, total: 1.23 ms
Wall time: 9.26 ms


In [22]:
import pandas as pd

# One table per operation, one column per storage format. The columns of a
# table must all have the same length, otherwise pandas raises.
df_op = pd.DataFrame().assign(**{
    "zarr_op_time": zarr_op_time,
    "netcdf4_op_time": netcdf4_op_time,
    "hdf5_op_time": hdf5_op_time,
})

df_read = pd.DataFrame().assign(**{
    "zarr_read_time": zarr_read_time,
    "netcdf4_read_time": netcdf4_read_time,
    "hdf5_read_time": hdf5_read_time,
})

df_write = pd.DataFrame().assign(**{
    "zarr_write_time": zarr_write_time,
    "netcdf4_write_time": netcdf4_write_time,
    "hdf5_write_time": hdf5_write_time,
})


In [23]:
#%%script false --no-raise-error
# Persist the three timing tables as pickle files.
import os

# to_pickle does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no data/plotting).
os.makedirs("data/plotting", exist_ok=True)

df_op.to_pickle("data/plotting/s_plotting_df_op.pk1")
df_read.to_pickle("data/plotting/s_plotting_df_read.pk1")
df_write.to_pickle("data/plotting/s_plotting_df_write.pk1")

In [21]:
import pandas as pd

# Reload the saved timing tables (open / read / write, in that order).
# Unpickling can execute arbitrary code: only load files you wrote yourself.
df_op, df_read, df_write = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "data/plotting/s_plotting_df_op.pk1",
        "data/plotting/s_plotting_df_read.pk1",
        "data/plotting/s_plotting_df_write.pk1",
    )
)


In [22]:
import plotly.express as px

# For each operation draw a box plot and a grouped histogram (with violin
# marginal) of the per-format timings, both on a logarithmic y axis.
# Titles are set so each figure can be identified when skimming the notebook.
for op_label, timings in (("open", df_op), ("read", df_read), ("write", df_write)):
    fig = px.box(
        data_frame=timings,
        log_y=True,
        title=f"{op_label} time per call in seconds (box plot)",
    )
    fig.show()

    fig = px.histogram(
        data_frame=timings,
        log_y=True,
        marginal="violin",
        barmode="group",
        title=f"{op_label} time per call in seconds (histogram)",
    )
    fig.show()
