In [38]:
%%script false --no-raise-error
%%sh
# NOTE(review): this cell is disabled. The `%%script false` magic on the first
# line hands the cell body to the `false` command, so the export below never runs.
# If the cell were re-enabled as `%%sh`, the export would only apply to that
# shell subprocess, not to the notebook kernel -- confirm whether the variable is
# still needed before reviving it.
export ZARR_V3_EXPERIMENTAL_API=1

In [1]:
%%time
# Create Testing Dataset, Change Parameters to vary size 
import xarray as xr
import numpy as np

# Size parameters of the synthetic dataset. Every variable has the same shape;
# change N_ROWS to vary the size (each variable is N_ROWS * 100 float64 values).
N_ROWS = 1_000_000
VAR_SHAPE = (N_ROWS, 10, 10, 1)
VAR_NAMES = ("u", "v", "w", "x", "y", "z")
# Dimension names are kept as the strings "1".."4" because they are written
# into the output files.
DIM_NAMES = ["1", "2", "3", "4"]

# Seeded generator so that re-running the cell regenerates the same data.
SEED = 0
rng = np.random.default_rng(SEED)

# One uniformly distributed float64 array in [0, 1) per variable.
ds = xr.Dataset(
    data_vars={name: (DIM_NAMES, rng.random(VAR_SHAPE)) for name in VAR_NAMES}
)

# Write the identical dataset in the three formats that are benchmarked below.
ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)

ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")

# Needs the optional h5netcdf package; the recorded run of this cell stopped
# here with "ModuleNotFoundError: No module named 'h5netcdf'".
ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf", invalid_netcdf=True)


ModuleNotFoundError: No module named 'h5netcdf'

In [40]:
# Timing accumulators: per-call durations collected by the benchmark cells, plotted at the end

import zarr
import netCDF4
import h5py


# One list of per-call durations for every (operation, storage format) pair.
# The benchmark functions append to these lists; the DataFrame cell reads them.

# Time needed to open the store / file.
zarr_op_time, netcdf4_op_time, hdf5_op_time = [], [], []

# Time needed to read one randomly chosen row of variable "x".
zarr_read_time, netcdf4_read_time, hdf5_read_time = [], [], []

# Time needed to overwrite one randomly chosen row of variable "x".
zarr_write_time, netcdf4_write_time, hdf5_write_time = [], [], []

In [41]:
%%time
#%%writefile func/open_zarr_benchmark.py
def open_zarr_benchmark():
    """Time 10,000 opens of the Zarr test store.

    The duration of every ``zarr.open`` call, in seconds, is appended to the
    module-level list ``zarr_op_time``, which must exist before this is called.
    """
    import time

    import zarr

    for _ in range(10_000):
        # perf_counter is the monotonic, high-resolution clock intended for
        # measuring short intervals; the wall clock used before can be too
        # coarse for sub-millisecond opens.
        start_time = time.perf_counter()
        ds_zarr = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)
        elapsed = time.perf_counter() - start_time

        # Release the group here so that disposing of it is not charged to the
        # next iteration's timed window when the name is rebound.
        del ds_zarr
        zarr_op_time.append(elapsed)


open_zarr_benchmark()

CPU times: user 232 ms, sys: 127 ms, total: 359 ms
Wall time: 329 ms


In [42]:
import zarr
# Open the store and display the metadata summary of variable "x"
# (the bare last expression is what the cell renders).
ds_zarr = zarr.open(
    store="data/test_dataset.zarr",
    mode="r+",
    zarr_version=2,
)
ds_zarr["x"].info

0,1
Name,/x
Type,zarr.core.Array
Data type,float64
Shape,"(1000000, 10, 10, 1)"
Chunk shape,"(62500, 2, 2, 1)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,800000000 (762.9M)


In [43]:
%%time
#%%writefile func/read_zarr_benchmark.py

#Read benchmark zarr  random

def read_zarr():
    """Time 1,000 reads of one randomly chosen row of variable ``x`` from the Zarr store.

    Each timed region covers the lookup of ``"x"`` in the opened group plus the
    read of one row along the first axis. Durations, in seconds, are appended to
    the module-level list ``zarr_read_time``, which must exist before the call.
    """
    import time

    import numpy as np
    import zarr

    ds_zarr = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    # Take the row count from the stored array instead of hard-coding it.
    n_rows = ds_zarr["x"].shape[0]

    for _ in range(1000):
        # `high` is exclusive, so this samples every valid index 0 .. n_rows - 1.
        # (The previous `high=1_000_000-1` could never pick the last row.)
        val_picked = np.random.randint(low=0, high=n_rows)

        start_time = time.perf_counter()
        ds_zarr["x"][val_picked]  # result is discarded; only the duration matters
        zarr_read_time.append(time.perf_counter() - start_time)


read_zarr()

CPU times: user 3.02 s, sys: 2.41 s, total: 5.42 s
Wall time: 5.21 s


In [44]:
%%time
#%%writefile func/write_zarr_benchmark.py

#Write benchmark zarr  random

def write_zarr():
    """Time 1,000 overwrites of one randomly chosen row of variable ``x`` in the Zarr store.

    Only the assignment of the new row is timed. Durations, in seconds, are
    appended to the module-level list ``zarr_write_time``, which must exist
    before the call. The store on disk is modified.
    """
    import time

    import numpy as np
    import zarr

    ds_zarr = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    # Derive the row count and the per-row shape from the stored array instead
    # of hard-coding them.
    x_shape = ds_zarr["x"].shape
    n_rows, row_shape = x_shape[0], x_shape[1:]

    for _ in range(1_000):
        # `high` is exclusive, so this samples every valid index 0 .. n_rows - 1.
        val_picked = np.random.randint(low=0, high=n_rows)

        # NOTE(review): these two untimed reads only ever fed debug prints that
        # were commented out. They are kept so the I/O performed before the
        # timed write stays the same as in the recorded runs.
        ds_zarr["x"][val_picked]
        ds_zarr["x"][val_picked]
        fill = np.random.rand(*row_shape)

        start_time = time.perf_counter()
        ds_zarr["x"][val_picked] = fill
        zarr_write_time.append(time.perf_counter() - start_time)

        # Untimed read-back, kept for the same reason as the reads above.
        ds_zarr["x"][val_picked]


write_zarr()
    

CPU times: user 22.9 s, sys: 47.9 s, total: 1min 10s
Wall time: 1min 10s


In [45]:
%%time
#%%writefile func/open_netcdf4_benchmark.py
#Open Benchmark netcdf4 

def open_netcdf4():
    """Time 10,000 opens of the NetCDF4 test file.

    The duration of every ``netCDF4.Dataset`` construction, in seconds, is
    appended to the module-level list ``netcdf4_op_time``, which must exist
    before the call.
    """
    import time

    import netCDF4

    for _ in range(10_000):
        # perf_counter is the monotonic, high-resolution clock intended for
        # measuring short intervals.
        start_time = time.perf_counter()
        ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
        elapsed = time.perf_counter() - start_time

        # Close explicitly, after the stop timestamp. Previously the handle was
        # never closed; it was only released when the name was rebound on the
        # next iteration, i.e. (presumably, on CPython) inside the next timed
        # window.
        ds_netcdf4.close()
        netcdf4_op_time.append(elapsed)


open_netcdf4()

CPU times: user 13.9 s, sys: 1.65 s, total: 15.6 s
Wall time: 17.2 s


In [46]:
import netCDF4

# Open the NetCDF4 file and print the dataset summary followed by the summary
# of variable "x", one per line.
ds_netcdf4 = netCDF4.Dataset(
    "data/test_dataset.nc",
    mode="r+",
    format="NETCDF4",
)

print(ds_netcdf4, ds_netcdf4.variables["x"], sep="\n")

<class 'netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): 1(1000000), 2(10), 3(10), 4(1)
    variables(dimensions): float64 u(1, 2, 3, 4), float64 v(1, 2, 3, 4), float64 w(1, 2, 3, 4), float64 x(1, 2, 3, 4), float64 y(1, 2, 3, 4), float64 z(1, 2, 3, 4)
    groups: 
<class 'netCDF4.Variable'>
float64 x(1, 2, 3, 4)
    _FillValue: nan
unlimited dimensions: 
current shape = (1000000, 10, 10, 1)
filling on


In [47]:
%%time
#%%writefile func/read_netcdf4_benchmark.py
#Read Benchmark netcdf4  random

def read_netcdf4():
    """Time 1,000 reads of one randomly chosen row of variable ``x`` from the NetCDF4 file.

    Each timed region covers the lookup of ``"x"`` in ``variables`` plus the
    read of one row along the first axis. Durations, in seconds, are appended to
    the module-level list ``netcdf4_read_time``, which must exist before the call.
    """
    import time

    import netCDF4
    import numpy as np

    # The context manager closes the file even if a read raises; before, the
    # handle was left open when the function returned.
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        # Take the row count from the file instead of hard-coding it.
        n_rows = ds_netcdf4.variables["x"].shape[0]

        for _ in range(1000):
            # `high` is exclusive, so this samples every valid index
            # 0 .. n_rows - 1 (the previous bound skipped the last row).
            val_picked = np.random.randint(low=0, high=n_rows)

            start_time = time.perf_counter()
            ds_netcdf4.variables["x"][val_picked]  # result discarded on purpose
            netcdf4_read_time.append(time.perf_counter() - start_time)


read_netcdf4()

CPU times: user 96.7 ms, sys: 0 ns, total: 96.7 ms
Wall time: 92.8 ms


In [48]:
%%time
#%%writefile func/write_netcdf4_benchmark.py
#Write Benchmark netcdf4  random


def write_netcdf4():
    """Time 1,000 overwrites of one randomly chosen row of variable ``x`` in the NetCDF4 file.

    Only the assignment of the new row is timed. Durations, in seconds, are
    appended to the module-level list ``netcdf4_write_time``, which must exist
    before the call. The file on disk is modified.
    """
    import time

    import netCDF4
    import numpy as np

    # The context manager closes the file when the loop ends or raises; before,
    # the handle opened for writing was never closed explicitly.
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        # Derive the row count and per-row shape from the file instead of
        # hard-coding them.
        x_shape = ds_netcdf4.variables["x"].shape
        n_rows, row_shape = x_shape[0], x_shape[1:]

        for _ in range(1000):
            # `high` is exclusive, so this samples every valid index 0 .. n_rows - 1.
            val_picked = np.random.randint(low=0, high=n_rows)

            # NOTE(review): these two untimed reads only ever fed debug prints
            # that were commented out. They are kept so the I/O performed
            # before the timed write stays the same as in the recorded runs.
            ds_netcdf4.variables["x"][val_picked]
            ds_netcdf4.variables["x"][val_picked]
            fill = np.random.rand(*row_shape)

            start_time = time.perf_counter()
            ds_netcdf4.variables["x"][val_picked] = fill
            netcdf4_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back, kept for the same reason as the reads above.
            ds_netcdf4.variables["x"][val_picked]


write_netcdf4()

CPU times: user 359 ms, sys: 31.8 ms, total: 391 ms
Wall time: 360 ms


In [49]:
%%time
#%%writefile func/open_hdf5_benchmark.py
#Open Benchmark hdf5

def open_hdf5():
    """Time 10,000 opens of the HDF5 test file.

    The duration of every ``h5py.File`` construction, in seconds, is appended
    to the module-level list ``hdf5_op_time``, which must exist before the call.
    """
    import time

    import h5py

    for _ in range(10_000):
        # perf_counter is the monotonic, high-resolution clock intended for
        # measuring short intervals.
        start_time = time.perf_counter()
        ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
        elapsed = time.perf_counter() - start_time

        # Close explicitly, after the stop timestamp. Previously the handle was
        # never closed; it was only released when the name was rebound on the
        # next iteration, i.e. (presumably, on CPython) inside the next timed
        # window.
        ds_hdf5.close()
        hdf5_op_time.append(elapsed)


open_hdf5()

CPU times: user 120 ms, sys: 21.2 ms, total: 141 ms
Wall time: 129 ms


In [50]:
import h5py

# Open the HDF5 file and print the file summary followed by the summary of
# dataset "x", one per line.
ds_hdf5 = h5py.File(
    "data/test_dataset.h5",
    mode="r+",
)

print(ds_hdf5, ds_hdf5.get("x"), sep="\n")

<HDF5 file "test_dataset.h5" (mode r+)>
<HDF5 dataset "x": shape (1000000, 10, 10, 1), type "<f8">


In [51]:
%%time
#%%writefile func/read_hdf5_benchmark.py
#Read Benchmark hdf5  random

def read_hdf5():
    """Time 1,000 reads of one randomly chosen row of dataset ``x`` from the HDF5 file.

    Each timed region covers the ``get("x")`` lookup plus the read of one row
    along the first axis. Durations, in seconds, are appended to the
    module-level list ``hdf5_read_time``, which must exist before the call.
    """
    import time

    import h5py
    import numpy as np

    # The context manager closes the file even if a read raises; before, the
    # handle was left open when the function returned.
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        # Take the row count from the file instead of hard-coding it.
        n_rows = ds_hdf5.get("x").shape[0]

        for _ in range(1_000):
            # `high` is exclusive, so this samples every valid index
            # 0 .. n_rows - 1 (the previous bound skipped the last row).
            val_picked = np.random.randint(low=0, high=n_rows)

            start_time = time.perf_counter()
            ds_hdf5.get("x")[val_picked]  # result discarded on purpose
            hdf5_read_time.append(time.perf_counter() - start_time)


read_hdf5()

CPU times: user 58.9 ms, sys: 0 ns, total: 58.9 ms
Wall time: 53.9 ms


In [52]:
%%time
#%%writefile func/write_hdf5_benchmark.py
#Write Benchmark hdf5 random

def write_hdf5():
    """Time 1,000 overwrites of one randomly chosen row of dataset ``x`` in the HDF5 file.

    Only the assignment of the new row is timed. Durations, in seconds, are
    appended to the module-level list ``hdf5_write_time``, which must exist
    before the call. The file on disk is modified.
    """
    import time

    import h5py
    import numpy as np

    # The context manager closes the file when the loop ends or raises; before,
    # the handle opened for writing was never closed explicitly.
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        # Derive the row count and per-row shape from the file instead of
        # hard-coding them.
        x_shape = ds_hdf5.get("x").shape
        n_rows, row_shape = x_shape[0], x_shape[1:]

        for _ in range(1_000):
            # `high` is exclusive, so this samples every valid index 0 .. n_rows - 1.
            val_picked = np.random.randint(low=0, high=n_rows)

            # NOTE(review): these two untimed reads only ever fed debug prints
            # that were commented out. They are kept so the I/O performed
            # before the timed write stays the same as in the recorded runs.
            ds_hdf5.get("x")[val_picked]
            ds_hdf5.get("x")[val_picked]
            fill = np.random.rand(*row_shape)

            start_time = time.perf_counter()
            ds_hdf5.get("x")[val_picked] = fill
            hdf5_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back, kept for the same reason as the reads above.
            ds_hdf5.get("x")[val_picked]


write_hdf5()

CPU times: user 286 ms, sys: 41.4 ms, total: 327 ms
Wall time: 300 ms


In [53]:
import pandas as pd

# Gather the timing lists into one frame per operation, with one column per
# storage format (column order: zarr, netcdf4, hdf5).
df_op = pd.DataFrame(
    {
        "zarr_op_time": zarr_op_time,
        "netcdf4_op_time": netcdf4_op_time,
        "hdf5_op_time": hdf5_op_time,
    }
)

df_read = pd.DataFrame(
    {
        "zarr_read_time": zarr_read_time,
        "netcdf4_read_time": netcdf4_read_time,
        "hdf5_read_time": hdf5_read_time,
    }
)

df_write = pd.DataFrame(
    {
        "zarr_write_time": zarr_write_time,
        "netcdf4_write_time": netcdf4_write_time,
        "hdf5_write_time": hdf5_write_time,
    }
)


In [54]:
#%%script false --no-raise-error
import os

# to_pickle does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("data/plotting", exist_ok=True)

# Persist the three timing frames so the plots can be rebuilt without
# re-running the benchmarks.
df_op.to_pickle("data/plotting/s_plotting_df_op.pk1")
df_read.to_pickle("data/plotting/s_plotting_df_read_random.pk1")
df_write.to_pickle("data/plotting/s_plotting_df_write_random.pk1")

In [55]:
import pandas as pd

# Reload the saved timing frames (open, random read, random write), in that order.
df_op, df_read, df_write = (
    pd.read_pickle("data/plotting/s_plotting_df_op.pk1"),
    pd.read_pickle("data/plotting/s_plotting_df_read_random.pk1"),
    pd.read_pickle("data/plotting/s_plotting_df_write_random.pk1"),
)

def filter_outliers(df, threshold: float = 2.22) -> pd.DataFrame:
    """Drop rows that contain a value far from its column median, measured in IQRs.

    For every numeric column the median and the interquartile range (IQR, the
    75th minus the 25th percentile) are computed. A value is kept when
    ``abs(value - median) / IQR < threshold``. Every other value is replaced by
    NaN, and then every row containing a NaN in any column is dropped.

    Columns whose IQR is exactly zero are not filtered: they give no spread
    estimate, and dividing by zero previously marked every value in them as an
    outlier, which emptied the whole frame.

    ``df`` is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; only columns selected by ``select_dtypes('number')``
        take part in the outlier test.
    threshold : float, default 2.22
        Maximum allowed distance from the median, in units of the column IQR.
        The default is presumably chosen because 2.22 IQR is roughly three
        standard deviations for normally distributed data -- confirm.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after filtering.
    """
    cols = df.select_dtypes('number').columns
    values = df.loc[:, cols]

    iqr = values.quantile(0.75, numeric_only=False) - values.quantile(0.25, numeric_only=False)
    keep = np.abs((values - values.median()) / iqr) < threshold

    # A zero IQR makes the ratio above inf or NaN for every value, so the
    # comparison is False everywhere; exempt such columns from the filter.
    flat_cols = iqr.index[(iqr == 0).to_numpy()]
    if len(flat_cols) > 0:
        keep.loc[:, flat_cols] = True

    df.loc[:, cols] = values.where(keep, np.nan)
    df.dropna(inplace=True)

    return df

# Manual switch: the filter is applied as long as the literal below is True.
if True:

    df_op, df_read, df_write = (
        filter_outliers(frame) for frame in (df_op, df_read, df_write)
    )


In [56]:
import plotly.express as px

# For the open, read and write timings in turn, show a box plot and then a
# grouped histogram with a violin marginal, both on a log-scaled y-axis.
for frame in (df_op, df_read, df_write):
    fig = px.box(data_frame=frame, log_y=True)
    fig.show()

    fig = px.histogram(data_frame=frame, log_y=True, marginal="violin", barmode="group")
    fig.show()
