In [18]:
%%script false --no-raise-error
%%sh
# Disabled cell: the leading `%%script false --no-raise-error` magic feeds this
# cell body to `false`, so nothing below is executed.
# NOTE(review): even when enabled, an `export` inside a `%%sh` subshell only
# affects that subshell, not the notebook kernel; `%env ZARR_V3_EXPERIMENTAL_API=1`
# would be needed for the Python process to see the variable - confirm intent.
export ZARR_V3_EXPERIMENTAL_API=1

In [19]:
%%time
# Create the test dataset; change the array shape parameters below to vary its size
import xarray as xr
import numpy as np

# Every variable shares one shape: 1,000,000 entries along the first
# dimension, each holding a 10 x 10 x 1 block of random float values.
SHAPE = (1_000_000, 10, 10, 1)
DIMS = ["1", "2", "3", "4"]

# Draw the six arrays one after another, in the order u, v, w, x, y, z.
u, v, w, x, y, z = (np.random.rand(*SHAPE) for _ in range(6))

ds = xr.Dataset(
    data_vars={
        name: (DIMS, values)
        for name, values in zip("uvwxyz", (u, v, w, x, y, z))
    }
)

# Write the same dataset once per storage format under test.
ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)
ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")
ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf", invalid_netcdf=True)


CPU times: user 7.01 s, sys: 8.96 s, total: 16 s
Wall time: 18 s


In [48]:
# Imports and empty timing lists that collect the benchmark results for plotting

import zarr
import netCDF4
import h5py


# One list of elapsed times per (operation, format) pair. The benchmark cells
# append to these lists; a later cell turns them into DataFrames.

# Open timings.
zarr_op_time, netcdf4_op_time, hdf5_op_time = [], [], []

# Read timings.
zarr_read_time, netcdf4_read_time, hdf5_read_time = [], [], []

# Write timings.
zarr_write_time, netcdf4_write_time, hdf5_write_time = [], [], []

In [49]:
%%time
#%%writefile func/open_zarr_benchmark.py
def open_zarr_benchmark(iterations=10000, store="data/test_dataset.zarr"):
    """Time repeated opens of the Zarr test store.

    Each elapsed time, in seconds, is appended to the notebook-level list
    ``zarr_op_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed opens. Defaults to 10000.
    store : str, optional
        Path of the Zarr store to open.
    """
    import time

    import zarr

    for _ in range(iterations):
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() follows the wall clock and
        # can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        zarr.open(store=store, mode="r+", zarr_version=2)
        zarr_op_time.append(time.perf_counter() - start_time)
        
open_zarr_benchmark()

CPU times: user 312 ms, sys: 45.3 ms, total: 358 ms
Wall time: 322 ms


In [50]:
%%time
#%%writefile func/read_zarr_benchmark.py

#Read benchmark zarr  serial

def read_zarr(iterations=1000, store="data/test_dataset.zarr"):
    """Time single-index reads of array ``x`` from the Zarr test store.

    The store is opened once; then ``ds_zarr["x"][i]`` is read for each ``i``
    in ``range(iterations)``. Each elapsed time, in seconds, is appended to
    the notebook-level list ``zarr_read_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed reads. Defaults to 1000.
    store : str, optional
        Path of the Zarr store to read from.
    """
    import time

    import zarr

    # Mode "r+" is kept from the original cell.
    ds_zarr = zarr.open(store=store, mode="r+", zarr_version=2)

    for i in range(iterations):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        # The ["x"] lookup stays inside the timed region, as before, so the
        # measured quantity is unchanged.
        _ = ds_zarr["x"][i]
        zarr_read_time.append(time.perf_counter() - start_time)
    
read_zarr()

CPU times: user 3.45 s, sys: 12.7 s, total: 16.2 s
Wall time: 18.1 s


In [51]:
%%time
#%%writefile func/write_zarr_benchmark.py

#Write benchmark zarr   serial

def write_zarr(iterations=1_000, store="data/test_dataset.zarr"):
    """Time single-index writes to array ``x`` in the Zarr test store.

    The store is opened once; for each ``i`` in ``range(iterations)`` a fresh
    random ``(10, 10, 1)`` block is assigned to ``ds_zarr["x"][i]``. Only the
    assignment is timed. Each elapsed time, in seconds, is appended to the
    notebook-level list ``zarr_write_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed writes. Defaults to 1000.
    store : str, optional
        Path of the Zarr store to modify.
    """
    import time

    import numpy as np
    import zarr

    ds_zarr = zarr.open(store=store, mode="r+", zarr_version=2)

    for i in range(iterations):
        # Untimed reads carried over from the original cell, where their
        # results were only used by debug prints that are now removed.
        # NOTE(review): confirm whether these reads are still wanted.
        _ = ds_zarr["x"][i]
        _ = ds_zarr["x"][i]
        fill = np.random.rand(10, 10, 1)

        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        ds_zarr["x"][i] = fill
        zarr_write_time.append(time.perf_counter() - start_time)

        # Untimed read-back, also carried over from the original cell.
        _ = ds_zarr["x"][i]
        
write_zarr()
    

CPU times: user 24.5 s, sys: 1min 15s, total: 1min 39s
Wall time: 1min 40s


In [52]:
%%time
#%%writefile func/open_netcdf4_benchmark.py
#Open Benchmark netcdf4 

def open_netcdf4(iterations=10_000, path="data/test_dataset.nc"):
    """Time repeated opens of the NetCDF4 test file.

    Each elapsed time, in seconds, is appended to the notebook-level list
    ``netcdf4_op_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed opens. Defaults to 10000.
    path : str, optional
        Path of the NetCDF4 file to open.
    """
    import time

    import netCDF4

    for _ in range(iterations):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        ds_netcdf4 = netCDF4.Dataset(path, mode="r+", format="NETCDF4")
        netcdf4_op_time.append(time.perf_counter() - start_time)
        # Close explicitly, outside the timed region, so open handles do not
        # pile up or depend on garbage collection to be released.
        ds_netcdf4.close()

open_netcdf4()

CPU times: user 12.7 s, sys: 1.26 s, total: 14 s
Wall time: 12.6 s


In [53]:
%%time
#%%writefile func/read_netcdf4_benchmark.py
#Read Benchmark netcdf4   serial

def read_netcdf4(iterations=1000, path="data/test_dataset.nc"):
    """Time single-index reads of variable ``x`` from the NetCDF4 test file.

    The file is opened once; then ``variables["x"][i]`` is read for each
    ``i`` in ``range(iterations)``. Each elapsed time, in seconds, is appended
    to the notebook-level list ``netcdf4_read_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed reads. Defaults to 1000.
    path : str, optional
        Path of the NetCDF4 file to read from.
    """
    import time

    import netCDF4

    # The context manager closes the file when the loop ends or a read fails.
    # Mode "r+" is kept from the original cell.
    with netCDF4.Dataset(path, mode="r+", format="NETCDF4") as ds_netcdf4:
        for i in range(iterations):
            # perf_counter() is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            _ = ds_netcdf4.variables["x"][i]
            netcdf4_read_time.append(time.perf_counter() - start_time)

read_netcdf4()

CPU times: user 67.6 ms, sys: 8.09 ms, total: 75.6 ms
Wall time: 74 ms


In [54]:
%%time
#%%writefile func/write_netcdf4_benchmark.py
#Write Benchmark netcdf4  serial


def write_netcdf4(iterations=1000, path="data/test_dataset.nc"):
    """Time single-index writes to variable ``x`` in the NetCDF4 test file.

    The file is opened once; for each ``i`` in ``range(iterations)`` a fresh
    random ``(10, 10, 1)`` block is assigned to ``variables["x"][i]``. Only
    the assignment is timed. Each elapsed time, in seconds, is appended to the
    notebook-level list ``netcdf4_write_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed writes. Defaults to 1000.
    path : str, optional
        Path of the NetCDF4 file to modify.
    """
    import time

    import netCDF4
    import numpy as np

    # The context manager closes the file once the benchmark is done (or
    # fails), instead of leaving a written-to file open until the object
    # happens to be garbage-collected.
    with netCDF4.Dataset(path, mode="r+", format="NETCDF4") as ds_netcdf4:
        for i in range(iterations):
            # Untimed reads carried over from the original cell, where their
            # results were only used by debug prints that are now removed.
            # NOTE(review): confirm whether these reads are still wanted.
            _ = ds_netcdf4.variables["x"][i]
            _ = ds_netcdf4.variables["x"][i]
            fill = np.random.rand(10, 10, 1)

            # perf_counter() is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            ds_netcdf4.variables["x"][i] = fill
            netcdf4_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back, also carried over from the original cell.
            _ = ds_netcdf4.variables["x"][i]
        
write_netcdf4()

CPU times: user 280 ms, sys: 0 ns, total: 280 ms
Wall time: 252 ms


In [55]:
%%time
#%%writefile func/open_hdf5_benchmark.py
#Open Benchmark hdf5

def open_hdf5(iterations=10_000, path="data/test_dataset.h5"):
    """Time repeated opens of the HDF5 test file.

    Each elapsed time, in seconds, is appended to the notebook-level list
    ``hdf5_op_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed opens. Defaults to 10000.
    path : str, optional
        Path of the HDF5 file to open.
    """
    import time

    import h5py

    for _ in range(iterations):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        ds_hdf5 = h5py.File(path, mode="r+")
        hdf5_op_time.append(time.perf_counter() - start_time)
        # Close explicitly, outside the timed region, so open handles do not
        # pile up or depend on garbage collection to be released.
        ds_hdf5.close()
        
open_hdf5()

CPU times: user 102 ms, sys: 35.9 ms, total: 138 ms
Wall time: 132 ms


In [56]:
%%time
#%%writefile func/read_hdf5_benchmark.py
#Read Benchmark hdf5  serial

def read_hdf5(iterations=1_000, path="data/test_dataset.h5"):
    """Time single-index reads of dataset ``x`` from the HDF5 test file.

    The file is opened once; then ``ds_hdf5.get("x")[i]`` is read for each
    ``i`` in ``range(iterations)``. Each elapsed time, in seconds, is appended
    to the notebook-level list ``hdf5_read_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed reads. Defaults to 1000.
    path : str, optional
        Path of the HDF5 file to read from.
    """
    import time

    import h5py

    # The context manager closes the file when the loop ends or a read fails.
    # Mode "r+" is kept from the original cell.
    with h5py.File(path, mode="r+") as ds_hdf5:
        for i in range(iterations):
            # perf_counter() is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            # The .get("x") lookup stays inside the timed region, as before.
            _ = ds_hdf5.get("x")[i]
            hdf5_read_time.append(time.perf_counter() - start_time)
        
read_hdf5()

CPU times: user 48.5 ms, sys: 0 ns, total: 48.5 ms
Wall time: 44 ms


In [57]:
%%time
#%%writefile func/write_hdf5_benchmark.py
#Write Benchmark hdf5  serial

def write_hdf5(iterations=1_000, path="data/test_dataset.h5"):
    """Time single-index writes to dataset ``x`` in the HDF5 test file.

    The file is opened once; for each ``i`` in ``range(iterations)`` a fresh
    random ``(10, 10, 1)`` block is assigned to ``ds_hdf5.get("x")[i]``. Only
    the assignment is timed. Each elapsed time, in seconds, is appended to the
    notebook-level list ``hdf5_write_time``. Nothing is returned.

    Parameters
    ----------
    iterations : int, optional
        Number of timed writes. Defaults to 1000.
    path : str, optional
        Path of the HDF5 file to modify.
    """
    import time

    import h5py
    import numpy as np

    # The context manager closes the file once the benchmark is done (or
    # fails), instead of leaving a written-to file open until the object
    # happens to be garbage-collected.
    with h5py.File(path, mode="r+") as ds_hdf5:
        for i in range(iterations):
            # Untimed reads carried over from the original cell, where their
            # results were only used by debug prints that are now removed.
            # NOTE(review): confirm whether these reads are still wanted.
            _ = ds_hdf5.get("x")[i]
            _ = ds_hdf5.get("x")[i]
            fill = np.random.rand(10, 10, 1)

            # perf_counter() is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            ds_hdf5.get("x")[i] = fill
            hdf5_write_time.append(time.perf_counter() - start_time)

            # Untimed read-back, also carried over from the original cell.
            _ = ds_hdf5.get("x")[i]
        
write_hdf5()

CPU times: user 266 ms, sys: 11.4 ms, total: 277 ms
Wall time: 249 ms


In [72]:
import pandas as pd

# One DataFrame per operation type; each column holds the timings collected
# for one storage format. The lists in a frame must all have the same length.
df_op = pd.DataFrame(
    {
        "zarr_op_time": zarr_op_time,
        "netcdf4_op_time": netcdf4_op_time,
        "hdf5_op_time": hdf5_op_time,
    }
)

df_read = pd.DataFrame(
    {
        "zarr_read_time": zarr_read_time,
        "netcdf4_read_time": netcdf4_read_time,
        "hdf5_read_time": hdf5_read_time,
    }
)

df_write = pd.DataFrame(
    {
        "zarr_write_time": zarr_write_time,
        "netcdf4_write_time": netcdf4_write_time,
        "hdf5_write_time": hdf5_write_time,
    }
)


In [59]:
#%%script false --no-raise-error
import os

# to_pickle() does not create missing parent directories, and no other cell
# creates this folder, so make sure it exists before writing.
os.makedirs("data/plotting", exist_ok=True)

# Persist the three timing frames so the plotting cells can run without
# repeating the benchmarks.
df_op.to_pickle("data/plotting/s_plotting_df_op.pk1")
df_read.to_pickle("data/plotting/s_plotting_df_read_serial.pk1")
df_write.to_pickle("data/plotting/s_plotting_df_write_serial.pk1")

In [105]:
import pandas as pd
import numpy as np

# Reload the timing frames saved by the export cell.
# Unpickling executes code from the file, so only load pickles this notebook
# wrote itself.
df_op, df_read, df_write = map(
    pd.read_pickle,
    (
        "data/plotting/s_plotting_df_op.pk1",
        "data/plotting/s_plotting_df_read_serial.pk1",
        "data/plotting/s_plotting_df_write_serial.pk1",
    ),
)

def filter_outliers(df, threshold=2.22) -> pd.DataFrame:
    """Return the rows of ``df`` that contain no outliers and no missing values.

    For every numeric column, a value counts as an outlier when its absolute
    distance from the column median, divided by the column's interquartile
    range (IQR), is not below ``threshold``. A row is dropped when any of its
    numeric values is an outlier or when any of its cells is missing.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter. Non-numeric columns are only checked for missing
        values.
    threshold : float, optional
        Maximum allowed distance from the median in IQR units. Defaults to
        2.22.

    Returns
    -------
    pd.DataFrame
        The surviving rows, with their original index labels.
    """
    numeric = df.select_dtypes('number')

    iqr = numeric.quantile(0.75) - numeric.quantile(0.25)
    deviation = (numeric - numeric.median()).abs()

    within_limit = (deviation / iqr) < threshold
    # With an IQR of zero the ratio above is 0/0 (NaN) for values sitting
    # exactly on the median, which would wrongly discard every row. A value
    # equal to the median is never an outlier, so keep it explicitly.
    at_median = deviation == 0

    keep_rows = (within_limit | at_median).all(axis=1)

    # Selecting rows yields a new frame, so the caller's frame is untouched.
    return df.loc[keep_rows].dropna()
    

# Switch for outlier removal: set to False to plot the unfiltered timings.
APPLY_OUTLIER_FILTER = True

if APPLY_OUTLIER_FILTER:
    df_op = filter_outliers(df_op)
    df_read = filter_outliers(df_read)
    df_write = filter_outliers(df_write)
    
    


In [106]:
import plotly.express as px

# For each timing frame (open, read, write) show a box plot followed by a
# grouped histogram with a violin marginal, both on a logarithmic y axis.
for frame in (df_op, df_read, df_write):
    fig = px.box(data_frame=frame, log_y=True)
    fig.show()

    fig = px.histogram(data_frame=frame, log_y=True, marginal="violin", barmode="group")
    fig.show()