In [32]:
%%script false --no-raise-error
%%sh
# NOTE(review): an `export` inside %%sh only applies to that subshell, not to
# the kernel process; use `%env ZARR_V3_EXPERIMENTAL_API=1` if the kernel
# itself needs the variable — confirm what was intended.
export ZARR_V3_EXPERIMENTAL_API=1

In [33]:
%%script false --no-raise-error
# Smoke test: run d_log.run_logging in a child process, watch it for ten
# seconds, then stop it.
# NOTE(review): this cell passes 'text.txt' while the following cell passes
# "test.txt" — confirm which file name is intended.
import dev_logging as d_log
import importlib
importlib.reload(d_log)  # re-import so edits to dev_logging take effect without a kernel restart
import multiprocessing
import time

proc = multiprocessing.Process(target=d_log.run_logging, args=('text.txt',))

proc.start()
print(f"pid: {proc.pid}")

# Poll once per second, printing whether the child process is still alive.
for i in range(10):
    #print(i)
    time.sleep(1)
    print(proc.is_alive())

# Kill the child, wait for it to exit, then release the Process object's resources.
proc.kill()
proc.join()
proc.close()


In [34]:
%%script false --no-raise-error
# Run the logger directly in the kernel process (no child process involved).
import dev_logging as d_log
import importlib
importlib.reload(d_log)  # re-import so edits to dev_logging take effect without a kernel restart

d_log.run_logging("test.txt")


In [35]:
%%script false --no-raise-error
%%time
# Time 10,000 calls of test_open() from a natively compiled test library and
# collect the per-call durations (seconds) in test_plot.
import ctypes
import time

test_plot = []

# NOTE(review): absolute, machine-specific path — consider making it configurable.
lib = ctypes.CDLL("/home/test/dkrz_dev/testlib1.so")


for i in range(10000):
    # time.monotonic() is what the other benchmarks in this notebook use; unlike
    # time.time() it cannot jump if the wall clock is adjusted during the run.
    start_time = time.monotonic()
    lib.test_open()
    test_plot.append(time.monotonic() - start_time)

In [36]:
%%script false --no-raise-error
%%time
# Time 10,000 calls of test_open_netcdf() from a natively compiled test library
# and collect the per-call durations (seconds) in test_plot_2.
import ctypes
import time

test_plot_2 = []

# NOTE(review): absolute, machine-specific path — consider making it configurable.
lib = ctypes.CDLL("/home/test/dkrz_dev/testlib2.so")


for i in range(10000):
    # time.monotonic() is what the other benchmarks in this notebook use; unlike
    # time.time() it cannot jump if the wall clock is adjusted during the run.
    start_time = time.monotonic()
    lib.test_open_netcdf()
    test_plot_2.append(time.monotonic() - start_time)

In [37]:
%%time
# Create the testing dataset. Change N_SAMPLES / SAMPLE_SHAPE to vary its size.
import xarray as xr
import numpy as np

N_SAMPLES = 1_000_000          # length of the first dimension
SAMPLE_SHAPE = (10, 10, 1)     # shape of one entry along the first dimension
DIM_NAMES = ["1", "2", "3", "4"]

# Six independent random arrays, one per data variable. They are drawn in
# u, v, w, x, y, z order so the global NumPy random stream is consumed exactly
# as when each array was created on its own line.
u, v, w, x, y, z = (np.random.rand(N_SAMPLES, *SAMPLE_SHAPE) for _ in range(6))

ds = xr.Dataset(
    data_vars={
        name: (DIM_NAMES, values)
        for name, values in zip("uvwxyz", (u, v, w, x, y, z))
    }
)

# Write the same data once per storage format under test.
ds.to_zarr("data/test_dataset.zarr", mode="w", zarr_format=2)

ds.to_netcdf("data/test_dataset.nc", mode="w", format="NETCDF4", engine="netcdf4")

ds.to_netcdf("data/test_dataset.h5", mode="w", engine="h5netcdf", invalid_netcdf=True)


CPU times: user 7.13 s, sys: 8.76 s, total: 15.9 s
Wall time: 17.7 s


In [38]:
#dataset creation for plotting

import zarr
import netCDF4
import h5py


# Module-level result lists, one per (format, operation) pair. The benchmark
# functions in the following cells append one duration in seconds per call.
zarr_op_time, netcdf4_op_time, hdf5_op_time = [], [], []

zarr_read_time, netcdf4_read_time, hdf5_read_time = [], [], []

zarr_write_time, netcdf4_write_time, hdf5_write_time = [], [], []

In [39]:
%%time
#%%writefile func/open_zarr_benchmark.py
def open_zarr_benchmark():
    """Time 10,000 consecutive opens of the Zarr test store.

    The duration of each ``zarr.open`` call, in seconds, is appended to the
    module-level ``zarr_op_time`` list. Nothing is returned.
    """
    import zarr
    import time
    import dev_logging as d_log  # not referenced below; kept so the import still happens

    for _attempt in range(10000):
        t_begin = time.monotonic()
        opened = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)
        elapsed = time.monotonic() - t_begin
        zarr_op_time.append(elapsed)
        
open_zarr_benchmark()

CPU times: user 209 ms, sys: 143 ms, total: 353 ms
Wall time: 324 ms


In [40]:
# Inspect the "x" array of the Zarr test store (data type, shape, chunking, compressor).
import zarr
ds_zarr = zarr.open(store="data/test_dataset.zarr",mode="r+" ,zarr_version=2)
ds_zarr["x"].info  # last expression of the cell, so the notebook renders the info table

0,1
Name,/x
Type,zarr.core.Array
Data type,float64
Shape,"(1000000, 10, 10, 1)"
Chunk shape,"(62500, 2, 2, 1)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,800000000 (762.9M)


In [41]:
%%time
#%%writefile func/read_zarr_benchmark.py

#Read benchmark zarr  serial

def read_zarr():
    """Time 1,000 sequential single-index reads from "x" in the Zarr test store.

    The store is opened once. For every index in ``range(1000)`` the duration
    of ``store["x"][index]`` (the ``"x"`` lookup is part of the timed region)
    is appended, in seconds, to the module-level ``zarr_read_time`` list.
    The values read are discarded.
    """
    import time
    import numpy as np
    import zarr

    root = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    for row in range(1000):
        t_begin = time.monotonic()
        _values = root["x"][row]
        zarr_read_time.append(time.monotonic() - t_begin)
    
read_zarr()

CPU times: user 3.33 s, sys: 2.83 s, total: 6.17 s
Wall time: 5.67 s


In [42]:
%%time
#%%writefile func/write_zarr_benchmark.py

#Write benchmark zarr   serial

def write_zarr():
    """Time 1,000 sequential single-index writes to "x" in the Zarr test store.

    The store is opened once. For every index in ``range(1_000)`` the entry is
    read twice, a fresh random ``(10, 10, 1)`` block is drawn, the assignment
    ``store["x"][index] = block`` is timed and its duration in seconds is
    appended to the module-level ``zarr_write_time`` list, and the entry is
    read back once more. All read results are discarded; the stored data at
    those indices is overwritten.
    """
    import time
    import numpy as np
    import zarr

    root = zarr.open(store="data/test_dataset.zarr", mode="r+", zarr_version=2)

    for row in range(1_000):
        # Untimed reads ahead of the write.
        before = root["x"][row]
        before_again = root["x"][row]
        replacement = np.random.rand(10, 10, 1)

        t_begin = time.monotonic()
        root["x"][row] = replacement
        zarr_write_time.append(time.monotonic() - t_begin)

        # Untimed read-back after the write.
        after = root["x"][row]
        
write_zarr()
    

CPU times: user 25.4 s, sys: 53.2 s, total: 1min 18s
Wall time: 1min 17s


In [43]:
%%time
#%%writefile func/open_netcdf4_benchmark.py
#Open Benchmark netcdf4 

def open_netcdf4():
    """Time 10,000 consecutive opens of the NetCDF4 test file.

    The duration of each ``netCDF4.Dataset(...)`` call, in seconds, is
    appended to the module-level ``netcdf4_op_time`` list. Every handle is
    closed explicitly, after its timing has been recorded, before the next
    open. Nothing is returned.
    """
    import time
    import netCDF4

    for _ in range(10_000):
        start_time = time.monotonic()
        ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")
        netcdf4_op_time.append(time.monotonic() - start_time)

        # Close explicitly, after the clock has been read. Without this the
        # previous handle is only released when the name is rebound on the
        # next iteration — between the two clock reads — so whatever cleanup
        # happens on release would be counted as open time, and closing the
        # file would depend on garbage collection.
        ds_netcdf4.close()

open_netcdf4()

CPU times: user 12.9 s, sys: 1.86 s, total: 14.7 s
Wall time: 13.5 s


In [44]:
# Print a summary of the NetCDF4 test file (data model, dimensions, variables, groups).
import netCDF4

# NOTE(review): the handle is opened read-write and is not closed in this cell.
ds_netcdf4 = netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4")

print(ds_netcdf4)

<class 'netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): 1(1000000), 2(10), 3(10), 4(1)
    variables(dimensions): float64 u(1, 2, 3, 4), float64 v(1, 2, 3, 4), float64 w(1, 2, 3, 4), float64 x(1, 2, 3, 4), float64 y(1, 2, 3, 4), float64 z(1, 2, 3, 4)
    groups: 


In [45]:
%%time
#%%writefile func/read_netcdf4_benchmark.py
#Read Benchmark netcdf4   serial

def read_netcdf4():
    """Time 1,000 sequential single-index reads from "x" in the NetCDF4 test file.

    The file is opened once. For every index in ``range(1000)`` the duration
    of ``dataset.variables["x"][index]`` is appended, in seconds, to the
    module-level ``netcdf4_read_time`` list. The values read are discarded.
    The file is closed when the function finishes, including when a read
    raises.
    """
    import time
    import netCDF4

    # Context manager instead of a bare open: the handle is released on every
    # exit path rather than whenever the local variable happens to be collected
    # (after an exception the notebook's stored traceback can keep it alive).
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        for i in range(1000):
            start_time = time.monotonic()
            _ = ds_netcdf4.variables["x"][i]
            netcdf4_read_time.append(time.monotonic() - start_time)

read_netcdf4()

CPU times: user 84.2 ms, sys: 0 ns, total: 84.2 ms
Wall time: 78.1 ms


In [46]:
%%time
#%%writefile func/write_netcdf4_benchmark.py
#Write Benchmark netcdf4  serial


def write_netcdf4():
    """Time 1,000 sequential single-index writes to "x" in the NetCDF4 test file.

    The file is opened once. For every index in ``range(1000)`` the entry is
    read twice, a fresh random ``(10, 10, 1)`` block is drawn, the assignment
    ``dataset.variables["x"][index] = block`` is timed and its duration in
    seconds is appended to the module-level ``netcdf4_write_time`` list, and
    the entry is read back once more. All read results are discarded; the
    stored data at those indices is overwritten. The file is closed when the
    function finishes, including when an operation raises.
    """
    import time
    import numpy as np
    import netCDF4

    # Context manager instead of a bare open: the handle is closed on every
    # exit path instead of relying on garbage collection of the local variable.
    with netCDF4.Dataset("data/test_dataset.nc", mode="r+", format="NETCDF4") as ds_netcdf4:
        for i in range(1000):
            # Untimed reads ahead of the write; kept so the access pattern
            # around the timed assignment stays the same.
            prev = ds_netcdf4.variables["x"][i]
            curr = ds_netcdf4.variables["x"][i]
            fill = np.random.rand(10, 10, 1)

            start_time = time.monotonic()
            ds_netcdf4.variables["x"][i] = fill
            netcdf4_write_time.append(time.monotonic() - start_time)

            # Untimed read-back after the write.
            curr = ds_netcdf4.variables["x"][i]
        
write_netcdf4()

CPU times: user 285 ms, sys: 865 μs, total: 286 ms
Wall time: 261 ms


In [47]:
%%time
#%%writefile func/open_hdf5_benchmark.py
#Open Benchmark hdf5

def open_hdf5():
    """Time 10,000 consecutive opens of the HDF5 test file.

    The duration of each ``h5py.File(...)`` call, in seconds, is appended to
    the module-level ``hdf5_op_time`` list. Every handle is closed explicitly,
    after its timing has been recorded, before the next open. Nothing is
    returned.
    """
    import h5py
    import time

    for _ in range(10_000):
        start_time = time.monotonic()
        ds_hdf5 = h5py.File("data/test_dataset.h5", mode="r+")
        hdf5_op_time.append(time.monotonic() - start_time)

        # Close explicitly, after the clock has been read. Without this the
        # previous handle is only released when the name is rebound on the
        # next iteration — between the two clock reads — so whatever cleanup
        # happens on release would be counted as open time, and closing the
        # file would depend on garbage collection.
        ds_hdf5.close()
        
open_hdf5()

CPU times: user 132 ms, sys: 16 ms, total: 148 ms
Wall time: 135 ms


In [48]:
%%time
#%%writefile func/read_hdf5_benchmark.py
#Read Benchmark hdf5  serial

def read_hdf5():
    """Time 1,000 sequential single-index reads from "x" in the HDF5 test file.

    The file is opened once. For every index in ``range(1_000)`` the duration
    of ``file.get("x")[index]`` (the ``get`` lookup is part of the timed
    region) is appended, in seconds, to the module-level ``hdf5_read_time``
    list. The values read are discarded. The file is closed when the function
    finishes, including when a read raises.
    """
    import time
    import h5py

    # Context manager instead of a bare open: the handle is released on every
    # exit path rather than whenever the local variable happens to be collected
    # (after an exception the notebook's stored traceback can keep it alive).
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        for i in range(1_000):
            start_time = time.monotonic()
            _ = ds_hdf5.get("x")[i]
            hdf5_read_time.append(time.monotonic() - start_time)
        
read_hdf5()

CPU times: user 45.5 ms, sys: 2.04 ms, total: 47.5 ms
Wall time: 43 ms


In [49]:
%%time
#%%writefile func/write_hdf5_benchmark.py
#Write Benchmark hdf5  serial

def write_hdf5():
    """Time 1,000 sequential single-index writes to "x" in the HDF5 test file.

    The file is opened once. For every index in ``range(1_000)`` the entry is
    read twice, a fresh random ``(10, 10, 1)`` block is drawn, the assignment
    ``file.get("x")[index] = block`` is timed and its duration in seconds is
    appended to the module-level ``hdf5_write_time`` list, and the entry is
    read back once more. All read results are discarded; the stored data at
    those indices is overwritten. The file is closed when the function
    finishes, including when an operation raises.
    """
    import time
    import numpy as np
    import h5py

    # Context manager instead of a bare open: the handle is closed on every
    # exit path instead of relying on garbage collection of the local variable.
    with h5py.File("data/test_dataset.h5", mode="r+") as ds_hdf5:
        for i in range(1_000):
            # Untimed reads ahead of the write; kept so the access pattern
            # around the timed assignment stays the same.
            prev = ds_hdf5.get("x")[i]
            curr = ds_hdf5.get("x")[i]
            fill = np.random.rand(10, 10, 1)

            start_time = time.monotonic()
            ds_hdf5.get("x")[i] = fill
            hdf5_write_time.append(time.monotonic() - start_time)

            # Untimed read-back after the write.
            curr = ds_hdf5.get("x")[i]
        
write_hdf5()

CPU times: user 250 ms, sys: 31.2 ms, total: 281 ms
Wall time: 257 ms


In [50]:
import pandas as pd

def _timings_frame(columns):
    """Build a DataFrame by inserting each (label, samples) pair left to right."""
    frame = pd.DataFrame()
    for position, (label, samples) in enumerate(columns):
        frame.insert(position, label, samples)
    return frame


# One frame per operation, one column per storage format.
df_op = _timings_frame([
    ("zarr_op_time", zarr_op_time),
    ("netcdf4_op_time", netcdf4_op_time),
    ("hdf5_op_time", hdf5_op_time),
])

df_read = _timings_frame([
    ("zarr_read_time", zarr_read_time),
    ("netcdf4_read_time", netcdf4_read_time),
    ("hdf5_read_time", hdf5_read_time),
])

df_write = _timings_frame([
    ("zarr_write_time", zarr_write_time),
    ("netcdf4_write_time", netcdf4_write_time),
    ("hdf5_write_time", hdf5_write_time),
])


In [51]:
%%script false --no-raise-error
# Collect the two native-library timing lists (test_plot, test_plot_2) into
# one frame, one column per list.
import pandas as pd

df_lol = pd.DataFrame()
df_lol.insert(0, "zarr_read_netcdf-c", test_plot)
df_lol.insert(1, "netcdf_read_netcdf-c", test_plot_2)

In [52]:
#%%script false --no-raise-error
# Export the collected timings as JSON so they can be reloaded for plotting.
import os

# DataFrame.to_json does not create missing parent directories, so make sure
# the target directory exists before writing.
os.makedirs("data/plotting", exist_ok=True)

df_op.to_json("data/plotting/s_plotting_df_op.json")
df_read.to_json("data/plotting/s_plotting_df_read_serial.json")
df_write.to_json("data/plotting/s_plotting_df_write_serial.json")

In [59]:
import pandas as pd
import numpy as np

# Reload the serial benchmark timings that were exported as JSON.
df_op, df_read, df_write = (
    pd.read_json(json_path)
    for json_path in (
        "data/plotting/s_plotting_df_op.json",
        "data/plotting/s_plotting_df_read_serial.json",
        "data/plotting/s_plotting_df_write_serial.json",
    )
)

# Three further timing frames read from the working directory, then joined
# column-wise, keeping only index labels present in all three.
df_lol1, df_lol2, df_lol3 = (
    pd.read_json(json_path)
    for json_path in (
        "test_plotting_zarr.json",
        "test_plotting_netcdf.json",
        "test_plotting_hdf.json",
    )
)

df_lol = pd.concat([df_lol1, df_lol2, df_lol3], axis=1, join="inner")

def filter_outliers(df, threshold=2.22) -> pd.DataFrame:
    """Return the rows of ``df`` that contain no outlier in any numeric column.

    A value counts as an outlier when its absolute distance from the column
    median, divided by the column's interquartile range (IQR), is not strictly
    below ``threshold``. Rows with a missing value in any column are dropped
    as well.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter. It is left unmodified.
    threshold : float, default 2.22
        Upper bound (exclusive) on ``|value - median| / IQR`` for a value to
        be kept.

    Returns
    -------
    pd.DataFrame
        A new frame with the surviving rows. Index labels, column order and
        column dtypes are those of ``df``.
    """
    numeric = df.select_dtypes('number')

    median = numeric.median()
    iqr = numeric.quantile(0.75, numeric_only=False) - numeric.quantile(0.25, numeric_only=False)
    deviation = numeric - median

    # A column whose IQR is zero makes the scaled deviation 0/0 (NaN) for
    # values sitting on the median, which would discard every row. A value
    # equal to the median is never an outlier, so keep it explicitly; values
    # off the median in such a column scale to infinity and are still dropped.
    within = (np.abs(deviation / iqr) < threshold) | (deviation == 0)

    # Select rows instead of overwriting cells with NaN in the caller's frame;
    # the boolean mask is passed positionally so index labels need not be unique.
    keep_row = within.all(axis=1).to_numpy()
    return df.loc[keep_row].dropna()
    

# Outlier filtering is applied unconditionally to the three benchmark frames,
# in the order op, read, write. The df_lol call is left commented out.
df_op, df_read, df_write = (
    filter_outliers(frame) for frame in (df_op, df_read, df_write)
)
#df_lol = filter_outliers(df_lol)
    
    


In [60]:
import plotly.express as px

# For each timing frame show a box plot followed by a grouped histogram with a
# violin marginal, both on a logarithmic y axis.
for frame in (df_lol, df_op, df_read, df_write):
    fig = px.box(data_frame=frame, log_y=True)
    fig.show()

    fig = px.histogram(data_frame=frame, log_y=True, marginal="violin", barmode="group")
    fig.show()