In [None]:
import zarr
import numpy as np

# Number of float64 elements in the benchmark array (2**27, i.e. 1 GiB of raw data).
N_ELEMENTS = 134217728

# Create the Zarr v3 group on the local filesystem, replacing any existing one.
root = zarr.create_group(store="c-stuff/data/datasets/test_dataset.zarr", zarr_format=3, overwrite=True)

# One-dimensional float64 array "X"; the shape is passed as an explicit 1-tuple
# (a parenthesised bare integer is just an int, not a tuple).
x = root.create_array(name="X", shape=(N_ELEMENTS,), dtype="f8", filters=None)
x[:] = np.random.random_sample(N_ELEMENTS)

# Last expression of the cell, so the notebook renders the full array summary.
root["X"].info_complete()

Type               : Array
Zarr format        : 3
Data type          : DataType.float64
Shape              : (134217728,)
Chunk shape        : (262144,)
Order              : C
Read-only          : False
Store type         : LocalStore
Filters            : ()
Serializer         : BytesCodec(endian=<Endian.little: 'little'>)
Compressors        : (ZstdCodec(level=0, checksum=False),)
No. bytes          : 1073741824 (1.0G)
No. bytes stored   : 1007889652
Storage ratio      : 1.1
Chunks Initialized : 0

In [None]:
# Show the short summary of array "X" from the group currently bound to `root`.
# NOTE(review): the saved output of this cell reports "Zarr format: 2" with a Blosc
# compressor, while the creating cell requests zarr_format=3 — presumably stale
# output from an earlier run; re-run the notebook to confirm.
root["X"].info

Type               : Array
Zarr format        : 2
Data type          : float64
Shape              : (134217728,)
Chunk shape        : (262144,)
Order              : C
Read-only          : False
Store type         : LocalStore
Filters            : ()
Compressors        : (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)
No. bytes          : 1073741824 (1.0G)

In [23]:
import zarr
# Small demo array: ten float32 values split into chunks of two elements each.
arr = zarr.create(
    dtype="float32",
    shape=(10,),
    chunks=(2,),
)

# Last expression of the cell, so the notebook renders the array summary.
arr.info

Type               : Array
Zarr format        : 3
Data type          : DataType.float32
Shape              : (10,)
Chunk shape        : (2,)
Order              : C
Read-only          : False
Store type         : MemoryStore
Filters            : ()
Serializer         : BytesCodec(endian=<Endian.little: 'little'>)
Compressors        : (ZstdCodec(level=0, checksum=False),)
No. bytes          : 40

In [2]:
import netCDF4
import numpy as np

# Number of float64 values stored in variable "X" (2**27).
N_ELEMENTS = 134217728

# The context manager closes the file even if creating or filling the variable fails.
with netCDF4.Dataset("c-stuff/data/datasets/test_dataset.nc", "w", format="NETCDF4") as root:
    # NOTE(review): "/" is the root group's own path, so this call is presumably
    # redundant — confirm before removing.
    root.createGroup("/")
    root.createDimension("u", N_ELEMENTS)

    # One-dimensional float64 variable along dimension "u", filled with random samples.
    x = root.createVariable("X", "f8", "u")
    x[:] = np.random.random_sample(N_ELEMENTS)

    # A bare `root` in the middle of a cell is never displayed; print the summary explicitly.
    print(root)

In [1]:
import h5py
import numpy as np

# Number of float64 values stored in dataset "X" (2**27).
N_ELEMENTS = 134217728

# Mode "w-" creates the file and fails if it already exists, so an existing
# benchmark file is never overwritten. The context manager closes the file even
# if creating or filling the dataset fails.
with h5py.File("c-stuff/data/datasets/test_dataset.h5", "w-") as root:
    # The shape is passed as an explicit 1-tuple (a parenthesised bare integer is just an int).
    x = root.create_dataset("X", shape=(N_ELEMENTS,), dtype="f8")
    x[:] = np.random.random_sample(N_ELEMENTS)

    # A bare `root` in the middle of a cell is never displayed; print the summary explicitly.
    print(root)

In [4]:
import pandas as pd
import numpy as np

# Gather the timing log of every benchmark object into one frame, one column per engine.
# The ds_zarr / ds_hdf5 / ds_netcdf4 objects are not created in this cell and must
# already exist in the kernel.
df_bench = pd.DataFrame()
df_bench.insert(loc=0, column="bench zarr", value=ds_zarr.log)
df_bench.insert(loc=1, column="bench hdf5", value=ds_hdf5.log)
df_bench.insert(loc=2, column="bench netcdf4", value=ds_netcdf4.log)

def filter_outliers(df, threshold: float = 2.22) -> pd.DataFrame:
    """Drop rows whose numeric values lie too far from their column median.

    For every numeric column the distance of each value from the column median
    is divided by the column's interquartile range (IQR). Values whose scaled
    distance is not below ``threshold`` are replaced by NaN, and afterwards
    every row containing a NaN in any column (numeric or not) is dropped.

    Values that sit exactly on the median are always kept. Without that rule a
    column with zero IQR (e.g. a constant column) yields 0 / 0 = NaN for every
    value, and the whole frame would be discarded.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter. It is modified in place.
    threshold : float, default 2.22
        Largest allowed distance from the median, in units of the IQR (exclusive).

    Returns
    -------
    pd.DataFrame
        The same object as ``df``, after filtering.
    """
    cols = df.select_dtypes('number').columns
    numeric = df.loc[:, cols]

    iqr = numeric.quantile(0.75, numeric_only=False) - numeric.quantile(0.25, numeric_only=False)
    deviation = (numeric - numeric.median()).abs()

    # `deviation == 0` rescues on-median values when the IQR is zero; for a
    # positive IQR such values already pass the first comparison.
    keep = ((deviation / iqr) < threshold) | (deviation == 0)

    df.loc[:, cols] = numeric.where(keep, np.nan)
    df.dropna(inplace=True)

    return df

# Switch for the optional outlier-filtering step. It is off by default, so
# df_bench is left unfiltered unless this is set to True.
APPLY_OUTLIER_FILTER = False

if APPLY_OUTLIER_FILTER:
    df_bench = filter_outliers(df_bench)

In [None]:
import func.datastruct as ds

# Layout handed to Datastruct.create: one entry per variable name.
# NOTE(review): each value looks like a (shape, chunk shape) pair — confirm against func.datastruct.
form = {
    "X": ([512, 512, 512], [512, 512, 1]),
    "Y": ([10, 10], [2, 2]),
}

# Create the HDF5-backed test file from that layout.
ds_test = ds.Datastruct()
ds_test.create(engine="hdf5", path="test.h5", form=form)

In [None]:
import func.datastruct as ds
# Re-open the HDF5 test file in "r+" mode with a fresh Datastruct.
ds_test = ds.Datastruct()
ds_test.open(path="test.h5", engine="hdf5", mode="r+")

# Read variable "X" completely and print it.
print(ds_test.dataset["X"][:])

In [None]:
import func.datastruct as ds

# Layout handed to Datastruct.create: one entry per variable name.
# NOTE(review): each value looks like a (shape, chunk shape) pair — confirm against func.datastruct.
form = {
    "X": ([512, 512, 512], [512, 512, 1]),
    "Y": ([10, 10], [2, 2]),
}

# Create the NetCDF4-backed test file from that layout.
ds_test = ds.Datastruct()
ds_test.create(engine="netcdf4", path="test.nc", form=form)

In [None]:
# Re-open the NetCDF4 test file in "r+" mode.
# `ds_test` and `ds` are not defined in this cell; they must already exist in the kernel.
ds_test.open(path="test.nc", engine="netcdf4", mode="r+")

# Print the entry for variable "Y" from the dataset's variables mapping.
print(ds_test.dataset.variables["Y"])

In [59]:
%%script false --no-raise-error
import pandas as pd

# Load benchmark results from a JSON file into df_bench.
# The `%%script false --no-raise-error` magic at the top of this cell keeps it
# from being executed as Python, so this assignment is currently disabled.
df_bench = pd.read_json("data/plotting/plotting_bench(1).json")

In [26]:
#%%script false --no-raise-error
import pandas as pd

# Benchmark results that the plotting cell below visualises.
df_bench = pd.read_json("data/plotting/plotting_bench_variable.json")
# Results kept for comparison; the plotting cell writes this file when
# `save_previous` is set.
df_prev = pd.read_json("data/plotting/plotting_previous.json")

In [9]:
#%%script false --no-raise-error
import pandas as pd

import plotly.express as px

# Directory holding the JSON timing logs of the C benchmark programs.
C_BENCH_DIR = "/home/dev/dkrz_dev/c-stuff"

# (JSON file name, column holding the read timings, engine label) per benchmark variant.
C_BENCH_RUNS = [
    ("test_hdf5-c.json", "hdf5-c-read", "hdf5-c"),
    ("test_hdf5-c_async.json", "hdf5-c-async-read", "hdf5-async"),
    ("test_hdf5-c_parallel.json", "hdf5-c-read-parallel", "hdf5-parallel"),
    ("test_hdf5-c_async_parallel.json", "hdf5-c-async-read-parallel", "hdf5-async-parallel"),
    ("test_hdf5_subfiling.json", "hdf5-subfiling-read", "hdf5-subfiling"),
    #("test_netcdf4.json", "netcdf4-read", "netcdf4-c"),
]


def _load_c_bench(file_name, column, engine):
    """Read one benchmark log and return it in the long format used for plotting.

    The returned frame has one row per timing, with the constant columns
    "format" (``<engine>-134217728-134217728``), "run" (always 1) and "engine".
    """
    timings = pd.read_json(f"{C_BENCH_DIR}/{file_name}")[column].tolist()
    return pd.DataFrame(
        data={
            "time taken": timings,
            "format": f"{engine}-134217728-134217728",
            "run": 1,
            "engine": engine,
        }
    )


# Build all per-engine frames first and concatenate once, instead of growing
# the frame step by step starting from an empty DataFrame.
df = pd.concat(
    [_load_c_bench(file_name, column, engine) for file_name, column, engine in C_BENCH_RUNS],
    ignore_index=True,
)

# One box per engine on a logarithmic time axis.
fig = px.box(data_frame=df, log_y=True, x="engine", y="time taken", color="run", hover_data=["format"])
fig.show()


In [27]:
import plotly.express as px 
import plotly.graph_objects as go
import numpy as np

#pd.options.plotting.backend = "plotly"

# plot_previous: also draw every comparison plot for the previously saved results (df_prev).
# save_previous: at the end, store the current results as the new "previous" data.
plot_previous = True
save_previous = True

# `df_bench` and `df_prev` are loaded by an earlier cell; `pd` must already be imported.
df = df_bench


def _mean_time_per_format(frame):
    """Return one row per distinct "format" value with its mean "time taken".

    The "engine", "run" and "filesize per chunk" values are copied from the
    first row of each format. Rows keep the order in which the formats first
    appear in ``frame``.
    """
    columns = ["mean time", "format", "engine", "run", "filesize per chunk"]
    records = [
        {
            "mean time": group["time taken"].mean(),
            "format": format_name,
            "engine": group["engine"].iat[0],
            "run": group["run"].iat[0],
            "filesize per chunk": group["filesize per chunk"].iat[0],
        }
        for format_name, group in frame.groupby("format", sort=False)
    ]
    return pd.DataFrame(records, columns=columns)


# plot development in time taken by chunksize
smth = _mean_time_per_format(df)
if plot_previous:
    prev_smth = _mean_time_per_format(df_prev)

fig = px.line(
    data_frame=smth,
    log_y=True,
    x="filesize per chunk",
    y="mean time",
    color="engine",
    markers=True,
    hover_data=["format"],
    title="loading complete dataset with varying chunksizes",
)
fig.show()

if plot_previous:
    fig = px.line(
        data_frame=prev_smth,
        log_y=True,
        x="filesize per chunk",
        y="mean time",
        color="engine",
        markers=True,
        hover_data=["format"],
        title="previous: loading complete dataset with varying chunksizes",
    )
    fig.show()

# plot runs by engine
fig = px.box(data_frame=df, log_y=True, x="engine", y="time taken", color="run", hover_data=["format"])
fig.show()

if plot_previous:
    fig = px.box(data_frame=df_prev, log_y=True, x="engine", y="time taken", color="run", hover_data=["format"])
    fig.show()

# plot formats by chunksize
fig = px.box(
    data_frame=df,
    log_y=True,
    x="filesize per chunk",
    y="time taken",
    color="engine",
    hover_data=["format"],
    title="filesize per chunk",
)
fig.show()

if plot_previous:
    fig = px.box(
        data_frame=df_prev,
        log_y=True,
        x="filesize per chunk",
        y="time taken",
        color="engine",
        hover_data=["format"],
        title="previous: filesize per chunk",
    )
    fig.show()

# one box plot per engine, split by chunk size
for engine in df["engine"].drop_duplicates(ignore_index=True):
    df_engine = df[df["engine"].values == engine]
    fig = px.box(data_frame=df_engine, log_y=True, x="engine", y="time taken", color="filesize per chunk")
    fig.show()

# distribution of the timings per format
fig = px.violin(data_frame=df, log_x=True, x="time taken", y="format", color="engine")
fig.show()

# one violin plot per run, split by chunk size
for run in df["run"].drop_duplicates(ignore_index=True):
    df_run = df[df["run"].values == run]
    fig = px.violin(data_frame=df_run, log_x=True, x="time taken", y="filesize per chunk", color="engine")
    fig.show()

if save_previous:
    df.to_json("data/plotting/plotting_previous.json")

In [16]:
%%script false --no-raise-error
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

# Parallel NetCDF write test: every MPI process stores its own rank at the
# index equal to that rank. The `%%script false --no-raise-error` magic at the
# top of this cell keeps it from being executed as Python.

# Rank of this process within the world communicator.
rank = MPI.COMM_WORLD.rank
# Open the output file for writing with parallel access requested.
nc = Dataset('parallel_test.nc','w',parallel=True)
# Dimension "dim" has a fixed length of 4.
# NOTE(review): presumably the job is meant to run with at most 4 processes — confirm.
d = nc.createDimension('dim',4)
# 64-bit integer variable "var" along "dim".
v = nc.createVariable('var', np.int64, 'dim')
# Each process writes only the element at its own rank.
v[rank] = rank
nc.close()

In [None]:
%%script false --no-raise-error
import pandas as pd

# Scratch test for selecting rows by a group id with a boolean mask.
# The `%%script false --no-raise-error` magic at the top of this cell keeps it
# from being executed as Python.
log = [0, 1, 2, 3, 4, 5, 6, 7]
log2 = [7, 15, 154, 155, 566,7 ,8]

# One long-format frame per log; "format" and "id" are constant within each frame.
df_tmp = pd.DataFrame(data={"time taken": log, "format": f"lol-{5}-{5}", "id": 0})
df_tmp2 = pd.DataFrame(data={"time taken": log2, "format": f"lol-{6}-{6}", "id": 1})

# Concatenate both frames in one step instead of appending to an empty DataFrame.
df_test = pd.concat([df_tmp, df_tmp2], ignore_index=True)

#print(df_test)

filters = df_test["id"].drop_duplicates(ignore_index=True)

# The loop variable is deliberately not called `id`, which would shadow the builtin.
for group_id in filters:

    mask = df_test["id"].values == group_id
    some = df_test[mask]

    print(f"value picked for {group_id} are {some}")