In [None]:
import zarr
import numpy as np

# Write a one-dimensional array of random float64 values to a Zarr v2 store.
# Any existing store at `path` is replaced (overwrite=True).
path = "c-stuff/data/datasets/test_dataset.zarr"
# 2**27 == 134217728 elements; with 8-byte floats that is 1 GiB of raw data.
# The trailing comma makes this a real 1-tuple: the original `(134217728)`
# was just a parenthesized int.
shape = (2**27,)
variable = "X"
dtype = "f8"

root = zarr.create_group(store=path, zarr_format=2, overwrite=True)
x = root.create_array(name=variable, shape=shape, dtype=dtype)
# The whole array is materialised in memory before being written to the store.
x[:] = np.random.random_sample(shape)

In [1]:
#%%script false --no-raise-error
import pandas as pd

# Benchmark result files, keyed by the frame they are loaded into.
# NOTE(review): the "c_copy" entry points at the same file as "c", whereas the
# Python copy reads a separate "... copy.json" file - confirm this is intended.
bench_sources = {
    "c": "data/plotting/plotting_bench_c.json",
    "c_copy": "data/plotting/plotting_bench_c.json",
    "python": "data/plotting/plotting_bench_python.json",
    "python_copy": "data/plotting/plotting_bench_python copy.json",
}

# Loaded one by one, in the same order as before.
df_bench_c = pd.read_json(path_or_buf=bench_sources["c"])
df_bench_c_copy = pd.read_json(path_or_buf=bench_sources["c_copy"])
df_bench_python = pd.read_json(path_or_buf=bench_sources["python"])
df_bench_python_copy = pd.read_json(path_or_buf=bench_sources["python_copy"])

In [None]:
import plotly.express as px 
import numpy as np

save_image = True

# One fixed colour per engine, shared by the line and the bar figure of a set.
python_engine_colors = {
    "zarr-python": '#E69F00',
    "netcdf4-python": '#56B4E9',
    "hdf5-python": '#009E73',
    "netcdf4-python-parallel": '#0072B2',
    "hdf5-python-parallel": '#D55E00',
}
c_engine_colors = {
    "netcdf4-c": '#56B4E9',
    "hdf5-c": '#009E73',
    "netcdf4-c-parallel": '#0072B2',
    "hdf5-c-parallel": '#D55E00',
    "hdf5-async": '#CC79A7',
    "hdf5-subfiling": '#E69F00',
}


def summarize_by_format(bench):
    """Collapse raw benchmark rows into one summary row per distinct ``format``.

    Parameters
    ----------
    bench : pandas.DataFrame
        Needs the columns "format", "time taken", "total filesize", "engine"
        and "run". "total filesize" must hold strings whose first
        whitespace-separated token parses as a float and whose second token
        is the size unit.

    Returns
    -------
    pandas.DataFrame
        One row per format, in order of first appearance, with the columns
        "mean time", "format", "engine", "run", "total filesize",
        "data rate" and "unit". "engine", "run" and "total filesize" are
        taken from the first row of each format.
    """
    records = []
    for fmt in bench["format"].drop_duplicates(ignore_index=True):
        runs = bench[bench["format"].values == fmt]
        mean_time = runs["time taken"].mean()
        total_filesize = runs["total filesize"].iat[0]
        size_parts = total_filesize.split()
        records.append({
            "mean time": mean_time,
            "format": fmt,
            "engine": runs["engine"].iat[0],
            "run": runs["run"].iat[0],
            "total filesize": total_filesize,
            # size value divided by the mean time of this format's runs
            "data rate": float(size_parts[0]) / mean_time,
            "unit": f"{size_parts[1]}/s",
        })
    # Build the frame once instead of concatenating onto it row by row.
    return pd.DataFrame(records)


def plot_filesize_summary(summary, prefix, save, color_map=None):
    """Show the mean-time line chart and the throughput bar chart for ``summary``.

    Parameters
    ----------
    summary : pandas.DataFrame
        Frame produced by ``summarize_by_format``.
    prefix : str
        Leading part of the image file names ("python", "c", "both").
    save : bool
        When true, both figures are also written as SVG files under "images/".
    color_map : dict or None
        Optional engine -> colour mapping handed to plotly express.

    Returns
    -------
    tuple
        ``(bar_figure, unit)`` where ``unit`` is the "unit" value of the first
        summary row; that single unit is used for the bar chart title.
    """
    line_fig = px.line(
        data_frame=summary,
        log_y=True,
        x="total filesize",
        y="mean time",
        color="engine",
        markers=True,
        hover_data=["format"],
        title="mean time in seconds (s)",
        color_discrete_map=color_map,
    )
    line_fig.show()
    if save:
        line_fig.write_image(f"images/{prefix}-line-total-filesize.svg", width=900, height=500, scale=1)

    # NOTE(review): only the first row's unit is used for the title - assumes
    # every row of the summary reports its size in the same unit; confirm.
    rate_unit = summary["unit"].iat[0]
    bar_fig = px.bar(
        data_frame=summary,
        x="total filesize",
        y="data rate",
        color="engine",
        barmode='group',
        hover_data=["format"],
        title=f"throughput in {rate_unit}",
        color_discrete_map=color_map,
    )
    bar_fig.show()
    if save:
        # File name kept as "box-datarate" although the figure is a bar chart,
        # so existing references to the image keep working.
        bar_fig.write_image(f"images/{prefix}-box-datarate.svg", width=900, height=500, scale=1)
    return bar_fig, rate_unit


# Each section plots mean time and throughput against total filesize.

# python plotting
df = df_bench_python
smth = summarize_by_format(df)
fig, unit = plot_filesize_summary(smth, "python", save_image, python_engine_colors)

# c plotting
df = df_bench_c
smth = summarize_by_format(df)
fig, unit = plot_filesize_summary(smth, "c", save_image, c_engine_colors)

# plotting both (no fixed colour map: plotly picks the colours)
df = df_bench_both = pd.concat([df_bench_python, df_bench_c], ignore_index=True)
smth = summarize_by_format(df)
fig, unit = plot_filesize_summary(smth, "both", save_image)




In [1]:
#%%script false --no-raise-error
import pandas as pd

# Result files for the chunk-size benchmarks. Loading them rebinds
# df_bench_c / df_bench_python to the chunk data.
# NOTE(review): both "*_copy" entries point at non-chunk files - confirm
# that is intended.
chunk_bench_sources = {
    "c": "data/plotting/plotting_bench_c_chunks.json",
    "c_copy": "data/plotting/plotting_bench_c.json",
    "python": "data/plotting/plotting_bench_python_chunks.json",
    "python_copy": "data/plotting/plotting_bench_python copy.json",
}

# Loaded one by one, in the same order as before.
df_bench_c = pd.read_json(path_or_buf=chunk_bench_sources["c"])
df_bench_c_copy = pd.read_json(path_or_buf=chunk_bench_sources["c_copy"])
df_bench_python = pd.read_json(path_or_buf=chunk_bench_sources["python"])
df_bench_python_copy = pd.read_json(path_or_buf=chunk_bench_sources["python_copy"])

In [None]:
import plotly.express as px 
import numpy as np

save_image = False


def summarize_chunks_by_format(bench):
    """Collapse raw chunk-benchmark rows into one summary row per ``format``.

    Parameters
    ----------
    bench : pandas.DataFrame
        Needs the columns "format", "time taken", "total filesize", "engine",
        "run" and "filesize per chunk". "total filesize" must hold strings
        whose first whitespace-separated token parses as a float and whose
        second token is the size unit.

    Returns
    -------
    pandas.DataFrame
        One row per format, in order of first appearance, with the columns
        "mean time", "format", "engine", "run", "total filesize",
        "data rate", "unit" and "filesize per chunk". Every column except
        "mean time", "data rate" and "unit" is taken from the first row of
        the format.
    """
    records = []
    for fmt in bench["format"].drop_duplicates(ignore_index=True):
        runs = bench[bench["format"].values == fmt]
        mean_time = runs["time taken"].mean()
        total_filesize = runs["total filesize"].iat[0]
        size_parts = total_filesize.split()
        records.append({
            "mean time": mean_time,
            "format": fmt,
            "engine": runs["engine"].iat[0],
            "run": runs["run"].iat[0],
            "total filesize": total_filesize,
            # size value divided by the mean time of this format's runs
            "data rate": float(size_parts[0]) / mean_time,
            "unit": f"{size_parts[1]}/s",
            "filesize per chunk": runs["filesize per chunk"].iat[0],
        })
    # Build the frame once instead of concatenating onto it row by row.
    return pd.DataFrame(records)


def plot_chunk_summary(summary, prefix, save):
    """Show mean time and data rate against "filesize per chunk".

    Parameters
    ----------
    summary : pandas.DataFrame
        Frame produced by ``summarize_chunks_by_format``.
    prefix : str
        Leading part of the image file names ("python", "c", "both").
    save : bool
        When true, both figures are also written as SVG files under "images/".

    Returns
    -------
    tuple
        ``(bar_figure, unit)`` where ``unit`` is the "unit" value of the first
        summary row; that single unit is used for the bar chart title.
    """
    line_fig = px.line(
        data_frame=summary,
        log_y=True,
        x="filesize per chunk",
        y="mean time",
        color="engine",
        markers=True,
        hover_data=["format"],
        title="mean time in seconds (s)",
    )
    line_fig.show()
    if save:
        line_fig.write_image(f"images/{prefix}-line-total-filesize-chunks.svg", width=900, height=500, scale=1)

    rate_unit = summary["unit"].iat[0]
    bar_fig = px.bar(
        data_frame=summary,
        x="filesize per chunk",
        y="data rate",
        color="engine",
        barmode='group',
        hover_data=["format"],
        title=f"data rate in {rate_unit}",
    )
    bar_fig.show()
    if save:
        # "-chunks" suffix added: without it this file name is identical to the
        # one written by the total-filesize plots and would overwrite it.
        bar_fig.write_image(f"images/{prefix}-box-datarate-chunks.svg", width=900, height=500, scale=1)
    return bar_fig, rate_unit


# Each section plots the development in time taken and data rate by chunk size.

# python plotting
df = df_bench_python
smth = summarize_chunks_by_format(df)
fig, unit = plot_chunk_summary(smth, "python", save_image)

# c plotting
df = df_bench_c
smth = summarize_chunks_by_format(df)
fig, unit = plot_chunk_summary(smth, "c", save_image)

# plotting both
df = df_bench_both = pd.concat([df_bench_python, df_bench_c], ignore_index=True)
smth = summarize_chunks_by_format(df)
fig, unit = plot_chunk_summary(smth, "both", save_image)




In [None]:
# Distribution plots of the raw (un-aggregated) timings in `df`.
# NOTE(review): `df` is whatever the previously executed plotting cell left
# behind, and the hover data needs a "filesize per chunk" column - presumably
# this cell is meant to run after the chunk benchmarks; confirm.

# plot runs by engine
fig = px.box(data_frame=df, log_y=True, x="engine", y="time taken", color="run", hover_data=["format", "filesize per chunk"])
fig.show()

# plot time taken by total filesize, coloured by engine
fig = px.box(data_frame=df, log_y=True, x="total filesize", y="time taken", color="engine", hover_data=["format", "filesize per chunk"], title="total filesize")
fig.show()

# one box plot per engine, coloured by total filesize
for engine in df["engine"].drop_duplicates(ignore_index=True):
    df_engine = df[df["engine"].values == engine]
    fig = px.box(data_frame=df_engine, log_y=True, x="engine", y="time taken", color="total filesize")
    fig.show()

# timing distribution of every format across all runs
fig = px.violin(data_frame=df, log_x=True, x="time taken", y="format", color="engine")
fig.show()

# The per-run box plot used to be built on every iteration and then thrown
# away (its show() call was commented out). It is now opt-in.
show_run_boxes = False

# one violin plot (and optionally one box plot) per run
for run in df["run"].drop_duplicates(ignore_index=True):
    df_run = df[df["run"].values == run]

    if show_run_boxes:
        fig = px.box(data_frame=df_run, log_y=True, x="engine", y="time taken", color="total filesize", points="all")
        fig.show()

    fig = px.violin(data_frame=df_run, log_x=True, x="time taken", y="total filesize", color="engine")
    fig.show()