In [1]:
import os
import pandas as pd
import humanfriendly
import numpy as np
from pathlib import Path

In [2]:
def parse_size(values):
    size = np.zeros(values.shape, dtype=int)
    for j, val in enumerate(values):
        size[j] = humanfriendly.parse_size(val)
    return size


In [5]:
fn = "zarr_inspect.csv"
df_zarr = pd.read_csv(fn)
df_zarr["stored_bytes"] = parse_size(df_zarr.stored.values)
df_zarr.sort_values("stored_bytes", ascending=False, inplace=True)
df_zarr

Unnamed: 0,name,dtype,stored,size,ratio,nchunks,chunk_size,avg_chunk_stored,shape,chunk_shape,compressor,filters,stored_bytes
0,/call_GQ,int8,1.89 GiB,163.47 GiB,87.0,17900,9.35 MiB,110.55 KiB,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,2029372047
1,/call_genotype,int8,914.22 MiB,326.93 GiB,370.0,17900,18.7 MiB,52.3 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,958629150
2,/call_RGQ,int16,729.57 MiB,326.93 GiB,460.0,17900,18.7 MiB,41.74 KiB,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,765009592
3,/call_genotype_mask,bool,606.02 MiB,326.93 GiB,550.0,17900,18.7 MiB,34.67 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,635458027
4,/call_genotype_phased,bool,17.17 MiB,163.47 GiB,9700.0,17900,9.35 MiB,1006 bytes,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,18004049
5,/variant_allele,object,4.74 MiB,518.41 MiB,110.0,716,741.42 KiB,6.78 KiB,"(715256, 95)","(1000, 95)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",[VLenUTF8()],4970250
6,/variant_filter,bool,2.87 MiB,2.73 MiB,0.95,716,3.9 KiB,4.1 KiB,"(715256, 4)","(1000, 4)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,3009413
7,/variant_AN,int32,908.75 KiB,2.73 MiB,3.1,716,3.9 KiB,1.27 KiB,"(715256,)","(1000,)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,930560
8,/variant_position,int32,810.77 KiB,2.73 MiB,3.4,716,3.9 KiB,1.13 KiB,"(715256,)","(1000,)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,830228
9,/sample_id,object,357.15 KiB,1.87 MiB,5.4,25,76.69 KiB,14.29 KiB,"(245394,)","(10000,)","Blosc(cname='zstd', clevel=7, shuffle=SHUFFLE,...",[VLenUTF8()],365721


In [9]:
total = df_zarr.stored_bytes.sum()

humanfriendly.format_size(total, binary=True)

'4.11 GiB'

In [10]:
df_zarr.nchunks.sum()

95973

In [11]:
df_display_table = pd.DataFrame({
    "Field":df_zarr.name,
    "type": df_zarr.dtype,
    "storage":df_zarr.stored,
    "compress": df_zarr.ratio,
    "percentage": df_zarr.stored_bytes / total})
df_display_table.sort_values("percentage", ascending=False, inplace=True)
df_display_table["percentage"] = df_display_table["percentage"].map('{:.2%}'.format)
df_display_table["compress"] = df_display_table["compress"].map('{:.1f}'.format)
df_display_table

Unnamed: 0,Field,type,storage,compress,percentage
0,/call_GQ,int8,1.89 GiB,87.0,45.94%
1,/call_genotype,int8,914.22 MiB,370.0,21.70%
2,/call_RGQ,int16,729.57 MiB,460.0,17.32%
3,/call_genotype_mask,bool,606.02 MiB,550.0,14.39%
4,/call_genotype_phased,bool,17.17 MiB,9700.0,0.41%
5,/variant_allele,object,4.74 MiB,110.0,0.11%
6,/variant_filter,bool,2.87 MiB,0.9,0.07%
7,/variant_AN,int32,908.75 KiB,3.1,0.02%
8,/variant_position,int32,810.77 KiB,3.4,0.02%
9,/sample_id,object,357.15 KiB,5.4,0.01%


In [12]:
print(df_display_table.to_latex(index=False))

\begin{tabular}{lllll}
\toprule
Field & type & storage & compress & percentage \\
\midrule
/call_GQ & int8 & 1.89 GiB & 87.0 & 45.94% \\
/call_genotype & int8 & 914.22 MiB & 370.0 & 21.70% \\
/call_RGQ & int16 & 729.57 MiB & 460.0 & 17.32% \\
/call_genotype_mask & bool & 606.02 MiB & 550.0 & 14.39% \\
/call_genotype_phased & bool & 17.17 MiB & 9700.0 & 0.41% \\
/variant_allele & object & 4.74 MiB & 110.0 & 0.11% \\
/variant_filter & bool & 2.87 MiB & 0.9 & 0.07% \\
/variant_AN & int32 & 908.75 KiB & 3.1 & 0.02% \\
/variant_position & int32 & 810.77 KiB & 3.4 & 0.02% \\
/sample_id & object & 357.15 KiB & 5.4 & 0.01% \\
/variant_length & int16 & 152.4 KiB & 9.2 & 0.00% \\
/variant_id & object & 56.18 KiB & 99.0 & 0.00% \\
/variant_quality & float32 & 51.92 KiB & 54.0 & 0.00% \\
/variant_id_mask & bool & 51.89 KiB & 13.0 & 0.00% \\
/variant_contig & int16 & 50.6 KiB & 28.0 & 0.00% \\
/region_index & int32 & 13.93 KiB & 1.2 & 0.00% \\
/contig_length & int64 & 9.24 KiB & 2.8 & 0.00% \\
/con