In [None]:
from notebooks import *
# Set the log level of `memory` (provided by the star-import above) to 'debug'
#   - Presumably this makes cache hits/misses visible in cell output -- TODO confirm
memory.log.level = 'debug'
# Initialize sg without an app; later cells read sg.search_recs
#   - NOTE(review): assumes init is what makes sg.search_recs available -- confirm
sg.init(app=None)

# Faster(/smaller) search_recs load from disk
- See takeaways at bottom

In [None]:
# Inspect types + values
#   - Take only the first row and transpose it, so each original column becomes one row
#   - Add a `type` col holding the python type name of each value in that row
#   - NOTE(review): df[0] assumes the first row's index label is 0 -- confirm
#   - NOTE(review): df_assign_first presumably behaves like .assign but puts the new col first -- confirm
(sg.search_recs
    [:1].T
    .pipe(df_assign_first, type=lambda df: df[0].map(lambda x: type(x).__name__))
)

In [None]:
# How big are f_*?
#   - float16 or float32? -- see takeaways below
#   - For each candidate dtype: optionally convert f_f/f_p to np.ndarray of that dtype, measure deep per-column
#     memory usage, print the total, and display the 5 largest columns with running totals
print(f'shape: {sg.search_recs.shape}')
for dtype in [
    # Trailing comments are recorded results from a previous run (None = leave f_* as is, no conversion)
    None,        # total[703m] = no_f_*[116m] + f_*[587m] (= f_f[479m] + f_p[108m]) -- List[float]
    np.float64,  # total[639m] = no_f_*[116m] + f_*[523m] (= f_f[426m] + f_p[ 97m]) -- Same as np.float (np default)
    np.float32,  # total[381m] = no_f_*[116m] + f_*[265m] (= f_f[215m] + f_p[ 50m])
    np.float16,  # total[252m] = no_f_*[116m] + f_*[136m] (= f_f[109m] + f_p[ 27m])
]:
    # mem: Series of bytes per column (deep=True also counts the objects held in object cols), largest first
    mem = (sg.search_recs
        # [:1000]  # XXX Faster dev
        # Only convert when a dtype is given; dtype=None measures the df unchanged
        .pipe(lambda df: df if dtype is None else (df
            .pipe(df_col_map,
                f_f=lambda xs: np.array(xs, dtype=dtype),  # List[float] -> np.ndarray
                f_p=lambda xs: np.array(xs, dtype=dtype),  # List[float] -> np.ndarray
            )
        ))
        .memory_usage(deep=True)
        .sort_values(ascending=False)
    )
    print()
    print(f'dtype: {dtype}')
    print(f'total: {humanize.naturalsize(mem.sum())}')
    display(mem
        # Unnamed Series -> frame with its single col (0) renamed to 'mem'
        .to_frame().rename(columns={0: 'mem'})
        # rev_cumsum: running total accumulated from the smallest col upwards
        .sort_values('mem', ascending=True)
        .assign(rev_cumsum=lambda df: df.mem.cumsum())
        # cumsum: running total accumulated from the largest col downwards
        .sort_values('mem', ascending=False)
        .assign(cumsum=lambda df: df.mem.cumsum())
        # Bytes -> human-readable strs (do this last, since it turns the numbers into strs)
        .applymap(humanize.naturalsize)
        # Show only the 5 largest cols
        [:5]
    )

In [None]:
# Perf (results recorded in next cell)
def measure(path, msg, f):
    """
    Run f and print its time, the mem delta around it, and the size of the file at path, all on one line

    - path: file that f writes or reads; it is stat'ed after f has run, so it must exist by then
    - msg: short label printed next to path (e.g. 'write', 'read')
    - f: the callable to measure, handed to timed_print
    - Returns whatever timed_print returns
    """
    # Every field except the last is printed without a trailing newline, so that they all share one line
    print_inline = partial(print, end='')
    mem_delta_kwargs = dict(
        collect_before=True,
        collect_after=False,
        desc=None,
        # Report uss instead of rss/vms [https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_full_info]
        format='uss[%(uss)9s]',
        print=print_inline,
    )
    with print_mem_delta(**mem_delta_kwargs):
        # '%%s' is left behind as a literal '%s' placeholder in the msg passed on to timed_print
        label = f'{path} {msg}'
        timed_msg = 'time[%%s] %-55s' % label
        # Keep the result referenced until the with block exits, so the mem delta is taken before it can be collected
        ret = timed_print(f=f, msg=timed_msg, print=print_inline)
    # The file size goes last and ends the line
    file_size = Path(path).stat().st_size
    print(' file[%8s]' % naturalsize(file_size))
    return ret

# Benchmark config, read by the loop below
#   - NOTE File formats are toggled by key presence (`'pkl' in run`), not by value: comment out a format's
#     line to skip it (setting it to None does not disable it)
run = AttrDict(

    # Params
    version=5,  # Only used as part of the output file names
    # Number of leading rows of sg.search_recs to benchmark (pick one; smaller = faster dev)
    n=(
        len(sg.search_recs) // 100
        # len(sg.search_recs) // 10
        # len(sg.search_recs)
    ),
    # dtypes to convert f_f/f_p to before serdes (one benchmark pass per dtype)
    array_dtypes=[
        np.float16,
        np.float32,
    ],

    # File formats
    pkl=None,  # Slow
    # For parquet the value matters: one write/read pass per compression listed
    parquet=[
        'uncompressed',  # Fast
        # 'gzip',  # Slow
        # 'snappy',  # Not installed [should be faster than gzip]
    ],
    sqlite=None,  # Fast

)

# For each array dtype: build a serdes-ready df, then time write + read for each enabled file format
#   - Formats are enabled by key presence in run (the value is only used for parquet's compression list)
#   - All lambdas below are called by measure before the next loop iteration, so closing over loop vars is safe
for array_dtype in run.array_dtypes:
    # Shared file stem; each format appends its own extension
    path = f'/tmp/df-{run.version}-{run.n}-{array_dtype.__name__}'
    df = (sg.search_recs
        # Limit (for faster dev)
        [:run.n]
        # HACK Drop df_cell cols (currently just *_stack)
        #   - TODO Format the ones that matter as str/bytes (e.g. html)
        [lambda df: [c for c in df if not c.endswith('_stack')]]
        # ~10x faster to serdes np.array than list (but only slightly more compact)
        .pipe(df_col_map,
            f_f=lambda xs: np.array(xs, dtype=array_dtype),  # List[float] -> np.ndarray
            f_p=lambda xs: np.array(xs, dtype=array_dtype),  # List[float] -> np.ndarray
        )
        # Convert types for serdes
        .pipe(df_col_map,
            feat=np_to_npy_bytes,         # np.array -> bytes (.parquet, .sqlite)
            f_f=np_to_npy_bytes,          # np.array -> bytes (.parquet, .sqlite)
            f_p=np_to_npy_bytes,          # np.array -> bytes (.parquet, .sqlite)
            background=','.join,          # List[str] -> str (.sqlite)
            background_species=','.join,  # List[str] -> str (.sqlite)
        )
    )
    if 'pkl' in run:
        pkl_path = f'{path}.pkl'
        measure(pkl_path, 'write', lambda: joblib.dump(df, pkl_path))
        measure(pkl_path, 'read',  lambda: joblib.load(pkl_path))
    if 'parquet' in run:
        for compression in run.parquet:
            parquet_path = f'{path}.parquet.{compression}'
            measure(parquet_path, 'write', lambda: df.to_parquet(parquet_path, engine='fastparquet', compression=compression))
            measure(parquet_path, 'read',  lambda: pd.read_parquet(parquet_path, engine='fastparquet'))
    if 'sqlite' in run:
        sqlite_path = f'{path}.sqlite'
        eng = sqla.create_engine(f'sqlite:///{sqlite_path}')
        try:
            # NOTE(review): measure stats the file while this transaction is still open, so file[...] (and the
            # read timing) are presumably taken before commit -- confirm the sizes match after the block exits
            with eng.begin() as con:
                measure(sqlite_path, 'write', lambda: df.to_sql('df', con=con, if_exists='replace',
                    chunksize=1000,  # Else mem unsafe (default is to write all rows in same operation)
                    # schema=..., # Any gains to be had here?
                    #   - f_* are TEXT instead of BLOB, but:
                    #       - TEXT accepts BLOB data as is [https://sqlite.org/datatype3.html#type_affinity]
                    #       - (Sanity check) Size in sqlite looks fine: n_elems * dtype_bytes + 128b (constant overhead)
                    #   - No other cols matter for size, so the rest will just be data fidelity and client compat
                ))
                measure(sqlite_path, 'read',  lambda: pd.read_sql_table('df', con=con))
        finally:
            # Release conn pools, also when the write/read above raised (else the engine leaks in the kernel)
            eng.dispose()
    print()

time[00:00.096] /tmp/df-5-352-float16.pkl write                        

uss[  -5.0 MB]

 file[ 17.3 MB]




time[00:00.069] /tmp/df-5-352-float16.pkl read                         

uss[  12.3 kB]

 file[ 17.3 MB]




time[00:00.083] /tmp/df-5-352-float16.parquet.uncompressed write       

uss[  10.2 MB]

 file[  6.2 MB]




time[00:00.018] /tmp/df-5-352-float16.parquet.uncompressed read        

uss[      0 B]

 file[  6.2 MB]




time[00:00.456] /tmp/df-5-352-float16.sqlite write                     

uss[   9.8 MB]

 file[  6.1 MB]




time[00:00.035] /tmp/df-5-352-float16.sqlite read                      

uss[      0 B]

 file[  6.1 MB]







time[00:00.106] /tmp/df-5-352-float32.pkl write                        

uss[  -9.2 MB]

 file[ 21.2 MB]




time[00:00.103] /tmp/df-5-352-float32.pkl read                         

uss[  28.7 kB]

 file[ 21.2 MB]




time[00:00.082] /tmp/df-5-352-float32.parquet.uncompressed write       

uss[   9.1 MB]

 file[  7.5 MB]




time[00:00.019] /tmp/df-5-352-float32.parquet.uncompressed read        

uss[  32.8 kB]

 file[  7.5 MB]




time[00:00.516] /tmp/df-5-352-float32.sqlite write                     

uss[   7.6 MB]

 file[  7.4 MB]




time[00:00.039] /tmp/df-5-352-float32.sqlite read                      

uss[ 278.5 kB]

 file[  7.4 MB]







# Perf (for prev. cell)
```
.pkl                   n[35231] d[float16] file[1.7g] w[ 11s] r[7.0s] uss[904m]
.parquet.uncompressed  n[35231] d[float16] file[607m] w[2.7s] r[1.0s] uss[304m]
.parquet.gzip          n[35231] d[float16] file[530m] w[ 49s] r[8.7s] uss[405m]
.sqlite                n[35231] d[float16] file[610m] w[ 38s] r[1.6s] uss[265m]

.pkl                   n[35231] d[float32] file[2.1g] w[ 13s] r[ 10s] uss[1.1g]
.parquet.uncompressed  n[35231] d[float32] file[736m] w[3.4s] r[1.2s] uss[384m]
.parquet.gzip          n[35231] d[float32] file[652m] w[ 58s] r[ 11s] uss[503m]
.sqlite                n[35231] d[float32] file[740m] w[ 37s] r[2.9s] uss[389m]
```

# Takeaways
- Forget .pkl
    - ~3x bigger, ~4–8x slower to read (vs. .sqlite/.parquet)
- Forget .gzip
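    - ~10x slowdown: ~9x slower to read and ~17–18x slower to write than uncompressed .parquet, for only ~11–13% smaller files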
    - TODO Revisit compression once we start sending payloads to mobile clients.
- Start with float32 instead of float16
    - Size: only ~1.2x bigger (.sqlite, .parquet) -- I was expecting ~2x
    - Read time: ~1.8x for .sqlite, ~1.2x for .parquet
    - TODO Revisit float16 later when we need ~1.2–2x gains
- Start building on .sqlite, since .parquet is a dead end for mobile
    - Size: same (< 1%)
    - Read time: .sqlite ~2x slower than .parquet
    - Write time: .sqlite ~10x slower than .parquet, but not a relevant bottleneck, and dominated by etl compute anyway