In [None]:
from bigearthnet_encoder.encoder import tiff_dir_to_ben_s2_patch
from bigearthnet_common.example_data import get_s2_example_folder_path

s2_path = get_s2_example_folder_path()
# `iterdir()` yields entries in arbitrary, filesystem-dependent order, so sort
# them first: the selected example patch is then the same on every run and
# machine. An empty folder still raises IndexError, as before.
example_patch = sorted(s2_path.iterdir())[0]

In [None]:
# Load the example patch directory into an in-memory S2 patch object
# (presumably one TIFF file per band, judging by the helper's name — TODO confirm).
s2_patch = tiff_dir_to_ben_s2_patch(example_patch)

In [None]:
import sys

# ~160KB per patch with all bands.
# `len()` counts the serialized payload bytes that actually get stored, whereas
# `sys.getsizeof()` also includes the Python object header of the buffer.
# Assumes `dumps()` returns a bytes-like object (it is later used as an LMDB value).
patch_size_in_bytes = len(s2_patch.dumps())
patch_size_in_bytes

In [None]:
# Number of serialized patches that fit into one GiB (2**30 bytes):
# around 6600 patches per GB.
# Original author's note: "80~" (meaning unclear — TODO confirm).
(1 << 30) // patch_size_in_bytes

In [None]:
import lmdb
from rich.progress import Progress
from bigearthnet_patch_interface.s2_interface import BigEarthNet_S2_Patch
import bigearthnet_common.constants as ben_constants
from typing import List

# Benchmark configuration.
SHARD_SIZE = 6_600  # entries per shard
BATCH_SIZE = 16
REPS = 2  # how many shards' worth of entries to generate
TOTAL_TEST_SIZE = REPS * SHARD_SIZE  # total number of fake entries


def interpolate_to_ben_perf(size, seconds):
    """Extrapolate a measured pass-through time to the complete BigEarthNet archive.

    The measurement is scaled linearly by
    ``ben_constants.BEN_COMPLETE_SIZE / size``.

    Args:
        size: Number of entries processed during the measurement
            (presumably in the same unit as ``BEN_COMPLETE_SIZE`` — TODO confirm).
        seconds: Seconds the measurement took.

    Returns:
        A human-readable message with the estimated duration in minutes,
        rounded to two decimal places.

    Raises:
        ZeroDivisionError: If ``size`` is 0.
    """
    interpolated_seconds = ben_constants.BEN_COMPLETE_SIZE / size * seconds
    # `.2f` renders fixed-point minutes. The former `.02` spec meant "two
    # significant digits" and produced output such as `1.8e+02` for long runs.
    return f"Would take {interpolated_seconds / 60:.2f} min to pass through BigEarthNet"


def fake_lmdb_builder(fake_data, keys: List[str], lmdb_path: str = "S2_lmdb.db"):
    """Store the same ``fake_data`` value under every key of an LMDB environment.

    Args:
        fake_data: Value written for each key (must be accepted by ``txn.put``).
        keys: String keys; each one is UTF-8 encoded before being written.
        lmdb_path: Location of the LMDB environment to create or extend.
    """
    max_size = 2**40  # 1 TebiByte upper bound for the memory map
    env = lmdb.open(str(lmdb_path), map_size=max_size, readonly=False)

    try:
        with Progress() as progress:
            task = progress.add_task("Building LMDB archive", total=len(keys))
            for key in keys:
                # One write transaction per key; leaving the `with` block
                # without an exception commits it.
                with env.begin(write=True) as txn:
                    txn.put(key.encode("utf-8"), fake_data)
                # Advance only after the write has been committed.
                progress.update(task, advance=1)
    finally:
        # Release the environment even when a write fails part-way through.
        env.close()


import os

# Zero-padded five-digit string keys ("00000", "00001", ...), one per fake entry.
keys = [f"{i:05}" for i in range(TOTAL_TEST_SIZE)]
# Build the fake archive only when it does not exist yet: the read benchmark
# needs it on a fresh checkout, but rebuilding it on every run would be slow.
# The path matches the default `lmdb_path` of `fake_lmdb_builder`.
if not os.path.exists("S2_lmdb.db"):
    fake_lmdb_builder(s2_patch.dumps(), keys)

In [None]:
def read_fake_lmdb(keys, lmdb_path="S2_lmdb.db"):
    """Read and deserialize every entry named in ``keys`` from an LMDB environment.

    The deserialized patches are discarded; the function exists to measure
    how long a full pass over the stored entries takes.

    Args:
        keys: String keys; each one is UTF-8 encoded before the lookup.
        lmdb_path: Location of the LMDB environment to read from.

    Raises:
        KeyError: If a key is not present in the environment.
    """
    # readahead should be True if the dataset fits in RAM,
    # otherwise it may be faster to set readahead=False.
    # As readonly=True there is no need for locking, which _should_ take
    # longer if lock=True.
    env = lmdb.open(str(lmdb_path), readonly=True, readahead=True, lock=False)
    # Possible optimization: use a single call to getmulti(keys) instead of
    # a new transaction for every single element.
    try:
        for key in keys:
            with env.begin() as txn:
                byteflow = txn.get(key.encode("utf-8"))
                if byteflow is None:
                    # `get` returns None for unknown keys; fail with the
                    # offending key instead of an opaque deserialization error.
                    raise KeyError(f"Key {key!r} not found in {lmdb_path}")
                BigEarthNet_S2_Patch.loads(byteflow)
    finally:
        # Always release the environment, also when a lookup or load fails.
        env.close()


# Measured: ~4 sec to pass through all 6600 * 2 entries.
read_fake_lmdb(keys=keys)

In [None]:
# Extrapolate the 4.1 s measured for TOTAL_TEST_SIZE entries to the full archive.
interpolate_to_ben_perf(size=TOTAL_TEST_SIZE, seconds=4.1)

In [None]:
import shutil
from pathlib import Path
from bigearthnet_encoder.squirrel_ext import (
    MyMessagePackDriver,
    _patch_interface_to_dict,
)

# Convert the patch object into the structure that is written as a sample
# (presumably a plain dict, judging by the helper's name — TODO confirm).
s2_patch_dict = _patch_interface_to_dict(s2_patch)


def gen_shards():
    """Yield the module-level ``s2_patch_dict`` again and again, without end."""
    # `iter(int, 1)` never terminates: `int()` always returns 0, never the sentinel 1.
    for _ in iter(int, 1):
        yield s2_patch_dict


# `IterableSource` is otherwise only imported by a later cell; importing it
# here lets this cell run on a fresh kernel as well.
from squirrel.iterstream import IterableSource

# A generator is already its own iterator, so no extra `iter()` is needed.
it = IterableSource(gen_shards())
url = "dummy"
# Other supported compressions:
# fsspec.compression.available_compressions()
# [None, 'zip', 'bz2', 'gzip', 'lzma', 'xz']
# NOTE: this is the string "None"; here it is only used to name the output location.
compression = "None"
SAVE_URL = f"{url}_{compression}"
p = Path(SAVE_URL)
# Start from a clean output directory. On the very first run there is nothing
# to delete yet, and an unconditional `rmtree` would raise FileNotFoundError.
if p.exists():
    shutil.rmtree(p)
msgpack_driver = MyMessagePackDriver(SAVE_URL)

In [None]:
# Enable IPython's autoreload extension in mode 2: modules are reloaded before
# each cell execution, so edits to imported project code take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from bigearthnet_encoder.squirrel_ext import _write_s2_msgpack
from bigearthnet_common.example_data import get_s2_example_folder_path
from bigearthnet_common.base import get_s2_patch_directories
from squirrel.iterstream import IterableSource
from pathlib import Path

# Prefer the locally downloaded BigEarthNet archive; fall back to the example
# folder from `bigearthnet_common` when that directory does not exist, so the
# cell also runs on machines without the full dataset.
s2_path = Path("~/datasets/BigEarthNet-v1.0/BigEarthNet-v1.0").expanduser()
if not s2_path.is_dir():
    s2_path = get_s2_example_folder_path()
patch_paths = get_s2_patch_directories(s2_path)

In [None]:
# Keep only the first 100 patch directories so the write step stays small.
patch_paths = patch_paths[:100]

In [None]:
# Write to the same location `msgpack_driver` was created for. Reusing
# SAVE_URL (currently "dummy_None") keeps writer and reader in sync when the
# url or compression setting changes.
_write_s2_msgpack(patch_paths, SAVE_URL)

In [None]:
# Materialize and display all keys currently known to the driver.
[*msgpack_driver.keys()]

In [None]:
# Extrapolate a measurement of 6600 entries in two minutes to the full archive.
interpolate_to_ben_perf(size=6600, seconds=2 * 60)

In [None]:
# from functools import partial
# to_shard = partial(msgpack_driver.store.set, compression=None)
# batches = it.take(TOTAL_TEST_SIZE).batched(SHARD_SIZE, drop_last_if_not_full=False).map(to_shard).join()

In [None]:
# Timing notes from earlier runs:
#   GZIP: 6.5s for pass-through, ~797MB
#   6.8
#   without prefetch_buffer: 6.8s

# Read the samples back using the MessagePack driver and show the first ten.
it_msg_pack = msgpack_driver.get_iter()
for item in it_msg_pack.take(10):
    # np.mean(item["10m_bands"])
    # One print call emitting both fields on separate lines.
    print(item["patch_name"], item["B01"], sep="\n")

In [None]:
# Extrapolate the 0.7 s measured for TOTAL_TEST_SIZE entries to the full archive.
interpolate_to_ben_perf(size=TOTAL_TEST_SIZE, seconds=0.7)