In [22]:
import tiledb
import os
import shutil
import numpy as np

# Directory (relative to the working directory) that will hold the TileDB array.
array_uri = "tiledb_blob_storage_array"

# Image files whose raw bytes are stored in the array, one file per cell.
image_paths = [
    "data/example.jpg",
    "data/example2.jpg",
    "data/example3.jpg",
]

# Drop any array left over from a previous run so the notebook starts clean.
if os.path.exists(array_uri):
    shutil.rmtree(array_uri)

In [23]:
# Create one integer dimension: coordinates 0..2, i.e. one cell per image.
d1 = tiledb.Dim(name="d1", domain=(0, 2), tile=2, dtype=np.int32)

dom = tiledb.Domain(d1)

# This attribute accepts variable-length byte strings (the raw file contents).
a = tiledb.Attr(name="a", dtype=np.bytes_)

# Create the array schema with `sparse=True`. The write cell addresses cells
# through an explicit coordinate array (`A[d1_data] = ...`), which is the
# sparse-write form, so the schema must be sparse to match it.
sch = tiledb.ArraySchema(domain=dom, sparse=True, attrs=[a])

# Create the array on disk (it will initially be empty)
tiledb.Array.create(array_uri, sch)

In [24]:
# Load every image into memory up front so the array only has to be opened
# for writing once (a single commit).
a_list = []
for image_path in image_paths:
    with open(image_path, "rb") as f:
        a_list.append(f.read())

# The coordinate of each blob along d1 is its position in `image_paths`.
d1_list = list(range(len(a_list)))

d1_data = np.asarray(d1_list, dtype=np.int32)
# Object dtype keeps each element as its own bytes object, so lengths may differ.
a_data = np.array(a_list, dtype=object)

# One write: coordinates on the left, attribute values keyed by name on the right.
with tiledb.open(array_uri, mode='w') as A:
    A[d1_data] = {"a": a_data}

In [25]:
# Re-open the array for reading and pull back the values of attribute "a".
with tiledb.open(array_uri, mode="r") as A:
    all_cells = A[:]
    data = all_cells["a"]

In [26]:
# Display the values read back for attribute "a" (the raw bytes of each stored image).
data

array([b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xdb\x00C\x01\x03\x03\x03\x04\x03\x04\x08\x04\x04\x08\x10\x0b\t\x0b\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\xff\xc0\x00\x11\x08\x00\x1b\x00\x1d\x03\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x18\x00\x00\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x05\x06\x07\x03\xff\xc4\x00,\x10\x00\x02\x01\x03\x03\x03\x02\x04\x07\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x11\x06\x12!\x00\x131"Q\x07\x14Aa\x08%2B\x81\x92\xf1\xff\xc4\x00\x18\x01\x00\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x0

In [27]:
# Sanity check: report the byte length of every blob read back from the array.
for position, blob in enumerate(data):
    print(f'Image {position} size: {len(blob)} bytes')

Image 0 size: 1241 bytes
Image 1 size: 5416 bytes
Image 2 size: 5044 bytes


This results in the following directory tree:
```bash
tiledb_blob_storage_array/
├── __commits
│   └── __1761231023619_1761231023619_622fccca9c9764f618c8251382530537_22.wrt
├── __fragment_meta
├── __fragments
│   └── __1761231023619_1761231023619_622fccca9c9764f618c8251382530537_22
│       ├── a0.tdb
│       ├── a0_var.tdb
│       ├── d0.tdb
│       └── __fragment_metadata.tdb
├── __labels
├── __meta
└── __schema
    ├── __1761231022144_1761231022144_00000002d95a6f8db114e47561e521a8
    └── __enumerations
```

Note that there is only one commit and thus only one fragment.

The storage footprint of `tiledb_blob_storage_array` is 124 KB, versus 48 KB for the three raw files combined on disk. This is due to TileDB's overhead for managing the array structure and metadata. I suspect that this overhead will be minimal when storing batches of many (and larger) patches.

In [28]:
# Clean up: delete the demo array directory created by this notebook.
shutil.rmtree(array_uri)