In [1]:
import os
import glob
import lmdb
import pickle
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import math

In [2]:
# Folder holding the source images; only *.jpg files directly inside it are collected.
image_dir = 'D:/header-image'
jpg_pattern = os.path.join(image_dir, '*.jpg')
images = glob.glob(jpg_pattern)

In [3]:
# Each image is named "<item id>.jpg": take the file name without its
# extension, parse it as an integer, and keep the ids in ascending order.
item_ids = sorted(
    int(os.path.splitext(os.path.basename(path))[0])
    for path in images
)

In [4]:
# Bucket the item ids into fixed-width id ranges: groups[k] holds every id in
# [k * step, (k + 1) * step), in the same order as they appear in item_ids.
step = 500000

# The highest bucket index needed is max(item_ids) // step, so the list needs
# one more slot than that. ceil(max / step) is one short whenever the largest
# id is an exact multiple of step (or is 0), which would make the append below
# raise IndexError. An empty id list simply produces no groups.
n_groups = (max(item_ids) // step + 1) if item_ids else 0
groups = [[] for _ in range(n_groups)]

for item_id in item_ids:
    # Integer floor division: no float rounding, even for very large ids.
    group_idx = item_id // step
    groups[group_idx].append(item_id)

In [8]:
# Per-image transform: turn the PIL image into a tensor, then normalise each
# of the three channels with the mean/std values given here.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# LMDB map space reserved per image: 600 KiB, doubled.
# NOTE(review): presumably sized for one pickled float32 image tensor plus
# headroom; no resize is applied, so larger images could exhaust the map -- confirm.
bytes_per_image = 600 * 1024 * 2

# Write one LMDB environment ("shard") per id group, at
# D:/image-tensors/<group_idx>.lmdb. Each image is stored under its id
# (decimal string, encoded to bytes) as a pickled, preprocessed tensor.
for group_idx, group in enumerate(groups):
    map_size = len(group) * bytes_per_image
    shard_dir = os.path.join('D:/image-tensors', f'{group_idx}.lmdb')
    os.makedirs(shard_dir, exist_ok=True)
    env = lmdb.open(shard_dir, map_size=map_size)

    # Close the environment even when an image fails to load or a put fails,
    # so a failed run does not leave the shard open inside the kernel.
    try:
        # All images of a shard are written in a single write transaction.
        with env.begin(write=True) as txn:
            for image_id in tqdm(group, desc="Writing to LMDB"):
                image_path = os.path.join(image_dir, str(image_id) + '.jpg')
                # convert() returns a new, fully loaded image, so the source
                # file can be closed as soon as the conversion is done.
                with Image.open(image_path) as source_image:
                    img = source_image.convert('RGB')
                img_tensor = preprocess(img)
                # Values are pickles: readers must only unpickle shards that
                # come from a trusted source.
                serialized_tensor = pickle.dumps(img_tensor)

                txn.put(str(image_id).encode(), serialized_tensor)

        # Shard metadata, written in a second transaction after the images:
        # '__len__' is the image count as a decimal string, '__keys__' is the
        # pickled list of integer ids (the per-image keys are those ids as strings).
        with env.begin(write=True) as txn:
            txn.put(b'__len__', str(len(group)).encode())
            txn.put(b'__keys__', pickle.dumps(group))
    finally:
        env.close()
print("LMDB creation completed")


Writing to LMDB: 100%|██████████| 10001/10001 [00:20<00:00, 487.54it/s]
Writing to LMDB: 100%|██████████| 18061/18061 [03:07<00:00, 96.07it/s] 
Writing to LMDB: 100%|██████████| 17688/17688 [03:08<00:00, 93.82it/s] 
Writing to LMDB: 100%|██████████| 17710/17710 [03:14<00:00, 91.25it/s]
Writing to LMDB: 100%|██████████| 12515/12515 [02:15<00:00, 92.57it/s]
Writing to LMDB: 100%|██████████| 13093/13093 [02:24<00:00, 90.60it/s]
Writing to LMDB: 100%|██████████| 8310/8310 [01:10<00:00, 118.34it/s]


LMDB creation completed
