In [1]:
import os
import lmdb
from PIL import Image
import numpy as np
import io

In [6]:
def jpg_to_lmdb(input_dir, db_name, map_size=26106127360):
    """Pack every image file found directly in ``input_dir`` into an LMDB database.

    Each regular file in ``input_dir`` is opened with PIL, converted to a
    NumPy array, serialized in ``.npy`` format with ``np.save`` and stored
    under a key equal to the UTF-8 encoded file name. All records are written
    inside a single write transaction.

    Args:
        input_dir: Directory whose files are to be stored. Sub-directories
            are skipped; they are not traversed.
        db_name: Path of the LMDB environment (a directory). It is created,
            together with any missing parent directories, if absent.
        map_size: Maximum size in bytes the LMDB memory map may grow to.
            Defaults to the value previously hard-coded here (about 24 GiB).

    Raises:
        Whatever ``PIL.Image.open`` raises for a file it cannot decode, and
        whatever ``lmdb`` raises (for example when ``map_size`` is exceeded).
        The environment is closed in either case.
    """
    # Do not rely on lmdb.open() to create missing parent directories.
    os.makedirs(db_name, exist_ok=True)

    env = lmdb.open(db_name, map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            for filename in os.listdir(input_dir):
                path = os.path.join(input_dir, filename)

                # Image.open() cannot handle directories; ignore them.
                if not os.path.isfile(path):
                    continue

                # The context manager releases the underlying file handle
                # as soon as the pixel data has been copied into the array.
                with Image.open(path) as img:
                    img_array = np.array(img)

                # Serialize as .npy so shape and dtype survive the round trip.
                buffer = io.BytesIO()
                np.save(buffer, img_array)

                txn.put(filename.encode(), buffer.getvalue())
    finally:
        # Always release the environment, even when an image fails to load;
        # a leaked environment blocks re-opening the same path in this kernel.
        env.close()

def read_images_from_lmdb(db_path):
    """Display every image stored in the LMDB database at ``db_path``.

    Each value is expected to be a ``.npy`` blob as produced by ``np.save``;
    it is decoded with ``np.load``, turned into a PIL image and shown with
    ``Image.show()``. One viewer call is made per record.

    Args:
        db_path: Path of an existing LMDB environment.

    Returns:
        None.

    Raises:
        Any error from ``np.load`` or ``Image.fromarray`` for a value that is
        not a decodable image array. The environment is closed regardless.
    """
    env = lmdb.open(db_path, readonly=True)
    try:
        with env.begin() as txn:
            # Iterating a cursor yields (key, value) pairs; keys are unused.
            for _key, value in txn.cursor():
                img_array = np.load(io.BytesIO(value))
                Image.fromarray(img_array).show()
    finally:
        # Release the environment even if a record fails to decode, so the
        # same path can be re-opened later in this kernel.
        env.close()
    
def compute_database_size(db_path):
    """Return the combined length in bytes of all values in an LMDB database.

    Only the stored values are measured: key lengths are not included, and
    the result is a sum of payload sizes rather than the size of the
    database files on disk.

    Args:
        db_path: Path of an existing LMDB environment.

    Returns:
        int: Sum of ``len(value)`` over every record; 0 for an empty database.
    """
    env = lmdb.open(db_path, readonly=True)
    try:
        with env.begin() as txn:
            # Iterating a cursor yields (key, value) pairs; keys are unused.
            return sum(len(value) for _key, value in txn.cursor())
    finally:
        # Runs on both the normal return and on error, so the environment
        # is never left open in the kernel.
        env.close()

In [None]:
# Source directory of images and destination path of the LMDB environment.
input_dir = 'data/train_small'
db_path = 'data/train_small_lmdb/train_small.lmdb'

# Make sure the directory that will contain the LMDB environment exists
# before the database is opened.
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# Build the database. The helper is named `jpg_to_lmdb`; the previous call
# to `jpgs_to_lmdb` raised NameError on a fresh kernel.
jpg_to_lmdb(input_dir, db_path)

# Round-trip check: opens one viewer window per stored image.
read_images_from_lmdb(db_path)

# Sum of the stored value sizes (not the on-disk size of the database files).
total_size = compute_database_size(db_path)

print(f"Total space used by the database: {total_size} bytes")