In [1]:
# 10/18/2019 Shawn Jung
# Title: Image 2 LMDB 
# This notebook will convert raw images and labels to LMDB format 
# Inspired by https://realpython.com/storing-images-in-python/ 

In [2]:
# loading library 
import numpy as np 
import pandas as pd
import pickle 
from pathlib import Path
from PIL import Image 
import glob
import re
import lmdb

In [3]:
# Notebook configuration.
# num_images   : how many images (and label rows) to convert
# database_dir : directory under which the LMDB environment is written
# image_dir    : glob pattern (not a plain directory) matching the source PNG files
# image_array  : module-level list that receives one NumPy array per loaded image
num_images = 20
database_dir = Path("./data/")
image_dir = Path("./images/*.png")
image_array = list()



In [4]:
# Collect the PNG paths matched by the glob pattern and order them by the
# number that sits directly before ".png", so "2.png" comes before "10.png"
# (plain string sorting would put "10.png" first).
# The pattern is a raw string: "\." inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
# int() is used for the key so very large numbers keep their exact ordering.
image_filelist = sorted(
    glob.glob(str(image_dir)),
    key=lambda path: int(re.findall(r"([0-9]+?)\.png", path)[0]),
)


In [5]:

def images_2_array(image_filelist, num_images):
    """Load the first ``num_images`` image files into ``image_array``.

    Each file is opened with PIL and converted to a NumPy array. The
    module-level ``image_array`` list is refilled in place (rather than
    appended to), so running this function again does not duplicate
    previously loaded images.

    Args:
        image_filelist: Sequence of image file paths, already in the desired order.
        num_images: Maximum number of files to load from the front of the list.

    Returns:
        The module-level ``image_array`` list, holding one array per loaded image.
    """
    loaded = []
    for name in image_filelist[:num_images]:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(name) as image:
            loaded.append(np.asarray(image))

    # Slice assignment keeps the same list object that other cells reference
    # while replacing whatever an earlier run left in it.
    image_array[:] = loaded
    return image_array


In [6]:
# Fill image_array from the sorted file list, then show the stacked shape
# as a quick sanity check on image count and dimensions.
images_2_array(image_filelist, num_images)
np.asarray(image_array).shape

(20, 32, 32, 3)

In [7]:
# Read the label file: no header row, and only as many rows as images are converted.
image_label = pd.read_csv("image_label.csv", nrows=num_images, header=None)


In [8]:
# Convert the labels read from the CSV into a NumPy array.
image_label = np.array(image_label)

In [9]:
# Display the label array's dimensions.
image_label.shape

(20, 1)

In [10]:
class CIFAR_Image:
    """Picklable container holding one image's raw bytes together with its label.

    The pixel data is stored as bytes; the shape and dtype are kept alongside
    it so ``get_image`` can rebuild the original array.

    Attributes:
        channels: Size of the third axis, or 1 when the array has fewer than
            three dimensions (e.g. a grayscale image).
        size: The first two dimensions of the array.
        shape: Full shape of the source array.
        dtype: NumPy dtype string of the source array (e.g. ``"|u1"``).
        image: The pixel data as bytes.
        label: The label passed in, stored unchanged.
    """

    def __init__(self, image, label):
        # Dimensions of image for reconstruction - not really necessary
        # for this dataset, but some datasets may include images of
        # varying sizes
        self.channels = image.shape[2] if image.ndim > 2 else 1
        self.size = image.shape[:2]
        # Full shape and dtype are recorded so arrays that are not
        # (height, width, channels) uint8 still round-trip correctly.
        self.shape = tuple(image.shape)
        self.dtype = image.dtype.str

        self.image = image.tobytes()
        self.label = label

    def get_image(self):
        """Return the image as a NumPy array with its original shape and dtype.

        The result is a read-only view over the stored bytes. Objects pickled
        before ``shape``/``dtype`` were recorded fall back to uint8 data
        shaped ``(*size, channels)``.
        """
        dtype = np.dtype(getattr(self, "dtype", np.uint8))
        shape = getattr(self, "shape", (*self.size, self.channels))
        return np.frombuffer(self.image, dtype=dtype).reshape(shape)

In [11]:
def store_many_lmdb(images, labels):
    """Write every image/label pair into a new LMDB environment.

    The environment is created at ``database_dir / "lmdb_<count>_images"``,
    where ``<count>`` is the number of images passed in. Each pair is wrapped
    in a ``CIFAR_Image``, pickled, and stored under a zero-padded 8-digit
    ASCII key ("00000000", "00000001", ...). All writes happen in a single
    transaction.

    Args:
        images: Sequence of NumPy arrays (anything exposing ``nbytes``).
        labels: Sequence of labels, one per image, in the same order.

    Raises:
        ValueError: If ``images`` is empty or the number of labels does not
            match the number of images.
    """
    # The count comes from the arguments rather than a notebook-level global,
    # so the function stores exactly what it is given.
    count = len(images)
    if count == 0:
        raise ValueError("store_many_lmdb: no images to store")
    if len(labels) != count:
        raise ValueError(
            f"store_many_lmdb: got {count} images but {len(labels)} labels"
        )

    # Reserve ten times the raw pixel payload; summing per image keeps the
    # estimate valid when images differ in size.
    map_size = sum(image.nbytes for image in images) * 10

    # lmdb.open creates the environment directory itself but not its parents.
    database_dir.mkdir(parents=True, exist_ok=True)

    # create a new db file
    env = lmdb.open(str(database_dir / f"lmdb_{count}_images"), map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            for i, (image, label) in enumerate(zip(images, labels)):
                # LMDB keys and values must be bytes
                value = CIFAR_Image(image, label)
                key = f"{i:08}"
                txn.put(key.encode("ascii"), pickle.dumps(value))
    finally:
        # Close the environment even when a write fails.
        env.close()


In [12]:
# Finally, save the loaded images and their labels to LMDB
store_many_lmdb(image_array, image_label)


