In [1]:
# 10/18/2019 Shawn Jung
# Title: Image 2 HDF5 
# This notebook will convert raw images and labels to HDF5 format 
# Inspired by https://realpython.com/storing-images-in-python/

In [2]:
# loading library 
import numpy as np 
import pandas as pd
import pickle 
from pathlib import Path
from PIL import Image 
import glob
import re
import h5py

In [3]:
# Notebook configuration, read by the cells below:
#   image_dir    - glob pattern (not a plain directory) matching the source PNG files
#   database_dir - directory the HDF5 file is written into
#   num_images   - upper bound on how many images / label rows are loaded
#   image_array  - list that images_2_array() fills with one array per image
image_dir = Path("./images/*.png")
database_dir = Path("./data/")
num_images = 20000 
image_array = [] 


In [4]:
# Collect the image paths and order them by the number directly in front of
# ".png", so that e.g. 10.png sorts after 9.png (plain string order would not).
# The pattern is a raw string: "\." in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
image_filelist = sorted(
    glob.glob(str(image_dir)),
    key=lambda x: float(re.findall(r"([0-9]+?)\.png", x)[0]),
)


In [5]:
def images_2_array(image_filelist, num_images):
    """Load images into the module-level ``image_array`` list.

    Args:
        image_filelist: sequence of image file paths, in the order to load.
        num_images: maximum number of files to read from the start of
            ``image_filelist``.

    Returns:
        The module-level ``image_array`` list, holding one NumPy array per
        loaded image (also returned for convenience; existing callers that
        ignore the return value are unaffected).
    """
    # Reset in place so that re-running the calling cell does not append the
    # same images a second time.
    image_array.clear()
    for name in image_filelist[0:num_images]:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the array, instead of leaking the handle.
        with Image.open(name) as image:
            image_array.append(np.asarray(image))
    return image_array
        

In [6]:
# Fill image_array from disk, then display the dimensions of the stacked
# result as a quick sanity check.
images_2_array(image_filelist, num_images)
np.asarray(image_array).shape

(20, 32, 32, 3)

In [None]:
# NOTE: this recomputes the same numerically sorted file list that an earlier
# cell already builds; it is kept so this cell still works when run on its own.
# The pattern is a raw string: "\." in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
image_filelist = sorted(
    glob.glob(str(image_dir)),
    key=lambda x: float(re.findall(r"([0-9]+?)\.png", x)[0]),
)


In [9]:
# Read at most num_images rows from the header-less label file, convert the
# table to a NumPy array, and display its dimensions.
label_frame = pd.read_csv("image_label.csv", header=None, nrows=num_images)
image_label = np.array(label_frame)
image_label.shape

(20, 1)

In [11]:
class CIFAR_Image:
    """One image stored as raw bytes together with its label.

    The pixel data is kept as a flat byte string; the height/width, channel
    count and element type are stored alongside it so that ``get_image`` can
    rebuild the original array.

    Args:
        image: 3-D NumPy array laid out as (height, width, channels).
        label: label associated with the image; stored unchanged.
    """

    def __init__(self, image, label):
        # Dimensions of image for reconstruction - not really necessary
        # for this dataset, but some datasets may include images of
        # varying sizes
        self.channels = image.shape[2]
        self.size = image.shape[:2]
        # Element type of the source array; without it the raw bytes can only
        # be decoded correctly when the image happens to be uint8.
        self.dtype = image.dtype

        self.image = image.tobytes()
        self.label = label

    def get_image(self):
        """ Returns the image as a numpy array. """
        # Objects that lack the `dtype` attribute (e.g. restored from an older
        # serialised form) are decoded as uint8, the previous fixed behaviour.
        dtype = getattr(self, "dtype", np.uint8)
        image = np.frombuffer(self.image, dtype=dtype)
        return image.reshape(*self.size, self.channels)

In [17]:
def store_many_hdf5(images, labels):
    """Write a batch of images and their labels into one HDF5 file.

    The file is created (overwriting any existing one) at
    ``database_dir / "hdf5_<N>_images.h5"``, where ``N`` is ``len(images)``.
    It contains two datasets, both stored as unsigned 8-bit integers:

    * ``"images"`` - the image data, shaped like ``np.shape(images)``
    * ``"meta"``   - the labels, shaped like ``np.shape(labels)``

    Args:
        images: sequence/array of images to store.
        labels: sequence/array of labels to store alongside the images.
    """
    num_images = len(images)

    # Opening a file in write mode does not create missing parent directories,
    # so make sure the target directory exists first.
    database_dir.mkdir(parents=True, exist_ok=True)

    # The context manager guarantees the file is closed even if creating one
    # of the datasets raises.
    with h5py.File(database_dir / f"hdf5_{num_images}_images.h5", "w") as file:
        file.create_dataset(
            "images", np.shape(images), h5py.h5t.STD_U8BE, data=images
        )
        file.create_dataset(
            "meta", np.shape(labels), h5py.h5t.STD_U8BE, data=labels
        )



In [18]:
# finally, let's save the images and labels to an HDF5 file
store_many_hdf5(image_array, image_label)
