In [50]:
from random import shuffle
import glob
from pathlib import Path

import numpy as np
import tables

import cv2

## list images and label them

In [32]:
# Configuration for building the HDF5 dataset.
# NOTE(review): no cell visible in this notebook reads `shuffle_data` - confirm whether shuffling was intended.
shuffle_data = False
data_dir = Path.home().joinpath('data', 'animals')  # root folder of the dataset
hdf5_path = data_dir.joinpath('dataset.hdf5')  # where the HDF5 file will be written
cat_dog_train_path = data_dir.joinpath('train')  # folder that holds the training images

In [33]:
# display both directories to confirm they resolve as expected
data_dir, cat_dog_train_path

(PosixPath('/Users/ilyarudyak/data/animals'),
 PosixPath('/Users/ilyarudyak/data/animals/train'))

In [34]:
# collect every entry directly inside the training folder, in the order glob yields them
filenames = list(cat_dog_train_path.glob('*'))

In [35]:
# number of paths found in the training folder
len(filenames)

25000

In [36]:
# peek at the first few paths
filenames[:5]

[PosixPath('/Users/ilyarudyak/data/animals/train/dog.8011.jpg'),
 PosixPath('/Users/ilyarudyak/data/animals/train/cat.5077.jpg'),
 PosixPath('/Users/ilyarudyak/data/animals/train/dog.7322.jpg'),
 PosixPath('/Users/ilyarudyak/data/animals/train/cat.2718.jpg'),
 PosixPath('/Users/ilyarudyak/data/animals/train/cat.10151.jpg')]

In [52]:
# string form of a path, as later passed to cv2.imread
str(filenames[0])

'/Users/ilyarudyak/data/animals/train/dog.8011.jpg'

In [37]:
# Label each image from its file name only: 0 when the name contains 'cat', 1 otherwise.
# Matching against the whole path would label every image 0 whenever a parent
# directory (e.g. the user's home folder) happens to contain "cat".
labels = [0 if 'cat' in f.name else 1 for f in filenames]

In [38]:
# first few labels, in the same order as filenames[:5]
labels[:5]

[1, 0, 1, 0, 0]

In [39]:
# one label per entry of `filenames`
len(labels)

25000

In [40]:
# divide the data into 60% train, 20% validation, and 20% test
# (slices are taken in the existing order of `filenames` / `labels`)
n_files = len(filenames)
train_end = int(0.6 * n_files)  # index where the train part stops
val_end = int(0.8 * n_files)    # index where the validation part stops

train_filenames, train_labels = filenames[:train_end], labels[:train_end]
val_filenames, val_labels = filenames[train_end:val_end], labels[train_end:val_end]
test_filenames, test_labels = filenames[val_end:], labels[val_end:]

## create a `HDF5` file

In [42]:
# element type used for the stored image arrays: unsigned 8-bit integers
img_dtype = tables.UInt8Atom()

In [43]:
# shape passed to create_earray: leading 0 is the axis images are appended along,
# followed by the 224x224x3 size every image is resized/converted to below
data_shape = (0, 224, 224, 3)

In [44]:
# open the output file for writing; it stays open until the explicit close() after the mean is stored
hdf5_file = tables.open_file(hdf5_path, mode='w')

In [45]:
# Extendable arrays for the images of each split; images are appended along the
# leading (zero-length) dimension of `data_shape`.
train_storage = hdf5_file.create_earray(hdf5_file.root, 'train_img', img_dtype, shape=data_shape)
val_storage = hdf5_file.create_earray(hdf5_file.root, 'val_img', img_dtype, shape=data_shape)
test_storage = hdf5_file.create_earray(hdf5_file.root, 'test_img', img_dtype, shape=data_shape)

# The mean image is accumulated as float32, so it needs a float atom: with the
# 8-bit integer atom used for the images its fractional part would be dropped
# when the value is appended.
mean_storage = hdf5_file.create_earray(hdf5_file.root, 'train_mean', tables.Float32Atom(), shape=data_shape)

In [46]:
# store the label list of each split as a fixed-size array under the file root;
# the node returned by the last call is displayed as the cell output
hdf5_file.create_array(where=hdf5_file.root, name='train_labels', obj=train_labels)
hdf5_file.create_array(where=hdf5_file.root, name='val_labels', obj=val_labels)
hdf5_file.create_array(where=hdf5_file.root, name='test_labels', obj=test_labels)

/test_labels (Array(5000,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'python'
  byteorder := 'little'
  chunkshape := None

## load images and save them

In [47]:
# float32 accumulator for the mean training image (same shape as one image)
mean = np.zeros(shape=data_shape[1:], dtype=np.float32)

In [53]:
# Load every training image, store it, and accumulate the mean image.
for i, filename in enumerate(train_filenames):
    # report progress every 5000 images
    if i > 0 and i % 5000 == 0:
        print('Train data: {}/{}'.format(i, len(train_filenames)))

    # cv2.imread signals failure by returning None instead of raising, so stop
    # here with the offending path rather than failing later inside cv2.resize
    img = cv2.imread(str(filename))
    if img is None:
        raise IOError('Could not read image: {}'.format(filename))

    # resize to (224, 224); cv2 loads images as BGR, so convert to RGB
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # add any image pre-processing here

    # save the image and add its share to the running mean
    train_storage.append(img[None])
    mean += img / float(len(train_labels))

Train data: 5000/15000
Train data: 10000/15000


In [56]:
# Load every validation image and store it.
for i, filename in enumerate(val_filenames):
    # report progress every 1000 images
    if i > 0 and i % 1000 == 0:
        print('Validation data: {}/{}'.format(i, len(val_filenames)))

    # cv2.imread signals failure by returning None instead of raising, so stop
    # here with the offending path rather than failing later inside cv2.resize
    img = cv2.imread(str(filename))
    if img is None:
        raise IOError('Could not read image: {}'.format(filename))

    # resize to (224, 224); cv2 loads images as BGR, so convert to RGB
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # add any image pre-processing here

    # save the image
    val_storage.append(img[None])

Validation data: 1000/5000
Validation data: 2000/5000
Validation data: 3000/5000
Validation data: 4000/5000


In [59]:
# Load every test image and store it.
for i, filename in enumerate(test_filenames):
    # report progress every 1000 images
    if i > 0 and i % 1000 == 0:
        print('Test data: {}/{}'.format(i, len(test_filenames)))

    # cv2.imread signals failure by returning None instead of raising, so stop
    # here with the offending path rather than failing later inside cv2.resize
    img = cv2.imread(str(filename))
    if img is None:
        raise IOError('Could not read image: {}'.format(filename))

    # resize to (224, 224); cv2 loads images as BGR, so convert to RGB
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # add any image pre-processing here

    # save the image
    test_storage.append(img[None])

Test data: 1000/5000
Test data: 2000/5000
Test data: 3000/5000
Test data: 4000/5000


In [60]:
# write the mean image as a single entry (leading axis added), then close the file
mean_row = mean[np.newaxis]
mean_storage.append(mean_row)
hdf5_file.close()

## read `hdf5` file

In [61]:
# reopen the file that was just written, this time read-only (rebinds `hdf5_file`)
hdf5_file = tables.open_file(hdf5_path, mode='r')

In [62]:
# number of stored training images = length of the leading axis of train_img
data_num = hdf5_file.root.train_img.shape[0]

In [63]:
# display the count read back from the file
data_num

15000