In [1]:
from config import dog_vs_cats_config as config
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pyimagesearch.preprocessing import AspectAwarePreprocessor
from pyimagesearch.io import HDF5DatasetWriter
from imutils import paths
import numpy as np 
import progressbar
import json 
import cv2 
import os

In [2]:
# collect the path of every image found under the configured images directory
trainPaths = list(paths.list_images(config.IMAGES_PATH))

# derive one raw class name per image: take the second-to-last path
# component (the directory that contains the file) and keep the text that
# precedes its first '.'
# NOTE(review): the original note here says index -2 is meant for the
# FloydHub layout and that index -1 (the file name itself) should be used
# otherwise -- confirm which one matches the dataset layout in use
trainLabels = [
    parentName.split('.')[0]
    for parentName in (imagePath.split(os.path.sep)[-2] for imagePath in trainPaths)
]

# encode the class-name strings as integer labels
le = LabelEncoder()
trainLabels = le.fit_transform(trainLabels)

In [3]:
# carve the testing split out of the training data using stratified
# sampling; `test_size` is read from config.NUM_TEST_IMAGES
(trainPaths, testPaths, trainLabels, testLabels) = train_test_split(
    trainPaths,
    trainLabels,
    test_size=config.NUM_TEST_IMAGES,
    stratify=trainLabels,
    random_state=42,
)

# repeat the stratified sampling on what is left of the training data,
# this time to build the validation split (config.NUM_VAL_IMAGES)
(trainPaths, valPaths, trainLabels, valLabels) = train_test_split(
    trainPaths,
    trainLabels,
    test_size=config.NUM_VAL_IMAGES,
    stratify=trainLabels,
    random_state=42,
)

In [4]:
# build one (name, image paths, labels, output HDF5 path) tuple per split
# by zipping the four parallel columns together; the resulting list is
# ordered train, val, test
datasets = list(zip(
    ('train', 'val', 'test'),
    (trainPaths, valPaths, testPaths),
    (trainLabels, valLabels, testLabels),
    (config.TRAIN_HDF5, config.VAL_HDF5, config.TEST_HDF5),
))

# image preprocessor constructed with (256, 256)
aap = AspectAwarePreprocessor(256, 256)

# per-image channel averages, filled in while the training set is written
R, G, B = [], [], []

In [5]:
# Build one HDF5 file per dataset split and, for the training split only,
# collect the per-image mean of each colour channel in R, G and B.

# start from empty accumulators so that re-running this cell does not
# append the training-set means a second time
R, G, B = [], [], []

# loop over the dataset tuples
# NOTE: the loop variable is intentionally named `imagePaths`, not `paths`;
# binding `paths` here would overwrite the `imutils.paths` module imported
# at the top of the notebook and break any later call to paths.list_images
for (dType, imagePaths, labels, outputPath) in datasets:
    # create HDF5 writer sized for every image of this split
    print("[INFO] building {}...".format(outputPath))
    writer = HDF5DatasetWriter((len(imagePaths), 256, 256, 3), outputPath)

    # init the progress bar
    widgets = ["Building Dataset: ", progressbar.Percentage(), " ",
               progressbar.Bar(), " ", progressbar.ETA()]
    pbar = progressbar.ProgressBar(maxval=len(imagePaths), widgets=widgets).start()

    try:
        # loop over the image paths together with their labels
        for (i, (path, label)) in enumerate(zip(imagePaths, labels)):
            # load the image; cv2.imread signals an unreadable or missing
            # file by returning None instead of raising, so fail here with
            # the offending path rather than later with an obscure error
            image = cv2.imread(path)
            if image is None:
                raise IOError("could not read image: {}".format(path))

            # run the image through the preprocessor
            image = aap.preprocess(image)

            # if we are building the training dataset, compute the mean of
            # each channel in the image and update the respective lists;
            # the first three values of cv2.mean are unpacked as (b, g, r)
            if dType == "train":
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            # add the image and label to the HDF5 dataset
            writer.add([image], [label])
            pbar.update(i)

        pbar.finish()
    finally:
        # always close the HDF5 writer, even if an image failed part-way,
        # so the output file handle is not left open in the kernel
        writer.close()
    


Building Dataset:   0% |                                       | ETA:  --:--:--

[INFO] building HDF5/train.hdf5...


Building Dataset: 100% |########################################| Time: 0:02:58
Building Dataset:   0% |                                       | ETA:  --:--:--

[INFO] building HDF5/val.hdf5...


Building Dataset: 100% |########################################| Time: 0:00:22
Building Dataset:   0% |                                       | ETA:  --:--:--

[INFO] building HDF5/test.hdf5...


Building Dataset: 100% |########################################| Time: 0:00:23


In [6]:
# construct a dictionary holding the mean of each collected channel list,
# then serialize it as JSON to the path given by config.DATASET_MEAN
print("[INFO] serializing image...")
D = {"R": np.mean(R), "G": np.mean(G), "B": np.mean(B)}

# the context manager closes the file even if json.dumps or the write
# raises, which the manual open()/close() pair did not guarantee
with open(config.DATASET_MEAN, 'w') as f:
    f.write(json.dumps(D))

[INFO] serializing image...
