In [None]:
# Copyright 2018 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# OpenCV Data Preprocessing of Images for DNN/CNN

This 'ML pipeline' module uses openCV in Python to preprocess images for a Deep Neural Network (DNN) or Convolutional Neural Network (CNN).

*Prerequisites*

1. Images must be grayscale (single channel), RGB or BGR (three channel color) or RGBA (+alpha channel).
2. Images must be 8 bits per pixel (bpp) per channel, and all images must have the same bits per pixel.
3. Images may be a mix of size and colorspace.
4. Each image must have a single classification (i.e., label)
5. Images may be in JPG, BMP, PNG, TIF formats.
6. Images must be read in from disk.
7. You must have enough memory to load all the images

*Not Supported*

1. 12 bits per pixel (such as electron microscopy), and 16 bits per pixel (high color range ~ gamut).
2. CMYK colorspace

*Image Preprocessing*
1. Conversion: Select Color vs. Gray Scale  
2. Resize: Select Size  
3. Normalization: Select Normalization method  
4. Output:  
    A) Select vector format (1D -> DNN, 2D -> CNN)   
    B) Select data type (float16 or float32)

### Imports

This module uses the following libraries:

        numpy           - in-memory arrays
        cv2             - image manipulation (OpenCV Python bindings)
        multiprocessing - concurrent processing
    

In [None]:
import numpy as np
import cv2
import multiprocessing as mp

## Support Functions

This module has the following support functions:

`image_input()`     -  Read image in from disk, perform any colorspace conversion.  
`image_resize()`    -  Perform any resizing (downsampling) of image.  
`image_normalization()` -  Perform normalization of pixel values.  
`load_files()`      -  Perform processing of images for a collection (same label).  
`load_directory()`  -  Perform processing of collections (dataset) laid out in a directory/subdirectory structure.

### Image Input and Conversion (image_input)

1. Read image in from a file into raw pixel format using openCV.
2. Convert to either color or grayscale when read in.

A color image consists of three or more channels (i.e., color planes), while a grayscale image consists of a single channel. Each additional channel requires more in-memory space, a larger input vector and neural network, and more time to train. For example, a 100x100 (height x width) grayscale image when decompressed into raw pixel data is 10,000 bytes (at 8 bits per pixel). The same image in color, which has three 100x100 color planes, is 30,000 bytes when decompressed into raw pixel data.

OpenCV uses the CCIR 601 formula for converting RGB to grayscale:

(0.299 * Red) + (0.587 * Green) + (0.114 * Blue)

*Best Practice*

Grayscale is sufficient for training recognition of objects which are 2D in nature, such as handwriting and sketches. Typically, color, texture, etc. do not contribute to the human or machine identification of such objects. These objects are generally recognized by their shape (edge detection).

All other forms of object classification tend to do better with color.

In [None]:
# OpenCV imread() flags selecting the colorspace an image is converted to when it is read in.
GRAYSCALE = cv2.IMREAD_GRAYSCALE
COLOR     = cv2.IMREAD_COLOR

def image_input(file, colorspace=COLOR):
    """ Read an image in from disk and convert to specified color space.
    Args:
        file      : (str) file path to the image.
        colorspace: (int) the openCV flag for colorspace conversion (GRAYSCALE or COLOR).

    Returns:
        Uncompressed 'color converted' raw pixel data as numpy array

    Raises:
        Exception: could not read in image (missing, unreadable or undecodable file).
    """
    try:
        image = cv2.imread(file, colorspace)
    except Exception as e:
        raise Exception('image_input(): could not read in image: ' + str(file)) from e

    # cv2.imread() reports a file it cannot read by returning None instead of raising,
    # so the failure has to be detected explicitly to honor the documented contract.
    if image is None:
        raise Exception('image_input(): could not read in image: ' + str(file))
    return image

### Resize (image_resize)

Resize performs downsampling from a higher resolution to a lower resolution image. At a lower resolution, the input vector is smaller and the number of neurons/layers needed is less. 

*Best Practice*

At high resolutions, images have more pixel information than what's needed to train a model. The more pixel data that is retained, the bigger the input vector, memory footprint and time needed to train. Best practice is to determine the minimum size of the input vector to get the desired result, and resize the images accordingly.

When downsampling, cv2.INTER_AREA and cv2.INTER_NEAREST methods produce the fewest artifacts.
When upsampling, cv2.INTER_LINEAR, cv2.INTER_CUBIC and cv2.INTER_LANCZOS4 methods will produce the smoothest edges.

In [None]:
def image_resize(image, resize=(128,128), flatten=False):
    """ Resize (downsample) an image for the target neural network
    Args:
        image  : (numpy) an image in raw pixel data
        resize : (tuple(int, int)) the new size of the image specified as (height, width)
        flatten: (bool) when True, return the resized pixel data flattened into a 1D vector
    Returns:
        The resized image as raw pixel data as numpy array
        
    Raises:
        Exception: Could not resize the image.
    """
    try:
        # resize must unpack into exactly two values, i.e., (H, W)
        height, width = resize
        # cv2.resize() expects its target size as (width, height), which is the reverse
        # of the (height, width) order this function documents.
        resized = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
    except Exception as e:
        raise Exception('image_resize(): could not resize image to: ' + str(resize)) from e

    return resized.flatten() if flatten else resized

### Normalization

Normalizing pixel data is used to smooth out the dynamic range of pixel information, which lowers noise and generally decreases the time that it takes a neural network to converge to a desired accuracy.

There are three common techniques for pixel normalization:

    Normalization between 0 and 1  : (x - x.min()) / (x.max() - x.min())         
    Normalization between -1 and 1 : 2*(x - x.min()) / (x.max() - x.min()) - 1    
    Standardization, with mean at 0: (x - x.mean()) / x.std()                      
    
The first two normalization methods above can be performed fast, while the third (standardization) requires more compute time.

*Best Practice*  

Normalization using the 0..1 or -1..1 range is generally sufficient for grayscale images. Color images tend to have a far more dynamic range, and standardization generally produces a better result.

To achieve best performance, if there is sufficient memory, the normalization step should be applied as a one-time operation across all the images, vs. one at a time. Generally, numpy multi-dimensional arrays are used for in-memory array of matrices (continuous bytes), in which byte access and matrix operations will be the most efficient.

### Input Vector

Neural networks such as a DNN or FCNN take input as a 1D vector (sometimes referred to as flat). A CNN takes as input a matrix (i.e., multi-dimensional tensor). Additionally, the data type of the pixel information will affect the in-memory space and compute time when training weights. Typically, the datatype is float32 (4 bytes per pixel).

In principle, float16 (i.e., half float) would be preferable in that it reduces the memory space by 50% and, if there is hardware support for native half float matrix operations, the compute time is reduced by approximately 75%.

During backward propagation, tiny numbers (less than one) are multiplied by each other. As these multiplications are propagated, the result can reach a point where the number is so small that the hardware cannot represent it anymore. This is known as the vanishing gradient.

*Best Practice*

A half float should only be used if either the number of layers is very small, or the hardware supports stochastic gradient rounding. In the latter case, the hardware will detect dot product matrix operations that would result in zero (vanishing gradient) and replace the result with a random tiny value (as in some NVIDIA GPUs).

In [None]:
# Flags selecting the normalization method applied by image_normalization()
NORMAL_0_1  = 0   # scale pixel values into the range 0 .. 1
NORMAL_N1_1 = 1   # scale pixel values into the range -1 .. 1
STANDARD    = 2   # standardize: mean of 0, standard deviation of 1

def image_normalization(images, normal=NORMAL_0_1, datatype=np.float32):
    """ Normalize the pixel values of a collection of images.
    Args:
        images  : (numpy) collection of images in raw pixel data as a numpy array of matrices (each corresponding to an image)
        normal  : (int) flag for selecting the normalization method (NORMAL_0_1, NORMAL_N1_1 or STANDARD)
        datatype: (type) the datatype to convert the raw pixel data to
    Returns:
        A collection of normalized images as a numpy array of matrices (each corresponding to an image).
        The input collection is not modified.
    Raises:
        ValueError: Invalid value for normal.
    """
    # astype() makes a copy, so the caller's raw pixel data is left untouched
    images = np.asarray(images).astype(datatype)

    # This normalizes (scales 8-bit pixel values) into the range 0 .. 1
    if normal == NORMAL_0_1:
        images /= 255.0
    # This normalizes (scales 8-bit pixel values) into the range -1 .. 1
    elif normal == NORMAL_N1_1:
        images = images / 127.5 - 1
    # This uses standardization, where pixels are scaled to a mean of 0 and standard deviation of 1
    elif normal == STANDARD:
        mean = images.mean()
        std = images.std()
        images = images - mean
        # A constant-valued collection has a standard deviation of zero; leave the centered
        # (all zero) values as-is instead of dividing by zero and producing NaN.
        if std > 0:
            images = images / std
    else:
        raise ValueError('normalization(): invalid parameter for normal: ' + str(normal))
    return images

## Loader

The load_directory() routine loads a collection of images from disk to be preprocessed for training a neural network. There are many possible directory/file layout for training images. This module supports a popular and commonly seen layout as follows:

                                    root_dir  
                                    /   |   \
                                        V
                                    
            subdir_class1          subdir_class2 ...              subdir_classN
              /  |  \
                 V
        image1 image2 ... imageN
        
In this layout, the toplevel (root directory) is the parent of the collection. Underneath it are a plurality of subdirectories. Each subdirectory represents a unique class (label) of images. For example, if the collection was for cats and dogs, one subdirectory would be for 'cats' and the other for 'dogs'. Under each subdirectory are the images that correspond to the subdirectory's class.


The load_directory() module performs the following steps:
    1. Takes as input the root (parent) directory and verifies the layout.
    2. For each subdirectory (class), process the group of images under the subdirectory.
    3. For each subdirectory (class), assemble a prepared dataset (processed image data and labels) for a neural network.
    
*Best Practices*

For balancing, each class (subdirectory) should have roughly an equal number of images. If unbalanced, a bias may be introduced into the trained model. For example, if 90% of the images are cats and 10% are dogs, the trained model will likely predict all dogs as cats.

Angle, lighting and perspective may be important depending on the deployed application. While image augmentation is popular, its incorrect usage can lead to deployed models incorrectly identifying objects (false positives). For example, if the deployed model is for a fixed-position camera overlooking a conveyor belt, and the training included perspective changes, wide lighting variance, hue variance, and graininess, the model may inadvertently learn 'noise' as part of the identification. In the deployed version, toss something on the conveyor belt that does not belong there and the model may misidentify it (false positive).

While these techniques prevent overfitting through generalization, the trade-off is an increased likelihood of false positives. Only use images (and image augmentation) that reflect the actual image input conditions of the deployed model.

In [None]:
def load_files(files, colorspace=COLOR, resize=(128,128), normal=NORMAL_0_1, flatten=False, datatype=np.float32, label=None):
    """ Load a list of file paths and preprocess as images for a neural network.
    Args:
        files     : (list(str)) a list of files paths of images (of the same classification).
        colorspace: (int) the openCV flag for colorspace conversion
        resize    : (tuple(int, int)) the new size of the image specified as (height, width)
        normal    : (int) flag for selecting the normalization method
        flatten   : (bool) flag for selecting to flatten into 1D vector (True)
        datatype  : (type) the datatype to convert the raw pixel data to
        label     : (str) the label (class) associated with the files (collection)

    Returns:
        A tuple (images, label, errors):
            images: a collection of images as a numpy array of matrices (each corresponding to an image)
                    ready for feeding into the input vector of a neural network, or None if the
                    collection as a whole could not be normalized.
            label : the label passed in, returned unchanged.
            errors: a list of (file, exception) pairs for each image that failed to be processed. A
                    failure of the whole collection is recorded as (None, exception).
        
    Raises:
        None.
    """
    images = []
    errors = []
    for file in files:
        try:
            # Read in an image from disk
            image = image_input(file, colorspace)
            # Guard against a None image (cv2.imread() returns None rather than raising for a
            # file it cannot read), so the recorded reason names the real failure.
            if image is None:
                raise Exception('load_files(): could not read in image: ' + str(file))
            # Resize the image for the target neural network
            images.append(image_resize(image, resize, flatten))
        except Exception as e:
            # Skip processing this image,
            # Keep a list of the images that failed to process and reason why
            errors.append( (file, e) )
    
    try:
        # Convert list of images into numpy multidimensional array, then normalize the images
        images = image_normalization(np.asarray(images), normal, datatype)
    except Exception as e:
        # this is a critical (unrecoverable) error for the whole collection;
        # record the reason instead of discarding it
        errors.append( (None, e) )
        return None, label, errors
    
    # Assemble a multidimensional numpy array of input vectors for this list of files.
    return images, label, errors

In [None]:
import os, time

def load_directory(dir, colorspace=COLOR, resize=(128,128), normal=NORMAL_0_1, flatten=False, datatype=np.float32, concurrent=1, verbose=False):
    """ Load and Process a dataset of images for training a neural network.
    Args:
        dir       : (str) A directory structure of images, where subfolders are the classes.
        colorspace: (int) the openCV flag for colorspace conversion.
        resize    : (tuple(int, int)) the new size of the image specified as (height, width).
        normal    : (int) flag for selecting the normalization method.
        flatten   : (bool) flag for selecting to flatten into 1D vector (True).
        datatype  : (type) the datatype to convert the raw pixel data to.
        concurrent: (int) the number of collections to process in parallel (a process pool is
                    used when greater than 1).
        verbose   : (bool) flag to display to console progress, warnings and errors.
    
    Returns:
        A list of tuples, where each tuple is the pair: processed images for a class, and the corresponding class.
        The processed images are None for a class whose collection could not be normalized.
        
    Raises:
        Exception: dir is not a directory.
    """
    if not os.path.isdir(dir):
        raise Exception('load_directory(): root dir is not a directory: ' + dir)

    start_time = time.time()

    # return object: set of collections and corresponding labels
    collections = []

    def _collect(subdir, result):
        """ Store one finished collection as (data, label) and report its per-image failures. """
        data, label, errors = result
        collections.append( (data, label) )
        if verbose:
            for file, reason in errors:
                print("WARNING: Unable to process image:", file, reason)
            print("Data Preprocessed:", subdir)

    # concurrency setup
    pool = mp.Pool(concurrent) if concurrent > 1 else None
    # (subdir, AsyncResult) pairs for collections handed to the pool
    pending = []

    try:
        # Iterate through all the subdirectories. These should be the classes (labels) and their
        # corresponding contents the images.
        for entry in os.listdir(dir):
            subdir = os.path.join(dir, entry)

            # Process only subdirectories. For example, there maybe a license file under the root (parent) directory.
            if not os.path.isdir(subdir):
                if verbose: print("WARNING: Directory entry is not a folder:", subdir)
                continue

            # Subdirectory name is the label for these images (collection)
            label = os.path.basename(subdir)
            try:
                # Get all the files in the directory
                files = [os.path.join(subdir, file) for file in os.listdir(subdir)]
                args = (files, colorspace, resize, normal, flatten, datatype, label)

                # Load and process all the images for this collection (class)
                if pool is not None:
                    pending.append( (subdir, pool.apply_async(load_files, args)) )
                else:
                    _collect(subdir, load_files(*args))
            except Exception as e:
                if verbose: print("ERROR: Unable to process images in Directory:", subdir, e)

        # Gather the pooled results in submission order. get() re-raises an exception from the
        # worker, so a failed collection is reported rather than silently dropped, and every
        # entry has the same (data, label) shape as in the non-concurrent path.
        for subdir, task in pending:
            try:
                _collect(subdir, task.get())
            except Exception as e:
                if verbose: print("ERROR: Unable to process images in Directory:", subdir, e)
    finally:
        # Always release the worker processes, even if an unexpected error escapes
        if pool is not None:
            pool.close()
            pool.join()
            
    if verbose: print("Total Time:", time.time() - start_time)
    return collections