# Depth Prediction from RGB and Infrared Input

This model predicts a depth image from an RGB image and an infrared image of the same resolution.



## Import the necessary modules

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from keras.utils import Sequence # for data generator class
from keras.models import Model
from keras.layers import Input, Conv2D, Activation, BatchNormalization, Dropout, concatenate, Conv2DTranspose
from keras.layers import Add # for skip connections
from keras.utils import plot_model
import json # for saving training history

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Training data situation
The training data (as well as the test and validation data) is expected in directories with the following structure:

<pre>
data
|-- train
    |-- Color
        |-- 1.jpg
        |-- 2.jpg
        ...
        |-- n.jpg
    |-- Infrared
        |-- 1.png
        |-- 2.png
        ...
        |-- n.png
    |-- Depth
        |-- 1.png
        |-- 2.png
        ...
        |-- n.png
|-- test
    |-- Color
    |-- Infrared
    |-- Depth
|-- validation
    |-- Color
    |-- Infrared
    |-- Depth
</pre>

## The Data Generator
Because there are many training and test images, it is reasonable to use a data loader that reads the training data batch-wise. Because the default Keras data loader (`ImageDataGenerator`) does not support models with two input images, we need to write our own. It is based on the tutorial at https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.

In [2]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields batches of ``([color, infrared], depth)``.

    Assumes that the examples in the provided folder are named from 1 to n,
    with n being the number of images, and that the folder contains the
    sub-directories ``Color`` (``<i>.jpg``), ``Infrared`` (``<i>.png``) and
    ``Depth`` (``<i>.png``).

    Parameters
    ----------
    path_to_data_set : str
        Directory containing the ``Color``, ``Infrared`` and ``Depth`` folders.
    batch_size : int
        Number of samples per batch. Samples that do not fill a complete
        batch at the end of an epoch are dropped.
    image_size : tuple of int
        ``(height, width)`` of every image.
    shuffle : bool
        Whether to reshuffle the sample order after each epoch.
    scale_images : bool
        If true, color values are divided by 255 and infrared values by
        65535 and returned as ``float32``; otherwise the raw ``uint8`` /
        ``uint16`` values are returned. Depth targets are never scaled.
    """

    def __init__(self, path_to_data_set='data/train', batch_size=32, image_size=(480,640), shuffle=True, scale_images=False):
        super().__init__()
        self.path_to_data = path_to_data_set
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.scale_images = scale_images
        self.training_size = self.__get_training_data_size(self.path_to_data)
        self.on_epoch_end()

    def __get_training_data_size(self, path_to_data):
        """Return the number of files in ``<path_to_data>/Color`` (0 if the folder is missing)."""
        path_color = os.path.join(path_to_data, 'Color')
        if not os.path.isdir(path_color):
            return 0
        return sum(
            1 for entry in os.listdir(path_color)
            if os.path.isfile(os.path.join(path_color, entry))
        )

    def __len__(self):
        """Number of complete batches per epoch."""
        return self.training_size // self.batch_size

    def on_epoch_end(self):
        """Rebuild the sample indices (and reshuffle them if requested) after each epoch."""
        # image names start with 1, np.arange(n, m) returns values from n to (m-1)
        self.indices = np.arange(1, self.training_size + 1)
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __read_image(self, sub_directory, file_name, flags):
        """Read one image and fail with the offending path if it cannot be loaded."""
        path = os.path.join(self.path_to_data, sub_directory, file_name)
        img = cv2.imread(path, flags)
        # cv2.imread signals failure by returning None instead of raising
        if img is None:
            raise IOError("Could not read image: " + path)
        return img

    def __data_generation(self, list_images):
        """Load the samples named in ``list_images``.

        Returns ``(X1, X2, y)`` with shapes ``(n, H, W, 3)``, ``(n, H, W, 1)``
        and ``(n, H, W, 1)``, where ``n = len(list_images)`` and
        ``(H, W) = image_size``.
        """
        n_samples = len(list_images)
        height_width = tuple(self.image_size)

        # Scaled inputs lie in [0, 1]; storing them in integer buffers would
        # truncate almost every value to 0, so use float buffers in that case.
        color_dtype = np.float32 if self.scale_images else np.uint8
        ir_dtype = np.float32 if self.scale_images else np.uint16

        X1 = np.empty((n_samples, *height_width, 3), dtype=color_dtype)  # color images
        X2 = np.empty((n_samples, *height_width, 1), dtype=ir_dtype)     # ir images
        y = np.empty((n_samples, *height_width, 1), dtype=np.uint16)     # depth images

        for idx, name in enumerate(list_images):
            color = self.__read_image('Color', str(name) + ".jpg", cv2.IMREAD_COLOR)
            color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
            infrared = self.__read_image('Infrared', str(name) + ".png", cv2.IMREAD_ANYDEPTH)
            depth = self.__read_image('Depth', str(name) + ".png", cv2.IMREAD_ANYDEPTH)

            if self.scale_images:
                X1[idx] = (color / 255.).astype(np.float32)
                X2[idx, ..., 0] = (infrared / 65535.).astype(np.float32)
            else:
                X1[idx] = color.astype(np.uint8)
                X2[idx, ..., 0] = infrared.astype(np.uint16)
            y[idx, ..., 0] = depth.astype(np.uint16)

        return X1, X2, y

    def __getitem__(self, index):
        """Generate one batch of data.

        Returns ``([X1, X2], y)``: X1 holds the RGB images, X2 the infrared
        images and y the corresponding 16-bit depth images. X1/X2 are
        ``uint8``/``uint16`` when ``scale_images`` is false and ``float32``
        in [0, 1] otherwise.
        """
        # Select the sample names belonging to this batch
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]

        X1, X2, y = self.__data_generation(indices)

        return [X1, X2], y
    
    
            

## Initialize the DataLoader

In [3]:
# Training and validation generators use identical settings and differ only
# in the directory they read from. Inputs are scaled (scale_images=True).
training_generator = DataGenerator(
    os.path.join('data', 'train'),
    batch_size=32, image_size=(480,640), shuffle=True, scale_images=True)

validation_generator = DataGenerator(
    os.path.join('data', 'validation'),
    batch_size=32, image_size=(480,640), shuffle=True, scale_images=True)

## VGG Class Definition
To make the model definition easier to read (and to avoid repeating layer code), this part defines a helper class whose `VGG_Block` method creates multiple layers at once.

In [35]:
class VGG:
    """Builds VGG-style stacks of Conv2D + BatchNormalization layers with unique names."""

    def __init__(self):
        # Maps a base layer name (e.g. 'Conv3-64') to the number of conv
        # layers created with that base name so far.
        self.layer_counting = {}

    def VGG_Block(self, number_of_layers, units, kernel_size, padding, activation):
        """Return a callable that applies ``number_of_layers`` conv/batch-norm pairs to a tensor.

        Each convolution is named ``Conv<k>-<units>_<i>``, where ``<k>`` is
        ``kernel_size[0]`` and ``<i>`` is a running counter shared by all
        blocks created through this instance; the batch normalization that
        follows it gets the same name with the suffix ``_BN``.
        """
        def apply_block(tensor):
            for _ in range(number_of_layers):
                base = 'Conv{}-{}'.format(kernel_size[0], units)
                # advance the per-name counter so every layer name is unique
                count = self.layer_counting.get(base, 0) + 1
                self.layer_counting[base] = count
                conv_name = base + '_' + str(count)
                conv = Conv2D(units, kernel_size=kernel_size, padding=padding,
                              activation=activation, name=conv_name)
                tensor = conv(tensor)
                tensor = BatchNormalization(name=conv_name + '_BN')(tensor)
            return tensor

        return apply_block

## The actual model
This section defines the network architecture of the neural network. It consists of different parts:
- input layers
- fusion layer
- VGG16-like encoder network (configuration D) (see https://arxiv.org/pdf/1409.1556.pdf)
- mirrored decoder network
- output

In [36]:
# Builds and compiles the encoder/decoder network: the color and infrared
# inputs are concatenated along the channel axis, passed through a VGG16-like
# encoder with strided convolutions, a bottleneck block, and a mirrored decoder
# with transposed convolutions, followed by a single-channel output convolution.
vgg = VGG()
# Color branch
input_color = Input(shape=(480,640,3), name="Color_Input")
# identity model around the input; only x.input / x.output are used below
x = Model(inputs=input_color, outputs=input_color)

# Infrared branch
input_ir = Input(shape=(480,640,1), name="Infrared_Input")
# identity model around the input; only y.input / y.output are used below
y = Model(inputs=input_ir, outputs=input_ir)

# combine both branches
combined = concatenate([x.output, y.output], name="Concatenate")

# first skip connection start
# NOTE: the skip_* tensors are stored but currently unused, because every
# Add() in the decoder below is commented out.
skip_one = combined

# VGG16 style encoder (configuration D)
z = vgg.VGG_Block(number_of_layers=2, units=64, kernel_size=(3,3), padding="same", activation="relu")(combined)
# max pooling replaced with strided convolution
z = Conv2D(64, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="Conv3-64_strided")(z)

# second skip connection start
skip_two = z

z = vgg.VGG_Block(number_of_layers=2, units=128, kernel_size=(3,3), padding="same", activation="relu")(z)
# max pooling replaced with strided convolution
z = Conv2D(128, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="Conv3-128_strided")(z)

# third skip connection start
skip_three = z

z = vgg.VGG_Block(number_of_layers=3, units=256, kernel_size=(3,3), padding="same", activation="relu")(z)
# max pooling replaced with strided convolution
z = Conv2D(256, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="Conv3-256_strided")(z)

# fourth skip connection start
skip_four = z

z = vgg.VGG_Block(number_of_layers=3, units=512, kernel_size=(3,3), padding="same", activation="relu")(z)
# max pooling replaced with strided convolution
z = Conv2D(512, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="Conv3-512_strided")(z)

# fifth skip connection start
skip_five = z

z = vgg.VGG_Block(number_of_layers=3, units=512, kernel_size=(3,3), padding="same", activation="relu")(z)
# max pooling replaced with strided convolution
z = Conv2D(512, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="Conv3-512_strided_2")(z)

# sixth skip connection start
skip_six = z

# end of encoder part

# bottleneck between encoder and decoder
z = vgg.VGG_Block(number_of_layers=3, units=1024, kernel_size=(3,3), padding="same", activation="relu")(z)

# start of decoder part (= mirrored encoder part)
# Each stage upsamples with a stride-2 transposed convolution and then applies
# a VGG block.
# NOTE(review): the names of the last three transposed convolutions
# ("DeConv3-512_3", "DeConv3-256", "DeConv3-128") do not match their filter
# counts (256, 128, 64). Renaming them would change the layer names of saved
# models, so confirm before changing.

z = Conv2DTranspose(512, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="DeConv3-512_1")(z)
z = vgg.VGG_Block(number_of_layers=3, units=512, kernel_size=(3,3), padding="same", activation="relu")(z)

# fifth skip connection end
# NOTE(review): before re-enabling any of the Add() skip connections, verify
# that both operands have the same shape at that point; e.g. skip_one has
# 4 channels while z has 64 channels at the "first skip connection end".
#z = Add()([z, skip_five])

z = Conv2DTranspose(512, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="DeConv3-512_2")(z)
z = vgg.VGG_Block(number_of_layers=3, units=512, kernel_size=(3,3), padding="same", activation="relu")(z)


# fourth skip connection end
#z = Add()([z, skip_four])

z = Conv2DTranspose(256, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="DeConv3-512_3")(z)
z = vgg.VGG_Block(number_of_layers=3, units=256, kernel_size=(3,3), padding="same", activation="relu")(z)


# third skip connection end
#z = Add()([z, skip_three])

z = Conv2DTranspose(128, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="DeConv3-256")(z)
z = vgg.VGG_Block(number_of_layers=2, units=128, kernel_size=(3,3), padding="same", activation="relu")(z)

# second skip connection end
#z = Add()([z, skip_two])

z = Conv2DTranspose(64, kernel_size=(3,3), strides=(2,2), padding="same", activation="relu", name="DeConv3-128")(z)
z = vgg.VGG_Block(number_of_layers=2, units=64, kernel_size=(3,3), padding="same", activation="relu")(z)


# first skip connection end
#z = Add()([z, skip_one])


# end of decoder part

# output layer: a single-channel convolution without an activation argument
z = Conv2D(1, kernel_size=(3,3), padding="same", name="Conv3-1")(z)

# the model takes [color, infrared] and outputs the predicted depth map
model = Model(inputs=[x.input, y.input], outputs=z)

# mean absolute error is the training loss; MAE and MSE are tracked as metrics
model.compile(
    optimizer="adam",
    loss="mae",
    metrics=['mae', 'mse'])



# TODO: add skip connections: https://towardsdatascience.com/understanding-and-coding-a-resnet-in-keras-446d7ff84d33

## Visualize the network

In [37]:
# Graphical plot of the architecture (currently disabled):
#plot_model(model, to_file=os.path.join('Images','model.png'), show_shapes=True)
# Print a per-layer table with output shapes, parameter counts and connections.
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Color_Input (InputLayer)        (None, 480, 640, 3)  0                                            
__________________________________________________________________________________________________
Infrared_Input (InputLayer)     (None, 480, 640, 1)  0                                            
__________________________________________________________________________________________________
Concatenate (Concatenate)       (None, 480, 640, 4)  0           Color_Input[0][0]                
                                                                 Infrared_Input[0][0]             
__________________________________________________________________________________________________
Conv3-64_1 (Conv2D)             (None, 480, 640, 64) 2368        Concatenate[0][0]                
__________

## Train the Model

In [None]:
# Train for 100 epochs on the training generator and evaluate on the
# validation generator; the returned History object is kept in `hist`.
# Multiprocessing options (use_multiprocessing=True, workers=6) are left disabled.
hist = model.fit_generator(
    training_generator,
    epochs=100,
    validation_data=validation_generator,
)

## Save Model and Training History

In [None]:
# Persist the per-epoch training history as JSON.
# Depending on the Keras version, the recorded metric values can be NumPy
# scalars (e.g. float32), which json.dump rejects with a TypeError, so convert
# every value to a plain Python float first.
history = {
    metric: [float(value) for value in values]
    for metric, values in hist.history.items()
}
with open('history.json', 'w') as f:
    json.dump(history, f)

# Save the trained model to an HDF5 file.
model.save('model.h5')