# 3D bounding box model

In this notebook, we design the model that takes training images and 2D bounding box information and outputs 3D bounding boxes.

In [33]:
# Import statements.
import os
from datetime import datetime
from typing import Dict, Any

import numpy as np
import tensorflow as tf
from tensorflow import keras
import keras.backend as K
from keras.callbacks import History, ModelCheckpoint, TensorBoard
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, LeakyReLU, Dropout, Reshape, Softmax, Lambda

from data_processing import parse_annotation, data_gen
from dataset import TRAIN_KEY, VAL_KEY

In [32]:
# Global constants.

# Shape passed to keras.Input for the network
# (presumably height, width, channels -- confirm against data_gen).
VGG_INPUT_SHAPE = (224, 224, 3)

# Number of units in the final Dense layer of the dimension head.
DIM_OUT_SHAPE = 3

# Shape the orientation head's output is reshaped to
# (presumably (bins, components per bin) -- confirm against the labels).
ORIENT_OUT_SHAPE = (2, 2)

# Number of units in the final Dense layer of the confidence head.
CONF_OUT_SHAPE = 2

# Default keyword arguments for building a model.
DEFAULT_MODEL_ARGS = dict(input_shape=VGG_INPUT_SHAPE)

# Default options read by train().
DEFAULT_TRAIN_ARGS = dict(
    epochs=5,
    batch_size=32,
    use_tensorboard=False,
    model_checkpoint_filename=None,
)

In [22]:
def orientation_loss(y_true, y_pred):
    """Loss for the orientation head, built from per-anchor dot products.

    Both tensors are indexed as ``[:, :, 0]`` and ``[:, :, 1]``, i.e. they
    are treated as rank-3 with at least two entries on the last axis
    (presumably (batch, bins, (cos, sin)) -- TODO confirm against data_gen).

    Args:
        y_true: Ground-truth orientation tensor.
        y_pred: Predicted orientation tensor, indexed the same way as y_true.

    Returns:
        A scalar tensor: the mean of the per-sample loss values.
    """
    # An anchor counts as "active" for a sample when the squared norm of its
    # ground-truth vector exceeds 0.5; count the active anchors per sample.
    squared_norm = tf.reduce_sum(tf.square(y_true), axis=2)
    is_active = tf.greater(squared_norm, tf.constant(0.5))
    active_count = tf.reduce_sum(tf.cast(is_active, tf.float32), 1)

    # Dot product between the true and predicted 2-vectors of every anchor.
    dot = y_true[:, :, 0] * y_pred[:, :, 0] + y_true[:, :, 1] * y_pred[:, :, 1]

    # NOTE(review): the dot products are averaged over axis 0 before the
    # division by the per-sample anchor count; a sample with zero active
    # anchors would divide by zero. Confirm both are intended.
    mean_dot = tf.reduce_mean(dot, axis=0)
    per_sample = tf.reduce_sum(2 - 2 * mean_dot) / active_count
    return tf.reduce_mean(per_sample)

In [8]:
def _vgg_backbone(inputs):
    """Stack the VGG16-style convolutional blocks on `inputs` and flatten the result.

    Five blocks of 3x3, same-padded, ReLU convolutions (2, 2, 3, 3, 3 layers
    with 64, 128, 256, 512, 512 filters), each block followed by a 2x2 max
    pool with stride 2.
    """
    x = inputs
    for filters, conv_count in ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3)):
        for _ in range(conv_count):
            x = Conv2D(filters=filters, kernel_size=(3, 3), padding="same",
                       activation="relu")(x)
        x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)
    return Flatten()(x)


def _dense_head(features, hidden_units, output_units):
    """Build one output head: Dense -> LeakyReLU(0.1) -> Dropout(0.5) -> linear Dense."""
    x = Dense(hidden_units, activation=None)(features)
    x = LeakyReLU(alpha=0.1)(x)
    x = Dropout(rate=0.5)(x)
    return Dense(output_units, activation=None)(x)


def get_model_3d_deepbox(vgg_filename) -> keras.Model:
    """Build and compile the three-headed 3D bounding box network.

    A VGG16-style convolutional backbone feeds three heads:

    * dimension head: ``DIM_OUT_SHAPE`` linear outputs, mean squared error loss;
    * orientation head: output reshaped to ``ORIENT_OUT_SHAPE`` and
      L2-normalised along the last axis, trained with ``orientation_loss``;
    * confidence head: ``CONF_OUT_SHAPE`` softmax outputs, categorical
      cross-entropy loss.

    The losses are weighted 4 : 8 : 1 and optimised with Adam.

    Args:
        vgg_filename: Currently unused -- no weights are loaded from it.
            NOTE(review): presumably meant to point at pretrained VGG
            weights; confirm and wire up if so.

    Returns:
        The compiled ``keras.Model`` with outputs
        ``[dim_out, orient_out, conf_out]``.
    """
    inputs = keras.Input(shape=VGG_INPUT_SHAPE)
    features = _vgg_backbone(inputs)

    dim_out = _dense_head(features, 512, DIM_OUT_SHAPE)

    orient_out = _dense_head(features, 256,
                             ORIENT_OUT_SHAPE[0] * ORIENT_OUT_SHAPE[1])
    # Keras' Reshape target shape excludes the batch axis. The previous
    # [-1, 2, 2] target therefore produced (batch, 1, 2, 2), which made the
    # axis=2 normalisation below run across bins instead of across each
    # bin's two components, and did not match the rank-3 indexing used by
    # orientation_loss.
    orient_out = Reshape(ORIENT_OUT_SHAPE)(orient_out)
    # Normalise each bin's vector to unit length (axis 2 is now the last axis).
    orient_out = Lambda(lambda t: K.l2_normalize(t, axis=2))(orient_out)

    conf_out = _dense_head(features, 256, CONF_OUT_SHAPE)
    conf_out = Softmax()(conf_out)

    model = keras.Model(inputs=inputs, outputs=[dim_out, orient_out, conf_out])
    model.compile(loss=[tf.keras.losses.mean_squared_error,
                        orientation_loss,
                        tf.keras.losses.categorical_crossentropy],
                  loss_weights=[4, 8, 1],
                  optimizer='adam')
    return model

In [21]:
def train(model: keras.Model, partition, labels, image_dir, label_dir, train_args=DEFAULT_TRAIN_ARGS) -> History:
    """Fit `model` on the training split and validate on the validation split.

    Args:
        model: Compiled Keras model to train.
        partition: Mapping indexed with ``TRAIN_KEY`` and ``VAL_KEY``; each
            entry is passed to ``parse_annotation`` together with `label_dir`.
        labels: Currently unused by this function.
        image_dir: Passed through to ``data_gen`` (presumably the directory
            holding the images -- confirm against data_processing).
        label_dir: Passed through to ``parse_annotation``.
        train_args: Training options. Recognised keys are ``'epochs'``,
            ``'batch_size'``, ``'use_tensorboard'`` and
            ``'model_checkpoint_filename'``; any key that is missing falls
            back to its value in ``DEFAULT_TRAIN_ARGS``.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Merge over the defaults so a partial options dict does not raise
    # KeyError; the caller's dict (and the shared default) is never mutated.
    args = {**DEFAULT_TRAIN_ARGS, **(train_args or {})}
    batch_size = args['batch_size']

    train_objs = parse_annotation(partition[TRAIN_KEY], label_dir)
    val_objs = parse_annotation(partition[VAL_KEY], label_dir)
    np.random.shuffle(train_objs)
    np.random.shuffle(val_objs)

    train_gen = data_gen(image_dir, train_objs, batch_size)
    val_gen = data_gen(image_dir, val_objs, batch_size)

    # Round up so a final partial batch still counts as a step.
    train_steps = int(np.ceil(len(train_objs) / batch_size))
    val_steps = int(np.ceil(len(val_objs) / batch_size))

    callbacks = []
    if args['use_tensorboard']:
        # Use a compact timestamp: str(datetime.now()) contains spaces and
        # colons, which are awkward (or invalid) in directory names.
        log_dir = 'logs_{0}'.format(datetime.now().strftime('%Y%m%d-%H%M%S'))
        callbacks.append(TensorBoard(log_dir=log_dir))
    if args['model_checkpoint_filename']:
        callbacks.append(ModelCheckpoint(
            args['model_checkpoint_filename'],
            save_best_only=True))

    return model.fit(
        x=train_gen,
        epochs=args['epochs'],
        callbacks=callbacks,
        validation_data=val_gen,
        steps_per_epoch=train_steps,
        validation_steps=val_steps)

In [13]:
def evaluate(model, partition, labels, image_dir) -> Dict[str, Any]:
    """Placeholder for model evaluation; not implemented yet.

    Every argument is currently ignored and the function always returns
    ``None``, despite the ``Dict[str, Any]`` return annotation.
    """
    # TODO: implement evaluation and return the results.
    result = None
    return result

In [15]:
def predict(model, image_paths):
    """Placeholder for inference; not implemented yet.

    Both arguments are currently ignored and the function always returns
    ``None``.
    """
    # TODO: implement prediction for the given image paths.
    result = None
    return result