In [0]:
import math

import numpy as np

import keras.backend as K
from keras.layers import Conv2D, Input, Lambda, MaxPooling2D, ZeroPadding2D

#from utils import L2Normalization

In [0]:
num_classes = 2
share_location = True

# Run-time options, data/model paths and prior-box switches.
config = {
    'run_soon': True,
    'resume_training': True,
    'remove_old_models': False,
    'denser_prior_boxes': True,
    'use_polygon': True,
    'train_data': "./data/train_lmdb/",
    'test_data': "./data/test_lmdb/",
    'resize_width': 384,
    'resize_height': 384,
    'lr_mult': 1,
    'base_lr': 0.0001,
    'pretrain_model': "models/model_pre_train_syn.caffemodel",
    'label_map_file': "data/text/labelmap_voc.prototxt",
    'flip': True,
    'clip': False,
}

min_dim = 300
mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']

# Prior-box scales are expressed in percent of min_dim.
min_ratio = 10
max_ratio = 90
# The first source layer gets a hand-picked scale below, so the remaining
# ratio range is divided over (number of source layers - 2) intervals.
step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))

# Seed both lists with the fixed 10% / 20% scales of the first source layer,
# then append one (min, max) pair per ratio step.
min_sizes = [min_dim * 10 / 100.]
max_sizes = [min_dim * 20 / 100.]
for ratio in range(min_ratio, max_ratio + 1, step):
    min_sizes.append(min_dim * ratio / 100.)
    max_sizes.append(min_dim * (ratio + step) / 100.)

steps = [8, 16, 32, 64, 100, 300]
# aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
# Every source layer uses the same set of aspect ratios (one fresh list each).
aspect_ratios = [[2, 3, 4, 5] for _ in mbox_source_layers]
# L2 normalize conv4_3 only; -1 marks layers without normalization.
normalizations = [20] + [-1] * (len(mbox_source_layers) - 1)
# variance used to encode/decode prior bboxes.
prior_variance = [0.1, 0.1, 0.2, 0.2]

In [0]:
def _as_list(value):
    ''' return `value` unchanged when it is a list, otherwise wrap it in a one-element list '''
    return value if isinstance(value, list) else [value]


def _per_layer_entry(values, index):
    ''' return values[index] as a list, or [] when `values` is None, empty or shorter than index + 1 '''
    if not values or len(values) <= index:
        return []
    return _as_list(values[index])


def _count_priors_per_location(min_size, max_size, aspect_ratio, flip, denser_prior_boxes):
    ''' number of prior boxes generated at one feature-map location '''
    # One box per min_size, one extra when a max_size is given, one per
    # aspect ratio, and one more per aspect ratio when flipping is enabled.
    boxes_per_min_size = 1 + len(aspect_ratio)
    if max_size:
        boxes_per_min_size += 1
    if flip:
        boxes_per_min_size += len(aspect_ratio)
    num_priors = boxes_per_min_size * len(min_size)
    if denser_prior_boxes:
        num_priors *= 2
    return num_priors


def CreatedMultihead_Multitask(input_tensor,
                                    from_layers,
                                    min_sizes,
                                    max_sizes,
                                    use_polygon,
                                    aspect_ratios,
                                    steps,
                                    normalizations,
                                    num_classes,
                                    share_location,
                                    flip,
                                    clip,
                                    prior_variance,
                                    denser_prior_boxes,
                                    kernel_size,
                                    pad):
    ''' computes the output sizes of the top layer network for detecting the bounding boxes

    For every source layer this derives the number of prior boxes per
    location and, from it, the channel counts of the localization and
    confidence heads. No layers are created here.

    Arguments:
        input_tensor: not used by the current implementation.
        from_layers: one entry per source layer; only its length is used.
        min_sizes: per-layer minimum prior size (scalar or list), same
            length as `from_layers`.
        max_sizes: optional per-layer maximum prior size (scalar or list);
            may be None or empty.
        use_polygon: if truthy each prior regresses 4 + 8 values, otherwise
            4 + 5 values.
        aspect_ratios: optional per-layer aspect ratios (scalar or list);
            may be None or empty.
        steps: optional; only its length is validated.
        normalizations: optional; only its length is validated.
        num_classes: positive number of classes.
        share_location: if falsy the localization outputs are multiplied by
            `num_classes`.
        flip: if truthy every aspect ratio contributes a second prior.
        clip, prior_variance, kernel_size, pad: not used by the current
            implementation.
        denser_prior_boxes: if truthy the number of priors is doubled.

    Returns:
        (num_outputs_loc, num_outputs_conf): two lists with one integer per
        source layer.

    Raises:
        AssertionError: if `num_classes` is missing/non-positive or a
            provided per-layer list does not match `from_layers` in length.
    '''

    assert num_classes, "must provide num_classes"
    assert num_classes > 0, "num_classes must be positive number"
    if normalizations:
        assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length"
    assert len(from_layers) == len(min_sizes), "from_layers and min_sizes should have same length"
    if max_sizes:
        assert len(from_layers) == len(max_sizes), "from_layers and max_sizes should have same length"
    if aspect_ratios:
        assert len(from_layers) == len(aspect_ratios), "from_layers and aspect_ratios should have same length"
    if steps:
        assert len(from_layers) == len(steps), "from_layers and steps should have same length"

    # Regressed values per prior: 4 for the box plus 8 (polygon) or 5.
    values_per_prior = 4 + (8 if use_polygon else 5)

    num_outputs_loc = []
    num_outputs_conf = []
    for i in range(len(from_layers)):
        # Estimate number of priors per location given provided parameters.
        min_size = _as_list(min_sizes[i])
        # The optional lists are guarded against None as well as empty so
        # that they behave the same way here as in the asserts above.
        aspect_ratio = _per_layer_entry(aspect_ratios, i)
        max_size = _per_layer_entry(max_sizes, i)
        if max_size:
            assert len(max_size) == len(min_size), "max_size and min_size should have same length."

        num_priors_per_location = _count_priors_per_location(
            min_size, max_size, aspect_ratio, flip, denser_prior_boxes)

        # number of outputs for localizing layer
        num_loc_output = num_priors_per_location * values_per_prior
        if not share_location:
            num_loc_output *= num_classes
        num_outputs_loc.append(num_loc_output)

        # number of outputs for confidence layer
        num_outputs_conf.append(num_priors_per_location * num_classes)

    return num_outputs_loc, num_outputs_conf

In [0]:
def VGG16Body(input_tensor, istrainable=True):
    ''' fully convolutionized VGG model

    Builds conv1 through conv5 (each block followed by max pooling) and the
    two convolutional replacements of the original fully connected layers
    (conv6 with dilation rate 6, conv7 with a 1x1 kernel).

    Arguments:
        input_tensor: tensor fed into the first convolution (conv1_1).
        istrainable: forwarded as `trainable` to the conv1 and conv2 layers
            only; the later layers are created with the Keras default.

    Returns:
        The output tensor of the conv7 layer.
    '''

    # conv1
    # conv1_1 has to be created first because conv1_2 consumes its output.
    conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1_1', trainable=istrainable)(input_tensor)
    conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1_2', trainable=istrainable)(conv1_1)
    pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2)

    # conv2
    conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2_1', trainable=istrainable)(pool1)
    conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2_2', trainable=istrainable)(conv2_1)
    pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2)

    # conv3
    conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1')(pool2)
    conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2')(conv3_1)
    # Layer names must be unique within a model, so the third conv gets its own name.
    conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_3')(conv3_2)
    pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3)

    # conv4
    conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1')(pool3)
    conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2')(conv4_1)
    conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_3')(conv4_2)
    pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3)

    # conv5
    conv5_1 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv5_1')(pool4)
    conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv5_2')(conv5_1)
    conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv5_3')(conv5_2)
    # pool5 uses stride 1, so it does not downsample any further.
    pool5 = MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same', name='pool5')(conv5_3)

    # original VGG16 model has FC layer after Conv5 but here we want fully conv layer
    # so we convert the original FC layers to conv layers
    # conv6 (fc6)
    conv6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', name='conv6')(pool5)

    # conv7
    conv7 = Conv2D(1024, (1, 1), activation='relu', padding='same', name='conv7')(conv6)

    return conv7

In [0]:
def AddExtraLayers(input_tensor):
    ''' extra convolution layers (conv6 to conv9) stacked on top of VGG16

    Each stage is a 1x1 convolution followed by a 3x3 'valid' convolution.
    The conv6 and conv7 stages zero-pad by one pixel and use stride 2; the
    conv8 and conv9 stages use stride 1 without padding.

    Returns the output tensor of the last layer (conv9_2).
    '''

    # conv6
    squeeze6 = Conv2D(filters=256, kernel_size=(1, 1), padding='same',
                      activation='relu', name='conv6_1')(input_tensor)
    padded6 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(squeeze6)
    conv6_2 = Conv2D(filters=512, kernel_size=(3, 3), strides=(2, 2), padding='valid',
                     activation='relu', name='conv6_2')(padded6)

    # conv7
    squeeze7 = Conv2D(filters=128, kernel_size=(1, 1), padding='same',
                      activation='relu', name='conv7_1')(conv6_2)
    padded7 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(squeeze7)
    conv7_2 = Conv2D(filters=256, kernel_size=(3, 3), strides=(2, 2), padding='valid',
                     activation='relu', name='conv7_2')(padded7)

    # conv8
    squeeze8 = Conv2D(filters=128, kernel_size=(1, 1), padding='same',
                      activation='relu', name='conv8_1')(conv7_2)
    conv8_2 = Conv2D(filters=256, kernel_size=(3, 3), strides=(1, 1), padding='valid',
                     activation='relu', name='conv8_2')(squeeze8)

    # conv9
    squeeze9 = Conv2D(filters=128, kernel_size=(1, 1), padding='same',
                      activation='relu', name='conv9_1')(conv8_2)
    conv9_2 = Conv2D(filters=256, kernel_size=(3, 3), strides=(1, 1), padding='valid',
                     activation='relu', name='conv9_2')(squeeze9)

    return conv9_2

In [0]:
def TextBoxesplusplusModel(image_shape,
                        mean_subtraction,
                        stddev_norm,
                        swap_channels):
    ''' full model of TextboxesPlusPlus '''

    img_height, img_width, img_channels = image_shape[0], image_shape[1], image_shape[2]

    #######################################################
    # Define the lambda layers necessary
    #######################################################
    def image_identity_layer(input_tensor):
        ''' return the input tensor unchanged '''
        return input_tensor

    def image_mean_subtraction_layer(input_tensor):
        ''' subtract `mean_subtraction` (converted to a numpy array) from the input '''
        mean_offset = np.array(mean_subtraction)
        return input_tensor - mean_offset

    def image_stddev_norm_layer(input_tensor):
        ''' divide the input by `stddev_norm` (converted to a numpy array) '''
        # Standard-deviation normalization rescales the input; subtracting
        # here would only apply a second offset after the mean subtraction.
        return input_tensor / np.array(stddev_norm)

    def image_swap_channels_layer(input_tensor):
        ''' rebuild the last axis from the three channels listed in `swap_channels` '''
        # Pick the channels in the requested order, then stack them back
        # together along the last axis.
        reordered = [input_tensor[..., swap_channels[k]] for k in range(3)]
        return K.stack(reordered, axis=-1)

    #######################################################
    # Define the Textboxes++ model
    #######################################################
    x = Input(shape=(img_height, img_width, img_channels))

    # create a identity layer for further use
    x1 = Lambda(image_identity_layer, output_shape=image_shape, name='image_identity_layer')(x)

    ''' apply the basic normalizations as applied '''
    # mean subtraction
    if mean_subtraction:
        x1 = Lambda(image_mean_subtraction_layer, output_shape=image_shape, name='image_mean_subtraction')(x1)
    # stddev normalization
    if stddev_norm:
        x1 = Lambda(image_stddev_norm_layer, output_shape=image_shape, name='image_stddev_norm')(x1)
    # channel swap
    if swap_channels:
        x1 = Lambda(image_swap_channels_layer, output_shape=image_shape, name='image_swap_channels')(x1)

    # fully convolutional VGG16 layer frontend
    x1 = VGG16Body(x1)

    # add extra convolution layers on TextboxesPlusPlus
    x1 = AddExtraLayers(x1)

    # normalize conv4_3 layer
    conv4_3_norm = L2L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)