In [1]:
import tensorflow as tf
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
import glob

In [2]:
_BATCH_NORM_DECAY = 0.9
_BATCH_NORM_EPSILON = 1e-05
_LEAKY_RELU = 0.1
_ANCHORS = [(10, 13), (16, 30), (33, 23),
            (30, 61), (62, 45), (59, 119),
            (116, 90), (156, 198), (373, 326)]
_MODEL_SIZE = (416, 416)

In [3]:
def batch_norm(inputs, training, data_format):
    return tf.layers.batch_normalization(
        inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
        momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
        scale=True, training=training)

In [4]:
def fixed_padding(inputs, kernel_size, data_format):
    print('[INFO] fixed padding')
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg

    if data_format == 'channels_first':
        padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
                                        [pad_beg, pad_end],
                                        [pad_beg, pad_end]])
    else:
        padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
                                        [pad_beg, pad_end], [0, 0]])
    return padded_inputs

In [5]:
test = tf.Variable(tf.truncated_normal([10,448,448,3], stddev=0.02))

fixed_padding(test,3,data_format=None)

Instructions for updating:
Colocations handled automatically by placer.
[INFO] fixed padding


<tf.Tensor 'Pad:0' shape=(10, 450, 450, 3) dtype=float32>

In [6]:
def conv2d_fixed_padding(inputs, filters, kernel_size, data_format, strides=1):
    #print('[INFO] conv2d')
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    return tf.layers.conv2d(
        inputs=inputs, filters=filters, kernel_size=kernel_size,
        strides=strides, padding=('SAME' if strides == 1 else 'VALID'),
        use_bias=False, data_format=data_format)

In [7]:
def darknet53_residual_block(inputs, filters, training, data_format,
                             strides=1):
    shortcut = inputs

    inputs = conv2d_fixed_padding(
        inputs, filters=filters, kernel_size=1, strides=strides,
        data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs = conv2d_fixed_padding(
        inputs, filters=2 * filters, kernel_size=3, strides=strides,
        data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs += shortcut

    return inputs

In [23]:
def darknet53(inputs, training, data_format):
    inputs = conv2d_fixed_padding(inputs, filters=32, kernel_size=3,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    inputs = conv2d_fixed_padding(inputs, filters=64, kernel_size=3,
                                  strides=2, data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    inputs = darknet53_residual_block(inputs, filters=32, training=training,
                                      data_format=data_format)
    print(inputs.get_shape())
    inputs = conv2d_fixed_padding(inputs, filters=128, kernel_size=3,
                                  strides=2, data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    for _ in range(2):
        inputs = darknet53_residual_block(inputs, filters=64,
                                          training=training,
                                          data_format=data_format)
    print(inputs.get_shape())
    inputs = conv2d_fixed_padding(inputs, filters=256, kernel_size=3,
                                  strides=2, data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    for _ in range(8):
        inputs = darknet53_residual_block(inputs, filters=128,
                                          training=training,
                                          data_format=data_format)

    print(inputs.get_shape())
    route1 = inputs

    inputs = conv2d_fixed_padding(inputs, filters=512, kernel_size=3,
                                  strides=2, data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    for _ in range(8):
        inputs = darknet53_residual_block(inputs, filters=256,
                                          training=training,
                                          data_format=data_format)

    route2 = inputs
    print(inputs.get_shape())
    inputs = conv2d_fixed_padding(inputs, filters=1024, kernel_size=3,
                                  strides=2, data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
    print(inputs.get_shape())
    for _ in range(4):
        inputs = darknet53_residual_block(inputs, filters=512,
                                          training=training,
                                          data_format=data_format)
    print(inputs.get_shape())
    return route1, route2, inputs

In [24]:
test = tf.Variable(tf.truncated_normal([10,410,410,3], stddev=0.02))

route1, route2, inputs = darknet53(test,training=False,data_format=None)
print('route1 : ',route1.get_shape())
print('route2 : ',route2.get_shape())
print('inputs : ',inputs.get_shape())

(10, 410, 410, 32)
[INFO] fixed padding
(10, 205, 205, 64)
(10, 205, 205, 64)
[INFO] fixed padding
(10, 103, 103, 128)
(10, 103, 103, 128)
[INFO] fixed padding
(10, 52, 52, 256)
(10, 52, 52, 256)
[INFO] fixed padding
(10, 26, 26, 512)
(10, 26, 26, 512)
[INFO] fixed padding
(10, 13, 13, 1024)
(10, 13, 13, 1024)
route1 :  (10, 52, 52, 256)
route2 :  (10, 26, 26, 512)
inputs :  (10, 13, 13, 1024)


In [25]:
def yolo_convolution_block(inputs, filters, training, data_format):
    inputs = conv2d_fixed_padding(inputs, filters=filters, kernel_size=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs = conv2d_fixed_padding(inputs, filters=2 * filters, kernel_size=3,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs = conv2d_fixed_padding(inputs, filters=filters, kernel_size=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs = conv2d_fixed_padding(inputs, filters=2 * filters, kernel_size=3,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    inputs = conv2d_fixed_padding(inputs, filters=filters, kernel_size=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    route = inputs

    inputs = conv2d_fixed_padding(inputs, filters=2 * filters, kernel_size=3,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training=training, data_format=data_format)
    inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

    return route, inputs

In [26]:
route, inputs = yolo_convolution_block(inputs, 
                                       filters=512, 
                                       training=False,
                                       data_format=None)

print('route : ',route.get_shape())
print('inputs : ',inputs.get_shape())

route :  (10, 13, 13, 512)
inputs :  (10, 13, 13, 1024)


In [27]:
def yolo_layer(inputs, n_classes, anchors, img_size, data_format):
    """Creates Yolo final detection layer.

    Detects boxes with respect to anchors.

    Args:
        inputs: Tensor input.
        n_classes: Number of labels.
        anchors: A list of anchor sizes.
        img_size: The input size of the model.
        data_format: The input format.

    Returns:
        Tensor output.
    """
    n_anchors = len(anchors)
    shape = inputs.get_shape().as_list()
    print("input : ",shape)
    # 나오는 값의 크기 : n_anchors * (5 + n_classes)
    inputs = tf.layers.conv2d(inputs, filters=n_anchors * (5 + n_classes),
                              kernel_size=1, strides=1, use_bias=True,
                              data_format=data_format)
    shape = inputs.get_shape().as_list()
    print("n_anchors * (5 + n_classes) : ",shape)
    grid_shape = shape[2:4] if data_format == 'channels_first' else shape[1:3]
    print("grid : ",grid_shape)
    if data_format == 'channels_first':
        inputs = tf.transpose(inputs, [0, 2, 3, 1])
    inputs = tf.reshape(inputs, [-1, n_anchors * grid_shape[0] * grid_shape[1],
                                 5 + n_classes])
    print("reshape : ",inputs.get_shape().as_list())
    strides = (img_size[0] // grid_shape[0], img_size[1] // grid_shape[1])
    print("output strides",strides)
    box_centers, box_shapes, confidence, classes = \
        tf.split(inputs, [2, 2, 1, n_classes], axis=-1)
    print('box center : ',box_centers.get_shape().as_list())
    print('box shape : ',box_shapes.get_shape().as_list())
    print('box confidence : ',confidence.get_shape().as_list())
    print('box classes : ',classes.get_shape().as_list())
    x = tf.range(grid_shape[0], dtype=tf.float32)
    print('grid x : ',x.get_shape().as_list())
    y = tf.range(grid_shape[1], dtype=tf.float32)
    print('grid y : ',y.get_shape().as_list())
    x_offset, y_offset = tf.meshgrid(x, y)
    x_offset = tf.reshape(x_offset, (-1, 1))
    y_offset = tf.reshape(y_offset, (-1, 1))
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.tile(x_y_offset, [1, n_anchors])
    x_y_offset = tf.reshape(x_y_offset, [1, -1, 2])
    print('grid offset : ',x_y_offset.get_shape().as_list())
    box_centers = tf.nn.sigmoid(box_centers)
    print('sigmoid box center : ',box_centers.get_shape().as_list())
    box_centers = (box_centers + x_y_offset) * strides
    print('offset + box center : ',box_centers.get_shape().as_list())
    anchors = tf.tile(anchors, [grid_shape[0] * grid_shape[1], 1])
    print('anchors',anchors.get_shape().as_list())
    box_shapes = tf.exp(box_shapes) * tf.to_float(anchors)
    with tf.Session() as sess:
        print(sess.run(anchors))
    print('box shape : ',box_shapes.get_shape().as_list())
    confidence = tf.nn.sigmoid(confidence)
    print('box confidence : ',confidence.get_shape().as_list())
    classes = tf.nn.sigmoid(classes)
    print('box classes : ',classes.get_shape().as_list())
    inputs = tf.concat([box_centers, box_shapes,
                        confidence, classes], axis=-1)
    print('result : ',inputs.get_shape().as_list())
    return inputs

In [28]:
detect1 = yolo_layer(inputs, n_classes=80,
                                 anchors=_ANCHORS[0:3],
                                 img_size=_MODEL_SIZE,
                                 data_format=None)

#tf.reshape(detect1,(-1,26,26,3,85))

input :  [10, 13, 13, 1024]
n_anchors * (5 + n_classes) :  [10, 13, 13, 255]
grid :  [13, 13]
reshape :  [10, 507, 85]
output strides (32, 32)
box center :  [10, 507, 2]
box shape :  [10, 507, 2]
box confidence :  [10, 507, 1]
box classes :  [10, 507, 80]
grid x :  [13]
grid y :  [13]
grid offset :  [1, 507, 2]
sigmoid box center :  [10, 507, 2]
offset + box center :  [10, 507, 2]
anchors [507, 2]
[[10 13]
 [16 30]
 [33 23]
 ...
 [10 13]
 [16 30]
 [33 23]]
box shape :  [10, 507, 2]
box confidence :  [10, 507, 1]
box classes :  [10, 507, 80]
result :  [10, 507, 85]


In [14]:
def upsample(inputs, out_shape, data_format):
    if data_format == 'channels_first':
        inputs = tf.transpose(inputs, [0, 2, 3, 1])
        new_height = out_shape[3]
        new_width = out_shape[2]
    else:
        new_height = out_shape[2]
        new_width = out_shape[1]

    inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width))

    if data_format == 'channels_first':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    return inputs

In [15]:
inputs = conv2d_fixed_padding(route, filters=256, kernel_size=1,
                                          data_format=None)
inputs = batch_norm(inputs, training=False,
                    data_format=None)
inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)

print(inputs.get_shape())

upsample_size = route2.get_shape().as_list()
inputs = upsample(inputs, out_shape=upsample_size,
                              data_format=None)
print(inputs.get_shape())

(10, 13, 13, 256)
(10, 26, 26, 256)


In [16]:
def build_boxes(inputs):
    print("===build===")
    center_x, center_y, width, height, confidence, classes = \
        tf.split(inputs, [1, 1, 1, 1, 1, -1], axis=-1)
    print(center_x.get_shape().as_list())
    print(center_y.get_shape().as_list())
    print(width.get_shape().as_list())
    print(height.get_shape().as_list())
    print(confidence.get_shape().as_list())
    print(classes.get_shape().as_list())
    
    top_left_x = center_x - width / 2
    top_left_y = center_y - height / 2
    bottom_right_x = center_x + width / 2
    bottom_right_y = center_y + height / 2

    boxes = tf.concat([top_left_x, top_left_y,
                       bottom_right_x, bottom_right_y,
                       confidence, classes], axis=-1)
    print(boxes.get_shape().as_list())
    return boxes

In [17]:
def non_max_suppression(inputs, n_classes, max_output_size, iou_threshold,
                        confidence_threshold):
    batch = tf.unstack(inputs)
    boxes_dicts = []
    for boxes in batch:
        boxes = tf.boolean_mask(boxes, boxes[:, 4] > confidence_threshold)
        classes = tf.argmax(boxes[:, 5:], axis=-1)
        classes = tf.expand_dims(tf.to_float(classes), axis=-1)
        boxes = tf.concat([boxes[:, :5], classes], axis=-1)

        boxes_dict = dict()
        for cls in range(n_classes):
            mask = tf.equal(boxes[:, 5], cls)
            mask_shape = mask.get_shape()
            if mask_shape.ndims != 0:
                class_boxes = tf.boolean_mask(boxes, mask)
                boxes_coords, boxes_conf_scores, _ = tf.split(class_boxes,
                                                              [4, 1, -1],
                                                              axis=-1)
                boxes_conf_scores = tf.reshape(boxes_conf_scores, [-1])
                indices = tf.image.non_max_suppression(boxes_coords,
                                                       boxes_conf_scores,
                                                       max_output_size,
                                                       iou_threshold)
                class_boxes = tf.gather(class_boxes, indices)
                boxes_dict[cls] = class_boxes[:, :5]

        boxes_dicts.append(boxes_dict)

    return boxes_dicts

In [18]:
class Yolo_v3:
    def __init__(self, n_classes, model_size, max_output_size, iou_threshold,
             confidence_threshold, data_format=None):
        if not data_format:
            if tf.test.is_built_with_cuda():
                data_format = 'channels_first'
            else:
                data_format = 'channels_last'

        self.n_classes = n_classes
        self.model_size = model_size
        self.max_output_size = max_output_size
        self.iou_threshold = iou_threshold
        self.confidence_threshold = confidence_threshold
        self.data_format = data_format
    def __call__(self, inputs, training):
        with tf.variable_scope('yolo_v3_model'):
            if self.data_format == 'channels_first':
                inputs = tf.transpose(inputs, [0, 3, 1, 2])

            inputs = inputs / 255
            # feature extract
            route1, route2, inputs = darknet53(inputs, training=training,
                                               data_format=self.data_format)
        
            route, inputs = yolo_convolution_block(
                inputs, filters=512, training=training,
                data_format=self.data_format)
            # detection
            detect1 = yolo_layer(inputs, n_classes=self.n_classes,
                                 anchors=_ANCHORS[6:9],
                                 img_size=self.model_size,
                                 data_format=self.data_format)

            inputs = conv2d_fixed_padding(route, filters=256, kernel_size=1,
                                          data_format=self.data_format)
            inputs = batch_norm(inputs, training=training,
                                data_format=self.data_format)
            inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
            upsample_size = route2.get_shape().as_list()
            inputs = upsample(inputs, out_shape=upsample_size,
                              data_format=self.data_format)
            axis = 1 if self.data_format == 'channels_first' else 3
            inputs = tf.concat([inputs, route2], axis=axis)
            
            route, inputs = yolo_convolution_block(
                inputs, filters=256, training=training,
                data_format=self.data_format)
            detect2 = yolo_layer(inputs, n_classes=self.n_classes,
                                 anchors=_ANCHORS[3:6],
                                 img_size=self.model_size,
                                 data_format=self.data_format)

            inputs = conv2d_fixed_padding(route, filters=128, kernel_size=1,
                                          data_format=self.data_format)
            inputs = batch_norm(inputs, training=training,
                                data_format=self.data_format)
            inputs = tf.nn.leaky_relu(inputs, alpha=_LEAKY_RELU)
            upsample_size = route1.get_shape().as_list()
            inputs = upsample(inputs, out_shape=upsample_size,
                              data_format=self.data_format)
            inputs = tf.concat([inputs, route1], axis=axis)
            route, inputs = yolo_convolution_block(
                inputs, filters=128, training=training,
                data_format=self.data_format)
            detect3 = yolo_layer(inputs, n_classes=self.n_classes,
                                 anchors=_ANCHORS[0:3],
                                 img_size=self.model_size,
                                 data_format=self.data_format)

            inputs = tf.concat([detect1, detect2, detect3], axis=1)
            inputs = build_boxes(inputs)

            boxes_dicts = non_max_suppression(
                inputs, n_classes=self.n_classes,
                max_output_size=self.max_output_size,
                iou_threshold=self.iou_threshold,
                confidence_threshold=self.confidence_threshold)

            return boxes_dicts

In [129]:
inputs = tf.concat([inputs, route2], axis=3)

In [130]:
print(inputs.get_shape())
route, inputs = yolo_convolution_block(
                inputs, filters=256, training=False,
                data_format=None)

(10, 26, 26, 768)


In [131]:
print(route.get_shape())
print(inputs.get_shape())

(10, 26, 26, 256)
(10, 26, 26, 512)
