In [1]:
import numpy as np
import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras.layers import (
    Add,
    Concatenate,
    Conv2D,
    Input,
    Lambda,
    LeakyReLU,
    MaxPool2D,
    UpSampling2D,
    ZeroPadding2D,
)

from tensorflow.keras.regularizers import l2

In [2]:
yolo_max_boxes = 100 #maximum number of detections at one time
yolo_iou_threshold = 0.5 #how close two of the same class have to be in order to count it as one detection
yolo_score_threshold = 0.5 #minimum detected confidence of a class in order to count it as a detection


yolo_anchors = np.array([(10, 13), (16, 30), (33, 23), (30, 61), (62, 45),
                         (59, 119), (116, 90), (156, 198), (373, 326)],
                        np.float32) / 416
yolo_anchor_masks = np.array([[6, 7, 8], [3, 4, 5], [0, 1, 2]])
from yolov3_tf2.models import YoloV3, YoloV3Tiny
from yolov3_tf2.utils import load_darknet_weights

In [3]:
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """
    Make trainable=False freeze BN for real (the og version is sad)
    """

    def call(self, x, training=False):
        if training is None:
            training = tf.constant(False)
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training)

In [4]:
def DarknetConv(x, filters, size, strides=1, batch_norm=True):
    if strides == 1:
        padding = 'same'
    else:
        x = ZeroPadding2D(((1, 0), (1, 0)))(x)  # top left half-padding
        padding = 'valid'
    x = Conv2D(filters=filters, kernel_size=size,
               strides=strides, padding=padding,
               use_bias=not batch_norm, kernel_regularizer=l2(0.0005))(x)
    if batch_norm:
        x = BatchNormalization()(x)
        x = LeakyReLU(alpha=0.1)(x)
    return x


def DarknetResidual(x, filters):
    prev = x
    x = DarknetConv(x, filters // 2, 1)
    x = DarknetConv(x, filters, 3)
    x = Add()([prev, x])
    return x


def DarknetBlock(x, filters, blocks):
    x = DarknetConv(x, filters, 3, strides=2)
    for _ in range(blocks):
        x = DarknetResidual(x, filters)
    return x


def Darknet(name=None):
    x = inputs = Input([None, None, 3])
    x = DarknetConv(x, 32, 3)
    x = DarknetBlock(x, 64, 1)
    x = DarknetBlock(x, 128, 2)  # skip connection
    x = x_36 = DarknetBlock(x, 256, 8)  # skip connection
    x = x_61 = DarknetBlock(x, 512, 8)
    x = DarknetBlock(x, 1024, 4)
    return tf.keras.Model(inputs, (x_36, x_61, x), name=name)

def YoloConv(filters, name=None):
    def yolo_conv(x_in):
        if isinstance(x_in, tuple):
            inputs = Input(x_in[0].shape[1:]), Input(x_in[1].shape[1:])
            x, x_skip = inputs

            # concat with skip connection
            x = DarknetConv(x, filters, 1)
            x = UpSampling2D(2)(x)
            x = Concatenate()([x, x_skip])
        else:
            x = inputs = Input(x_in.shape[1:])

        x = DarknetConv(x, filters, 1)
        x = DarknetConv(x, filters * 2, 3)
        x = DarknetConv(x, filters, 1)
        x = DarknetConv(x, filters * 2, 3)
        x = DarknetConv(x, filters, 1)
        return Model(inputs, x, name=name)(x_in)
    return yolo_conv

def YoloOutput(filters, anchors, classes, name=None):
    def yolo_output(x_in):
        x = inputs = Input(x_in.shape[1:])
        x = DarknetConv(x, filters * 2, 3)
        x = DarknetConv(x, anchors * (classes + 5), 1, batch_norm=False)
        x = Lambda(lambda x: tf.reshape(x, (-1, tf.shape(x)[1], tf.shape(x)[2],
                                            anchors, classes + 5)))(x)
        return tf.keras.Model(inputs, x, name=name)(x_in)
    return yolo_output

def yolo_boxes(pred, anchors, classes):
    # pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))
    grid_size = tf.shape(pred)[1]
    box_xy, box_wh, objectness, class_probs = tf.split(
        pred, (2, 2, 1, classes), axis=-1)

    box_xy = tf.sigmoid(box_xy)
    objectness = tf.sigmoid(objectness)
    class_probs = tf.sigmoid(class_probs)
    pred_box = tf.concat((box_xy, box_wh), axis=-1)  # original xywh for loss

    # !!! grid[x][y] == (y, x)
    grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
    grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)  # [gx, gy, 1, 2]

    box_xy = (box_xy + tf.cast(grid, tf.float32)) / \
        tf.cast(grid_size, tf.float32)
    box_wh = tf.exp(box_wh) * anchors

    box_x1y1 = box_xy - box_wh / 2
    box_x2y2 = box_xy + box_wh / 2
    bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)

    return bbox, objectness, class_probs, pred_box


def yolo_nms(outputs, anchors, masks, classes):
    # boxes, conf, type
    b, c, t = [], [], []

    for o in outputs:
        b.append(tf.reshape(o[0], (tf.shape(o[0])[0], -1, tf.shape(o[0])[-1])))
        c.append(tf.reshape(o[1], (tf.shape(o[1])[0], -1, tf.shape(o[1])[-1])))
        t.append(tf.reshape(o[2], (tf.shape(o[2])[0], -1, tf.shape(o[2])[-1])))

    bbox = tf.concat(b, axis=1)
    confidence = tf.concat(c, axis=1)
    class_probs = tf.concat(t, axis=1)

    scores = confidence * class_probs
    boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
        boxes=tf.reshape(bbox, (tf.shape(bbox)[0], -1, 1, 4)),
        scores=tf.reshape(
            scores, (tf.shape(scores)[0], -1, tf.shape(scores)[-1])),
        max_output_size_per_class=yolo_max_boxes,
        max_total_size=yolo_max_boxes,
        iou_threshold=yolo_iou_threshold,
        score_threshold=yolo_score_threshold
    )

    return boxes, scores, classes, valid_detections


def YoloV3(size=None, channels=3, anchors=yolo_anchors,
           masks=yolo_anchor_masks, classes=80, training=False):
    #80 class labels due to mscoco dataset
    
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    x = inputs = Input([size, size, channels], name='input')

    x_36, x_61, x = Darknet(name='yolo_darknet')(x)

    x = YoloConv(512, name='yolo_conv_0')(x)
    output_0 = YoloOutput(512, len(masks[0]), classes, name='yolo_output_0')(x)

    x = YoloConv(256, name='yolo_conv_1')((x, x_61))
    output_1 = YoloOutput(256, len(masks[1]), classes, name='yolo_output_1')(x)

    x = YoloConv(128, name='yolo_conv_2')((x, x_36))
    output_2 = YoloOutput(128, len(masks[2]), classes, name='yolo_output_2')(x)

    if training:
        return Model(inputs, (output_0, output_1, output_2), name='yolov3')

    boxes_0 = Lambda(lambda x: yolo_boxes(x, anchors[masks[0]], classes),
                     name='yolo_boxes_0')(output_0)
    boxes_1 = Lambda(lambda x: yolo_boxes(x, anchors[masks[1]], classes),
                     name='yolo_boxes_1')(output_1)
    boxes_2 = Lambda(lambda x: yolo_boxes(x, anchors[masks[2]], classes),
                     name='yolo_boxes_2')(output_2)

    outputs = Lambda(lambda x: yolo_nms(x, anchors, masks, classes),
                     name='yolo_nms')((boxes_0[:3], boxes_1[:3], boxes_2[:3]))

    return Model(inputs, outputs, name='yolov3')

In [5]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')

if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

yolo = YoloV3()
#yolo.summary()
load_darknet_weights(yolo, 'yolov3.weights')
img = np.random.random((1, 320, 320, 3)).astype(np.float32)
output = yolo(img)
print('sanity check passed')
yolo.save_weights('yolov3.tf')
yolo.load_weights('yolov3.tf')

class_names = [c.strip() for c in open('coco.names').readlines()]

sanity check passed


In [6]:
import os
import cv2
import time
from matplotlib import pyplot as plt

In [7]:
def transform_images(x_train, size):
    x_train = tf.image.resize(x_train, (size, size))
    x_train = x_train / 255
    return x_train

In [8]:
from seaborn import color_palette
from PIL import Image, ImageDraw, ImageFont

def draw_outputs(img, outputs, class_names):
    colors = ((np.array(color_palette("hls", 80)) * 255)).astype(np.uint8)
    boxes, objectness, classes, nums = outputs
    boxes, objectness, classes, nums = boxes[0], objectness[0], classes[0], nums[0]
    wh = np.flip(img.shape[0:2])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(img)
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(font='futur.ttf',
                              size=(img.size[0] + img.size[1]) // 100)
    for i in range(nums):
        color = colors[int(classes[i])]
        x1y1 = ((np.array(boxes[i][0:2]) * wh).astype(np.int32))
        x2y2 = ((np.array(boxes[i][2:4]) * wh).astype(np.int32))
        thickness = (img.size[0] + img.size[1]) // 200
        x0, y0 = x1y1[0], x1y1[1]
        for t in np.linspace(0, 1, thickness):
            x1y1[0], x1y1[1] = x1y1[0] - t, x1y1[1] - t
            x2y2[0], x2y2[1] = x2y2[0] - t, x2y2[1] - t
            draw.rectangle([x1y1[0], x1y1[1], x2y2[0], x2y2[1]], outline=tuple(color))
        confidence = '{:.2f}%'.format(objectness[i]*100)
        text = '{} {}'.format(class_names[int(classes[i])], confidence)
        print(text)
        text_size = draw.textsize(text, font=font)
        draw.rectangle([x0, y0 - text_size[1], x0 + text_size[0], y0],
                        fill=tuple(color))
        draw.text((x0, y0 - text_size[1]), text, fill='black',
                              font=font)
    rgb_img = img.convert('RGB')
    img_np = np.asarray(rgb_img)
    img = cv2.cvtColor(img_np, cv2.COLOR_BGR2RGB)

    return img

In [14]:
yolo = YoloV3()
#yolo.summary()
load_darknet_weights(yolo, 'yolov3.weights')
#img = np.random.random((1, 320, 320, 3)).astype(np.float32)
#output = yolo(img)
#print('sanity check passed')
yolo.save_weights('yolov3.tf')
yolo.load_weights('yolov3.tf')


video_path = os.path.join(os.getcwd(),'video.mp4')
vid = cv2.VideoCapture(video_path)

fps = 0.0
counter = 0
empty_frame_counter = 0
output_file = True

if output_file:
    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*"XVID")
    out = cv2.VideoWriter('processed_video.mp4', codec, fps, (width, height))

while True:
    _,img = vid.read()
        
    #yolo = YoloV3()
    #yolo.summary()
    #load_darknet_weights(yolo, 'yolov3.weights')
    #img = np.random.random((1, 320, 320, 3)).astype(np.float32)
    #output = yolo(img)
    #print('sanity check passed')
    #yolo.save_weights('yolov3.tf')
    #yolo.load_weights('yolov3.tf')
        
    if img is None:
        time.sleep(0.1)
        empty_frame_counter+=1
        if empty_frame_counter < 3:
            continue
        else: 
            break

    img_in = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 
    img_in = tf.expand_dims(img_in, 0)
    img_in = transform_images(img_in, 416)
 
    print(img_in[0][0][0])

    t1 = time.time()
    boxes, scores, classes, nums = yolo.predict(img_in)
    fps  = ( fps + (1./(time.time()-t1)) ) / 2

    img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
    img = cv2.putText(img, "FPS: {:.2f}".format(fps), (0, 30),
                      cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
    
    counter += 1
    print('frame counter is',counter)
    print('fps is',fps)
    print('number of detected objects',classes.sum())
    print(' ')
    
    #if classes.sum() != 0:
    #    plt.imshow(img, interpolation='nearest')
    #    plt.show()
        #break
    
    #if counter > 10:
    #    break
    
    if output_file:
        out.write(img)
        
    if cv2.waitKey(1) == ord('q'):
            break

tf.Tensor([0.15007542 0.13936652 0.14615385], shape=(3,), dtype=float32)
person 99.52%
person 98.73%
person 98.10%
person 97.95%
person 94.08%
car 91.23%
person 86.86%
person 70.15%
frame counter is 1
fps is 15.185811366667147
number of detected objects 2.0
 
tf.Tensor([0.15399699 0.14615385 0.15007542], shape=(3,), dtype=float32)
frame counter is 2
fps is 11.11525172832967
number of detected objects 0.0
 
tf.Tensor([0.15686275 0.14901961 0.15294118], shape=(3,), dtype=float32)
frame counter is 3
fps is 8.357161845006022
number of detected objects 0.0
 
tf.Tensor([0.15791856 0.15007542 0.15399699], shape=(3,), dtype=float32)
frame counter is 4
fps is 8.251005402660848
number of detected objects 0.0
 
tf.Tensor([0.16395175 0.1550528  0.15897436], shape=(3,), dtype=float32)
frame counter is 5
fps is 10.370502150428582
number of detected objects 0.0
 
tf.Tensor([0.19426848 0.17360483 0.17360483], shape=(3,), dtype=float32)
frame counter is 6
fps is 9.956244084180014
number of detected obj

frame counter is 56
fps is 17.85129279841813
number of detected objects 0.0
 
tf.Tensor([0.28627452 0.26666668 0.27058825], shape=(3,), dtype=float32)
frame counter is 57
fps is 17.65076476431445
number of detected objects 0.0
 
tf.Tensor([0.2708899  0.25128207 0.26591253], shape=(3,), dtype=float32)
frame counter is 58
fps is 17.36844902709775
number of detected objects 0.0
 
tf.Tensor([0.2708899  0.25128207 0.26591253], shape=(3,), dtype=float32)
frame counter is 59
fps is 17.870011884773206
number of detected objects 0.0
 
tf.Tensor([0.2570136  0.23740573 0.25022626], shape=(3,), dtype=float32)
frame counter is 60
fps is 17.506320074425137
number of detected objects 0.0
 
tf.Tensor([0.25987935 0.24027151 0.25595778], shape=(3,), dtype=float32)
frame counter is 61
fps is 17.69738317876586
number of detected objects 0.0
 
tf.Tensor([0.22533938 0.20678735 0.21855205], shape=(3,), dtype=float32)
frame counter is 62
fps is 17.491573165347816
number of detected objects 0.0
 
tf.Tensor([0.

frame counter is 113
fps is 17.448952457919397
number of detected objects 0.0
 
tf.Tensor([0.11764707 0.09803922 0.10980393], shape=(3,), dtype=float32)
frame counter is 114
fps is 17.461007864826456
number of detected objects 0.0
 
tf.Tensor([0.11764707 0.09803922 0.10980393], shape=(3,), dtype=float32)
frame counter is 115
fps is 17.611660897708504
number of detected objects 0.0
 
tf.Tensor([0.11870287 0.09909502 0.12262443], shape=(3,), dtype=float32)
frame counter is 116
fps is 17.10349603404061
number of detected objects 0.0
 
tf.Tensor([0.12156864 0.10196079 0.1254902 ], shape=(3,), dtype=float32)
frame counter is 117
fps is 17.050224318647743
number of detected objects 0.0
 
tf.Tensor([0.12156864 0.10196079 0.11764707], shape=(3,), dtype=float32)
frame counter is 118
fps is 17.60267052937746
number of detected objects 0.0
 
tf.Tensor([0.13046758 0.09909502 0.09517346], shape=(3,), dtype=float32)
frame counter is 119
fps is 17.387247528511374
number of detected objects 0.0
 
tf.T

frame counter is 167
fps is 17.179610790807956
number of detected objects 0.0
 
tf.Tensor([0.1458522  0.09984918 0.11447965], shape=(3,), dtype=float32)
frame counter is 168
fps is 17.013827986376427
number of detected objects 0.0
 
tf.Tensor([0.1162896  0.07028658 0.08491705], shape=(3,), dtype=float32)
frame counter is 169
fps is 17.056528330386328
number of detected objects 0.0
 
tf.Tensor([0.11553545 0.06953244 0.0841629 ], shape=(3,), dtype=float32)
frame counter is 170
fps is 17.09677665339546
number of detected objects 0.0
 
tf.Tensor([0.11734541 0.07134239 0.08597285], shape=(3,), dtype=float32)
frame counter is 171
fps is 17.806315707781494
number of detected objects 0.0
 
tf.Tensor([0.10769232 0.06455506 0.07631976], shape=(3,), dtype=float32)
frame counter is 172
fps is 18.119949921959098
number of detected objects 0.0
 
tf.Tensor([0.10769232 0.06455506 0.07631976], shape=(3,), dtype=float32)
frame counter is 173
fps is 17.971019691436794
number of detected objects 0.0
 
tf.

frame counter is 221
fps is 17.826534892158907
number of detected objects 0.0
 
tf.Tensor([0.21870288 0.12458523 0.14811464], shape=(3,), dtype=float32)
frame counter is 222
fps is 18.285343632810235
number of detected objects 0.0
 
tf.Tensor([0.54796386 0.43031675 0.45384616], shape=(3,), dtype=float32)
frame counter is 223
fps is 18.47296794396776
number of detected objects 0.0
 
tf.Tensor([0.5411765  0.42352945 0.43529415], shape=(3,), dtype=float32)
frame counter is 224
fps is 18.138223866461125
number of detected objects 0.0
 
tf.Tensor([0.19638011 0.10120664 0.0933635 ], shape=(3,), dtype=float32)
frame counter is 225
fps is 17.70946526799218
number of detected objects 0.0
 
tf.Tensor([0.14087482 0.07526396 0.0546003 ], shape=(3,), dtype=float32)
frame counter is 226
fps is 17.590936722259933
number of detected objects 0.0
 
tf.Tensor([0.18838613 0.09140272 0.06003017], shape=(3,), dtype=float32)
frame counter is 227
fps is 17.860509749532568
number of detected objects 0.0
 
tf.T

frame counter is 275
fps is 18.292174181641485
number of detected objects 0.0
 
tf.Tensor([0.60603327 0.6245852  0.67556566], shape=(3,), dtype=float32)
frame counter is 276
fps is 17.972372954214723
number of detected objects 0.0
 
tf.Tensor([0.46862748 0.48536956 0.55098045], shape=(3,), dtype=float32)
frame counter is 277
fps is 18.330284571887166
number of detected objects 0.0
 
tf.Tensor([0.5063349  0.52594274 0.5808447 ], shape=(3,), dtype=float32)
frame counter is 278
fps is 18.381934354011932
number of detected objects 0.0
 
tf.Tensor([0.4844646 0.507994  0.5432881], shape=(3,), dtype=float32)
frame counter is 279
fps is 18.091082611242403
number of detected objects 0.0
 
tf.Tensor([0.67797893 0.6819005  0.7054299 ], shape=(3,), dtype=float32)
frame counter is 280
fps is 18.06558870367453
number of detected objects 0.0
 
tf.Tensor([0.64856714 0.6524887  0.6760181 ], shape=(3,), dtype=float32)
frame counter is 281
fps is 18.029817012223205
number of detected objects 0.0
 
tf.Ten

In [15]:
out.release()
cv2.destroyAllWindows()

In [16]:
boxes, scores, classes, nums = yolo.predict(img_in)
print(classes.sum())

0.0


In [17]:
classes

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

In [18]:
output

(<tf.Tensor: shape=(1, 100, 4), dtype=float32, numpy=
 array([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
 