# Training a network for object detection

This puts everything together.
Please note: I'm not training from scratch.
YOLO is pretrained on imagenet, I simply don't have the resources for that.

So I've used this code to export tiny YOLO in Keras format:
https://github.com/allanzelener/YAD2K

From the exported model, I'm cutting off the "regression head".
Only the convolutions up to the last maxpooling are taken, everything after that is new and trained with my setup.

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

import cv2

import sys
# NOTE(review): hard-coded, machine-specific path to a local keras checkout;
# adjust or remove this line when running on another machine
sys.path.insert(0,"/home/lars/libraries/keras/")
import keras
# only the first character of the version string is compared here
assert keras.__version__[0] == "2", "we work on version 2 of keras"

In [None]:
from keras.layers import Input
from keras.layers import BatchNormalization, SpatialDropout2D
from keras.layers.pooling import MaxPool2D
from keras.layers.convolutional import Conv2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model, load_model

import keras.backend as K

from keras.callbacks import Callback, ModelCheckpoint

### allow dynamic memory allocation

In [None]:
# session configuration with GPU memory growth enabled (dynamic allocation)
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

# keras is told to use this session; `sess` is used again further down
# to evaluate the non-max suppression tensor
sess = tf.Session(config=config)
K.set_session(sess)

### Load the pretrained model

In [None]:
# load the tiny yolo (VOC) model that was exported to keras format
extraction_model = load_model("models/tiny_yolo_voc.h5")

In [None]:
# print an overview of the layers of the pretrained model
extraction_model.summary()

In [None]:
# freeze the pretrained network: it only serves as a fixed feature extractor,
# so none of its layers is marked as trainable
for pretrained_layer in extraction_model.layers:
    pretrained_layer.trainable = False

### Extracting features from the pretrained model
Now I'm taking features from the pretrained model and concatenating them to one big feature map.
This code is meant as a template to take features from any extraction model and merge them.
As you can see, I'm only using the features after the last pooling.
But you could take this code and combine it with vgg16 or something else, then you might want to take intermediate feature maps, too.

The big yolo network has skip connections and does use something like this.

In [None]:
from keras.layers.merge import Concatenate


def _spatial_dims(tensor):
    """Return axes 1 and 2 of the tensor's static shape as a list of ints."""
    return [int(dim) for dim in tensor.shape[1:3]]


# outputs of three pooling layers of the pretrained network
block3 = extraction_model.get_layer(name="max_pooling2d_4").output
block4 = extraction_model.get_layer(name="max_pooling2d_5").output
block5 = extraction_model.get_layer(name="max_pooling2d_6").output

# bring the feature maps to a common resolution
# only block3 gets an additional pooling step, the other two are used as they are
block3_resized = MaxPool2D((2,2), name="e_maxpool1")(block3)
block4_resized, block5_resized = block4, block5

shape3 = _spatial_dims(block3_resized)
shape4 = _spatial_dims(block4_resized)
shape5 = _spatial_dims(block5_resized)

# the maps can only be concatenated along the channel axis if their resolution matches
assert shape3 == shape4, "resolution must be identical"
assert shape4 == shape5, "resolution must be identical"

# template for merging several feature maps (currently disabled):
#extracted_features = Concatenate(axis=-1)([
#    block3_resized,
#    block4_resized,
#    block5_resized])

# at the moment only the features after the last pooling are used
extracted_features = block5_resized
extracted_features.shape.as_list()

### A new model
This recreates the layout of tiny yolo.
But the layers are not trained yet. This way I can check if the setup really works.

In [None]:
B = 5   # number of anchor boxes (the output layer has B * (C + 5) channels)
C = 20  # number of classes

In [None]:
def _head_block(tensor, index, dropout_rate):
    """Append conv -> batch norm -> leaky ReLU -> spatial dropout to `tensor`.

    The layers are named head_conv<index>, head_bnorm<index> and
    head_lrelu<index>; the dropout layer keeps its automatically assigned name.
    """
    tensor = Conv2D(1024, 3,
                    padding="same",
                    use_bias=False,
                    kernel_regularizer=keras.regularizers.l2(0.0005),
                    name="head_conv%d" % index)(tensor)
    tensor = BatchNormalization(name="head_bnorm%d" % index)(tensor)
    tensor = LeakyReLU(0.1, name="head_lrelu%d" % index)(tensor)
    return SpatialDropout2D(dropout_rate)(tensor)


# the new, untrained head is stacked on top of the extracted features
conv = _head_block(extracted_features, 1, 0.3)   # block 1
conv = _head_block(conv, 2, 0.15)                # block 2

# output: 1x1 convolution with B * (C + 5) channels, this time with a bias
conv = Conv2D(B * (C + 5), 1,
              padding="same",
              use_bias=True,
              kernel_regularizer=keras.regularizers.l2(0.0005),
              name="head_conv3")(conv)

# the full model maps the input of the pretrained network to the new head
detection_model = Model(inputs=extraction_model.input, outputs=conv)

### parameters of the model

These are training parameters.
The input and output resolution are important for setting up the boxes as loss for training.
The lambdas are factors to weigh the different loss components against each other.

In [None]:
input_tensor = detection_model.input
output_tensor = detection_model.output

# resolution of the network input and output, read from axes 1 and 2 of the tensors
in_x, in_y = (int(dim) for dim in input_tensor.shape[1:3])
out_x, out_y = (int(dim) for dim in output_tensor.shape[1:3])

# factors that weigh the loss components against each other
lambda_coords = 10
lambda_class = 2
lambda_obj = 5
lambda_noobj = 0.5

### Set up the training data
Follow the guide on the darknet site to set up VOC:
https://pjreddie.com/darknet/yolo/

In [None]:
# settings for the augmentations
# NOTE(review): this rebinds the name `config` that held the TensorFlow session
# config earlier, and it is not passed to Augmenter in the visible cells - confirm
# how the generator is supposed to receive it
config = {
    "max_hsv_scale": [0.1, 0.5, 0.5],
    "max_rotation": 10,
    "max_shift": 0.05,
    "zoom_range": (0.8,1.2),
}

In [None]:
# text files handed to the training and validation generators
# NOTE(review): absolute, machine-specific paths - adjust to the local VOC setup
train_path= "/home/lars/data/darknet/VOC/train.txt"
test_path = "/home/lars/data/darknet/VOC/2007_test.txt"

In [None]:
# iterator class to provide data to model.fit_generator
from generator import Augmenter

In [None]:
batch_size = 64

# both generators share the same network geometry and box / class counts
generator_args = (in_x, in_y, out_x, out_y, B, C)

# one generator for the training data, one for the test data
train_gen = Augmenter(train_path, *generator_args, batch_size=batch_size)
val_gen = Augmenter(test_path, *generator_args, batch_size=batch_size)

In [None]:
# draw one batch from the validation generator as a quick sanity check
batch = next(val_gen)
imgs, objects = batch[0], batch[1]

# show the first image of the batch
plt.imshow(imgs[0])

# Loss function

The loss function makes use of currying. Therefore this code is a little complicated.
Keras expects a loss in this format loss(y_true, y_pred).

But the loss_func in loss.py needs to know additional parameters such as the network size.
I'm feeding that data by currying the loss_func and providing the additional parameters now.
The result is a function with two remaining parameters and a signature as expected by keras.

This currying can go very wrong, if you mix up the order of the parameters.
If the loss function is called, it prints the parameters it has been given.
Be sure to check this.
Look at model.compile.

In [None]:
# anchor boxes from the tiny yolo voc config, one row per box
# writing into a (B, 2) array makes sure the list really has B entries
anchors = np.zeros((B, 2))
anchors[:] = [
    [1.08, 1.19],
    [3.42, 4.41],
    [6.63, 11.38],
    [9.42, 5.11],
    [16.62, 10.52],
]

# the config lists the anchors as width, height
# this doesn't fit numpy's layout, so the two columns are swapped
anchors = anchors[:, ::-1].copy()

In [None]:
from loss_function import loss_func


# additional parameters for the loss, passed positionally to loss_func
# NOTE(review): the order has to match the signature of loss_func in loss_function.py,
# which is not visible here - verify it against the parameters the loss prints
meta_data = [anchors, out_x, out_y, B, C, lambda_class, lambda_coords, lambda_obj, lambda_noobj]
# the curried result is what gets handed to model.compile as the loss
loss = loss_func(*meta_data)

# Training the model
Compile with the custom loss, set up a few callbacks and train.

In [None]:
from keras.optimizers import Adam, SGD

# check this: are the parameters correct ?
# the optimizer is Adam with a learning rate of 1e-4, the loss is the curried custom loss
detection_model.compile(optimizer=Adam(lr=1e-4), loss=loss)

# alternative setup with SGD, kept for reference:
#detection_model.compile(SGD(lr=1e-4, momentum=0.9, decay = 1e-7), loss)

In [None]:
# taken from the keras source
# if the learning rate is too high, the loss can become NaN or inf; stop the training in this case
class TerminateOnNaN(Callback):
    """Callback that stops the training once a batch reports a NaN or infinite loss."""

    def __init__(self):
        # run the base class initialiser so the callback is fully set up
        # before keras attaches the model to it
        super(TerminateOnNaN, self).__init__()
        # number of batches this callback has seen so far
        self.seen = 0

    def on_batch_end(self, batch, logs=None):
        """Check the loss of the finished batch.

        batch: index of the batch within the current epoch
        logs:  dictionary with the metrics of the batch; may be None or lack a 'loss' entry
        """
        self.seen += 1

        logs = logs or {}
        loss = logs.get('loss')

        # a missing loss is ignored; NaN, +inf and -inf all count as invalid
        if loss is not None and not np.isfinite(loss):
            print('Batch %d: Invalid loss, terminating training' % (batch))
            print("logs: ", logs)

            # signal the training loop to stop
            self.model.stop_training = True

In [None]:
nan_terminator = TerminateOnNaN()

# train in small steps and append histories
# if training is interrupted, the history array still contains usable data
import time
histories = []
times = []
for i in range(20):

    history = detection_model.fit_generator(train_gen, 6400//batch_size,
                                            epochs=10,
                                            callbacks=[nan_terminator],
                                            validation_data=val_gen,
                                            validation_steps=1600//batch_size,
                                            workers=4,
                                            max_queue_size=24)
    histories.append(history)
    # time stamp at the end of each round
    times.append(time.time())

    # fit_generator resets the stop flag when it starts, so without this check
    # the next round would simply continue training after the NaN terminator
    # has fired (i.e. on a model whose loss is already invalid)
    if getattr(detection_model, "stop_training", False):
        print("training was stopped by a callback, skipping the remaining rounds")
        break

### Plot the train / val loss
As you can see, the model reaches about 1000 for validation loss.
Then it overfits.

This number can't be interpreted correctly. It depends on the size of the network and the batch.
A solution would be to take the mean in the loss instead of summing all components.
But that would mess with the learning rate.

I'm evaluating the pretrained model against the validation generator.
Surprisingly, the new model reaches better scores.
A possible explanation: The original yolo doesn't use rotations as augmentations. The validation generator uses rotations.
Or the number of samples from the validation set was simply too small.

In [None]:
# flatten the per-round histories into one continuous curve per metric
losses = [value for run in histories for value in run.history["loss"]]
val_losses = [value for run in histories for value in run.history["val_loss"]]

In [None]:

# training and validation loss over all epochs in one figure
for curve, curve_name in ((losses, "train"), (val_losses, "val")):
    plt.plot(curve, label=curve_name)
plt.legend()
plt.title("training loss")
plt.show()

In [None]:
# compile the pretrained model with the same loss
# so that it can be evaluated against the validation generator for comparison
extraction_model.compile(Adam(lr=0.0001), loss)

In [None]:
# number of training rounds for which a history was recorded
len(histories)

In [None]:
# evaluate the pretrained model on batches drawn from the validation generator
eval_steps = 12800 // batch_size
extraction_model.evaluate_generator(
    val_gen,
    eval_steps,
    max_queue_size=20,
    workers=4,
    use_multiprocessing=False,
)

# Evaluation

### Two important helper functions to work with the data
With `get_probabilities` you can extract the predicted classes, the objectness and the combined probability from the output of the network.

`extract_from_blob` helps with the blob of data fed to the Keras loss.
This blob is hard to read, so the function slices the individual parts out of it and converts them to a dictionary

In [None]:
from prediction_utils import extract_from_blob, get_probabilities

In [None]:
# the validation generator doubles as the source of test samples
test_gen = val_gen
# get some sample data
# batch[0] holds the images; batch[1] is passed to extract_from_blob further down
batch = next(test_gen)
# work on a copy of the images
img = batch[0].copy()

# show the first image of the batch
plt.imshow(img[0])
plt.show()

In [None]:
# feed the images of the batch to the model
preds = detection_model.predict(batch[0])
# display the shape of the raw network output
preds.shape

### Comparing given objectness with confidence of the network

In [None]:
# slice the target blob of the batch (batch[1]) into its named parts
loss_dict = extract_from_blob(batch[1], out_x, out_y, B, C)

# read the given objectness out of the loss dictionary
# and reshape it to (-1, out_x, out_y, B), i.e. one value per cell and box
f_objectness = loss_dict["f_objectness"].reshape((-1,out_x, out_y, B))

In [None]:
# split the predictions into classes, objectness and the combined probability
classes, objectness, probs = get_probabilities(preds)

# probs is along the B dimension
# reducing over the last axis leaves the best bounding box score of every cell
max_probs = probs.max(axis=-1)

# cells whose best score exceeds the threshold
threshold = 0.3
thresholded = max_probs > threshold

# three panels side by side: target, network confidence, thresholded confidence
f, axes = plt.subplots(1, 3, figsize=(10,10))
panels = [
    (f_objectness[0,:,:,0], "given objectness"),
    (max_probs, "confidence"),
    (thresholded, "thresholded"),
]
for axis, (image, title) in zip(axes, panels):
    axis.imshow(image)
    axis.set_title(title)
plt.show()

### Getting the predicted bounding boxes

In [None]:
def np_sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)) for numpy scalars and arrays."""
    # defined here because no other cell of this notebook defines or imports it
    return 1.0 / (1.0 + np.exp(-x))


# which coordinates are bigger than the threshold ?
xy = np.where(thresholded)

# reshape the predictions to expose the anchor boxes along the 4th axis
# the last axis then holds 4 box coordinates, 1 objectness value and C class scores
preds = preds.reshape((-1, out_x, out_y, B, 4 + 1 + C))

# array to store the detections
# every entry is (label, min_x, max_x, min_y, max_y, score)
detections = []

# look at all the coordinates found by the thresholding
for row, col in zip(xy[0], xy[1]):

    # for this coordinate, find the box with the highest objectness
    current_probs = objectness[row, col]
    box_idx = np.argmax(current_probs)
    # raw network output of that box, taken from the first image of the batch
    box = preds[0, row, col, box_idx]

    # get the predicted coordinates, convert them to percent
    # this is the same code as in the generator and the loss function
    # the network learns to predict coordinates encoded in this way:
    # the centre is a sigmoid offset inside the cell,
    # the size is an exponential factor applied to the anchor box
    p_x = (row + np_sigmoid(box[0])) / out_x
    p_y = (col + np_sigmoid(box[1])) / out_y
    p_dx = (np.exp(box[2]) * anchors[box_idx, 0]) / out_x
    p_dy = (np.exp(box[3]) * anchors[box_idx, 1]) / out_y

    # resize the predicted coordinates to the input resolution
    min_x = int((p_x - p_dx/2.) * in_x)
    max_x = int((p_x + p_dx/2.) * in_x)
    min_y = int((p_y - p_dy/2.) * in_y)
    max_y = int((p_y + p_dy/2.) * in_y)

    # clip them to the image size
    min_x = max(min_x, 0)
    max_x = min(max_x, in_x)
    min_y = max(min_y, 0)
    max_y = min(max_y, in_y)

    # get the highest class prediction
    current_classes = classes[row, col, box_idx]
    label = np.argmax(current_classes)

    # the score stored with the detection is the highest objectness of the cell
    detections.append((label, min_x, max_x, min_y, max_y, current_probs.max()))

In [None]:
# number of cells that passed the threshold, before non-max suppression
print("number of detections: ", len(detections))

## Non-Max Suppression
Sometimes yolo will predict the same object in more than one cell.
This happens mostly for very big objects where the center of the object is not clear.

We need non-max suppression to remove overlapping bounding boxes.

We apply the non-max suppression to each label separately.

In [None]:
# class names, taken from the yolo repository
# the position in the list is the class index
names = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

In [None]:
# group the detections by label
# the dictionary maps every label to a list of (box, score) pairs,
# where box is (min_x, min_y, max_x, max_y)
label_dict = {}

for label, min_x, max_x, min_y, max_y, score in detections:
    entry = ((min_x, min_y, max_x, max_y), score)
    label_dict.setdefault(label, []).append(entry)

In [None]:
# create a new dictionary. Again, it maps labels to detections
# but the detections are now filtered with non-max suppression
nms = {}

for label, entries in label_dict.items():
    boxes = [box for (box, _) in entries]
    scores = [score for (_, score) in entries]

    # tensorflow has a built-in algorithm for non-max suppression
    # it returns a tensor with the indices of the retained boxes;
    # at most 5 boxes per label are kept
    kept = tf.image.non_max_suppression(boxes, scores, 5, iou_threshold=0.2)

    # the tensor has to be evaluated in a session
    # the session created at the very top of this notebook is used for that
    kept = sess.run(kept)

    # only the boxes we keep are stored, the scores are dropped
    nms[label] = [boxes[i] for i in kept]

In [None]:
# convert the mapping in nms
# instead of label -> list of boxes
# we now have
# name -> list of boxes
# keys that are already names are kept as they are, so this cell can be
# executed more than once without failing on the string keys
nms = {(key if isinstance(key, str) else names[key]): value
       for key, value in nms.items()}

In [None]:
# show the mapping name -> list of boxes that survived the suppression
print("we found the following boxes after non-max suppression")
print(nms)

### Plotting the output
I'm using opencv to draw rectangles around all detections
and to write the name in text onto the image.

The image has a very low resolution.
For the output it is upscaled.
The main reason for this is to allow high-res text.

In [None]:
# work on an upscaled copy of the first image of the batch
img = batch[0][0]
output_img = img.copy()
dim_x, dim_y = output_img.shape[:2]
factor = 5
# opencv expects the target size with the second image axis first
output_img = cv2.resize(output_img, (dim_y*factor, dim_x*factor))

for label, boxes in nms.items():
    for box in boxes:
        # scale the box to the resolution of the enlarged image
        min_x, min_y, max_x, max_y = (coord * factor for coord in box)

        # opencv points take the y value of the box first, then the x value
        corner_a = (min_y, min_x)
        corner_b = (max_y, max_x)

        # frame around the detection, name of the class at its first corner
        cv2.rectangle(output_img, corner_a, corner_b, (0,1,0), 10)
        cv2.putText(output_img, label, corner_a, cv2.FONT_HERSHEY_SIMPLEX, fontScale=3.5, color=(0, 0, 1), thickness=12)

In [None]:
# display the upscaled image with the drawn boxes and labels
plt.imshow(output_img)
plt.show()