# Training a network for object detection

This puts everything together.
Please note: I'm not training from scratch.
YOLO is pretrained on ImageNet; I simply don't have the resources to do that myself.

So I've used this code to export tiny YOLO in Keras format:
https://github.com/allanzelener/YAD2K

From the exported model, I'm cutting off the "regression head".
Only the convolutions up to the last maxpooling are taken, everything after that is new and trained with my setup.

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

import cv2

import sys
# Put a local Keras checkout at the front of the module search path.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
sys.path.insert(0,"/home/lars/libraries/keras/")
import keras
# Fail early if the imported Keras is not a 2.x release
# (only the first character of the version string is checked).
assert keras.__version__[0] == "2", "we work on version 2 of keras"

In [None]:
from keras.layers import Input
from keras.layers import BatchNormalization, SpatialDropout2D
from keras.layers.pooling import MaxPool2D
from keras.layers.convolutional import Conv2D
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model, load_model

import keras.backend as K

from keras.callbacks import Callback, ModelCheckpoint

### allow dynamic memory allocation

In [None]:
# Build the TensorFlow session Keras will run on.
# NOTE: allow_growth is currently switched off, i.e. on-demand GPU memory
# growth is disabled; set it to True to enable it.
config = tf.ConfigProto()
gpu_options = config.gpu_options
gpu_options.allow_growth = False

# Register the session as the Keras backend session.
sess = tf.Session(config=config)
K.set_session(sess)

### Load the pretrained model

In [None]:
# Load the exported Keras model from disk; it is the starting point for the detector.
extraction_model = load_model("models/yolo_1088_320_10_summarized.h5")

In [None]:
# Print the layer overview of the loaded model.
extraction_model.summary()

In [None]:
# B: number of anchor boxes, C: number of classes
B, C = 10, 5

In [None]:
# Freeze the pretrained part of the network: every layer except the last 21
# is used purely for feature extraction and must not receive weight updates.
frozen_layers = extraction_model.layers[:-21]
for frozen in frozen_layers:
    print(frozen.name)
    frozen.trainable = False

In [None]:
# The detector is the same model object as the loaded one (an alias, not a copy).
detection_model = extraction_model

### parameters of the model

These are training parameters.
The input and output resolution are important for setting up the boxes as loss for training.
The lambdas are factors to weigh the different loss components against each other.

In [None]:
# Spatial resolution of the network input and output, read from the model
# tensors (axis 0 is skipped, axes 1 and 2 are used as x and y).
input_tensor = detection_model.input
output_tensor = detection_model.output

in_x, in_y = int(input_tensor.shape[1]), int(input_tensor.shape[2])
out_x, out_y = int(output_tensor.shape[1]), int(output_tensor.shape[2])

# Weights of the individual loss components.
lambda_coords = 10    # box coordinates
lambda_class = 2      # class prediction
lambda_obj = 5        # objectness where an object is present
lambda_noobj = 0.5    # objectness where no object is present

### Set up the training data
Follow the guide on the darknet site to set up VOC:
https://pjreddie.com/darknet/yolo/

In [None]:
# Augmentation settings, collected in a single dictionary.
config = {
    "max_hsv_scale": [0, 0, 0],
    "max_rotation": 0,
    "max_shift": 0,
    "zoom_range": (0.9, 1.1),
}

In [None]:
# Locations of the training and validation split files handed to the data generators.
train_path, test_path = (
    "/home/lars/programming/kitti_labeler/out_split/train.txt",
    "/home/lars/programming/kitti_labeler/out_split/val.txt",
)

In [None]:
# iterator class to provide data to model.fit_generator
from generator import Augmenter

In [None]:
batch_size = 4

# Both generators share the same network geometry and box/class counts;
# they differ only in the split file they read from.
generator_args = (in_x, in_y, out_x, out_y, B, C)

train_gen = Augmenter(train_path, *generator_args, batch_size=batch_size)
val_gen = Augmenter(test_path, *generator_args, batch_size=batch_size)

In [None]:
# Smoke-test the generator: draw one batch and show its first image.
batch = next(train_gen)
imgs, objects = batch[0], batch[1]

plt.imshow(imgs[0])

# Loss function

The loss function makes use of currying. Therefore this code is a little complicated.
Keras expects a loss with this signature: loss(y_true, y_pred).

But the loss_func in loss.py needs to know additional parameters such as the network size.
I'm feeding that data by currying the loss_func and providing the additional parameters now.
The result is a function with two remaining parameters and a signature as expected by keras.

This currying can go very wrong if you mix up the order of the parameters.
If the loss function is called, it prints the parameters it has been given.
Be sure to check this.
Look at model.compile.

In [None]:
# Anchor boxes from the tiny yolo voc config, one (first, second) pair per row.
anchors = np.array(
    [
        0.18, 0.44,
        0.23, 1.35,
        0.33, 3.58,
        0.43, 0.56,
        0.49, 1.87,
        0.66, 5.55,
        0.99, 2.83,
        1.01, 0.83,
        1.61, 6.34,
        3.00, 2.70,
    ],
    dtype=np.float64,
).reshape((B, 2))

In [None]:
# The anchors are stored as (width, height), which does not match numpy's
# axis order, so the two columns are exchanged in place.
# The fancy-indexed right-hand side is a copy, so the swap is safe.
anchors[:, [0, 1]] = anchors[:, [1, 0]]

In [None]:
# Show the first anchor to check the result of the column swap.
anchors[0]

In [None]:
from loss_function import loss_func


# Extra parameters for the loss, passed POSITIONALLY in exactly this order.
# NOTE(review): the order must match the signature of loss_func in
# loss_function.py - verify there before changing anything here.
meta_data = [anchors, out_x, out_y, B, C, lambda_class, lambda_coords, lambda_obj, lambda_noobj]
# Curry the parameters; `loss` is what gets handed to model.compile.
loss = loss_func(*meta_data)

# Training the model
Compile with the custom loss, set up a few callbacks and train.

In [None]:
from keras.optimizers import Adam, SGD

# check this: are the parameters correct ?
# Alternative optimizer, kept for reference:
#detection_model.compile(Adam(lr=1e-7), loss)

# Compile with the curried custom loss and plain SGD (learning rate 1e-7,
# momentum 0.1, decay 1e-7).
detection_model.compile(SGD(lr=1e-7, momentum=0.1, decay = 1e-7), loss)

In [None]:
# taken from the keras source
# if the learning rate is too fast, NaNs can occur, stop the training in this case
class TerminateOnNaN(Callback):
    """Stop training as soon as a batch reports a NaN or infinite loss.

    After every batch the 'loss' entry of the batch logs is inspected. If it
    is NaN or +/-inf, a message and the full logs are printed and
    ``self.model.stop_training`` is set to True.

    Attributes:
        seen: number of batches this callback has been called for.
    """

    def __init__(self):
        # Initialise the Callback base class first so that the attributes it
        # sets up exist on this instance as well.
        super(TerminateOnNaN, self).__init__()
        self.seen = 0

    def on_batch_end(self, batch, logs=None):
        """Check the loss of the batch that just finished.

        Args:
            batch: index of the batch, used only for the printed message.
            logs: dict of batch metrics; may be None or lack a 'loss' entry,
                in which case nothing is checked.
        """
        self.seen += 1

        loss = (logs or {}).get('loss')
        if loss is None:
            return

        if np.isnan(loss) or np.isinf(loss):
            print('Batch %d: Invalid loss, terminating training' % (batch))
            print("logs: ", logs)

            self.model.stop_training = True

In [None]:
nan_terminator = TerminateOnNaN()

# train in small steps and append histories
# if training is interrupted, the history array still contains usable data
import time
histories = []
times = []
for _ in range(20):
    history = detection_model.fit_generator(
        train_gen, 6400 // batch_size,
        epochs=5,
        callbacks=[nan_terminator],
        validation_data=val_gen,
        validation_steps=1600 // batch_size,
        workers=4,
        max_queue_size=24)
    histories.append(history)
    times.append(time.time())

    # The NaN callback only ends the current fit_generator call. Without this
    # check the next round would start a fresh run and keep training a model
    # whose loss has already diverged.
    if getattr(detection_model, "stop_training", False):
        print("training was stopped by a callback, skipping the remaining rounds")
        break

### Plot the train / val loss
As you can see, the model reaches about 1000 for validation loss.
Then it overfits.

This number can't be interpreted correctly. It depends on the size of the network and the batch.
A solution would be to take the mean in the loss instead of summing all components.
But that would mess with the learning rate.

I'm evaluating the pretrained model against the validation generator.
Surprisingly, the new model reaches better scores.
A possible explanation: the original YOLO doesn't use rotations as augmentations, whereas the validation generator does.
Or the number of samples from the validation set was simply too small.

In [None]:
# Flatten the per-round histories into one continuous curve per metric.
losses = [value for record in histories for value in record.history["loss"]]
val_losses = [value for record in histories for value in record.history["val_loss"]]

In [None]:

# Training and validation loss over all recorded epochs in one figure.
for curve, curve_label in ((losses, "train"), (val_losses, "val")):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.title("loss")
plt.show()

# Evaluation

### Two important helper functions to work with the data
With get_probabilities you can extract the predicted classes, the objectness and the combined probability from the output of the network.

extract_from_blob helps with the blob of data fed to the Keras loss.
This blob is hard to read, so the function slices the individual parts out of it and converts them to a dictionary.

In [None]:
#del extract_from_blob
#del get_probabilities
from utils.prediction import extract_from_blob, get_probabilities

In [None]:
# The evaluation below reuses the validation generator (same object, not a copy).
test_gen = val_gen
# get some sample data
batch = next(test_gen)
# Work on a copy of the image batch so the generator output stays untouched.
img = batch[0].copy()

# Show the first image of the batch.
plt.imshow(img[0])
plt.show()

In [None]:
# feed the data to the model
# batch[0] holds the images of the sampled batch
predictions = detection_model.predict(batch[0])
# Display the shape of the raw network output.
predictions.shape

### Comparing given objectness with confidence of the network

In [None]:
# extract the given objectness for this image
# batch[1] is the target blob produced by the generator
loss_dict = extract_from_blob(batch[1], out_x, out_y, B, C)

# read the given objectness out of the loss dictionary
# and reshape it to (batch, out_x, out_y, B)
f_objectness = loss_dict["f_objectness"].reshape((-1,out_x, out_y, B))

In [None]:
# Decode the raw output of the first image in the batch.
classes, objectness, probs = get_probabilities(predictions[0], out_x, out_y, B, C)

# probs is along the B dimension:
# keep the best bounding box score for every cell of the output activation map
max_probs = probs.max(axis=-1)

# cells whose best score exceeds the threshold
threshold = 0.3
thresholded = max_probs > threshold

f, axes = plt.subplots(1, 3, figsize=(10,10))

# One panel per map, shown side by side.
panels = [
    ("given objectness", f_objectness[0, :, :, 0]),
    ("confidence", max_probs),
    ("thresholded", thresholded),
]
for axis, (title, image) in zip(axes, panels):
    axis.imshow(image)
    axis.set_title(title)
plt.show()

In [None]:
# Highest per-cell confidence in this image; useful for judging `threshold`.
max_probs.max()

### Getting the predicted bounding boxes

In [None]:
from utils.activations import np_sigmoid, softmax

from nms import get_detections, apply_nms, idx_to_name

In [None]:
# Turn the raw output of the first image into detections, using the same
# threshold as in the confidence plot.
# NOTE(review): the structure of the returned detections is defined in nms.py - confirm there.
detections = get_detections(predictions[0], threshold, anchors, out_x, out_y, in_x, in_y, B, C)

In [None]:
# Report how many detections were found before non-max suppression.
print("number of detections: ", len(detections))

## Non-Max Suppression
Sometimes yolo will predict the same object in more than one cell.
This happens mostly for very big objects where the center of the object is not clear.

We need non-max suppression to remove overlapping bounding boxes.

We apply the non-max suppression to each label separately.

In [None]:
# Human-readable label names, one entry per class.
# NOTE(review): presumably indexed by class id (see idx_to_name) - confirm.
names = [
    "tree trunk",
    "traffic light",
    "traffic sign",
    "lantern",
    "pole",
]

In [None]:
# Run non-max suppression on the detections; the TensorFlow session is handed in as well.
# NOTE(review): presumably the suppression runs as a TF op inside `sess` - confirm in nms.py.
nms = apply_nms(detections, sess)

In [None]:
# Map the suppression result onto the label names.
# NOTE(review): presumably replaces class indices by names[idx] - confirm in nms.py.
nms = idx_to_name(nms, names)

In [None]:
# Dump the remaining boxes for a quick manual check.
print("we found the following boxes after non-max suppression")
print(nms)

### Plotting the output
I'm using opencv to draw rectangles around all detections
and to write the name in text onto the image.

The image has a very low resolution.
For the output it is upscaled.
The main reason for this is to allow high-res text.

In [None]:
# Draw every box that survived non-max suppression onto an upscaled copy of
# the first image of the batch.
img = batch[0][0]
output_img = img.copy()
dim_x, dim_y = output_img.shape[:2]
factor = 5
# cv2.resize takes the target size as (width, height), hence (dim_y, dim_x).
output_img = cv2.resize(output_img, (dim_y*factor, dim_x*factor))

for label in nms:
    for box in nms[label]:
        # Scale the box to the upscaled image and force integer pixel
        # coordinates: the OpenCV drawing functions expect integer points.
        min_x, min_y, max_x, max_y = (int(round(value * factor)) for value in box)

        # OpenCV points are (column, row), so y comes first here.
        top_left = (min_y, min_x)
        bottom_right = (max_y, max_x)

        cv2.rectangle(output_img, top_left, bottom_right, (0,1,0), 10)
        cv2.putText(output_img, label, top_left, cv2.FONT_HERSHEY_SIMPLEX, fontScale=3.5, color=(1, 1, 1), thickness=12)

In [None]:
# Show the annotated image in a large figure so the labels stay readable.
plt.figure(figsize=(20,20))
plt.imshow(output_img)
plt.show()