# Fine-tuning with Birds Data Set

In [32]:
from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TerminateOnNaN, CSVLogger
from keras import backend as K
from keras.models import load_model
from math import ceil
import numpy as np
from matplotlib import pyplot as plt

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
from keras_layers.keras_layer_L2Normalization import L2Normalization

from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

from eval_utils.average_precision_evaluator import Evaluator

import os
root_dir = os.getcwd()

%matplotlib inline

## 1. Set the model configuration parameters

In [15]:
img_height = 300 # Height of the model input images
img_width = 300 # Width of the model input images
img_channels = 3 # Number of color channels of the model input images
mean_color = [123, 117, 104] # The per-channel mean of the images in the dataset. Do not change this value if you're using any of the pre-trained weights.
swap_channels = [2, 1, 0] # The color channel order in the original SSD is BGR, so we'll have the model reverse the color channel order of the input images.
n_classes = 20 # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05] # The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05] # The anchor box scaling factors used in the original SSD300 for the MS COCO datasets
scales = scales_pascal
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]] # The anchor box aspect ratios used in the original SSD300; the order matters
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300] # The space between two adjacent anchor box center points for each predictor layer.
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] # The offsets of the first anchor box center points from the top and left borders of the image as a fraction of the step size for each predictor layer.
clip_boxes = False # Whether or not to clip the anchor boxes to lie entirely within the image boundaries
variances = [0.1, 0.1, 0.2, 0.2] # The variances by which the encoded target coordinates are divided as in the original implementation
normalize_coords = True

## 2. Create new SSD model and load pre-trained weight
Pre-trained weight (VOC2007+2012) can be found in the following link: https://drive.google.com/file/d/1M99knPZ4DpY9tI60iZqxXsAxX2bYWDvZ/view

In [16]:
# 1: Build the Keras model.

K.clear_session() # Clear previous models from memory.

model = ssd_300(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes,
                mode='training',
                l2_regularization=0.0005,
                scales=scales,
                aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1,
                steps=steps,
                offsets=offsets,
                clip_boxes=clip_boxes,
                variances=variances,
                normalize_coords=normalize_coords,
                subtract_mean=mean_color,
                swap_channels=swap_channels)

# 2: Load some weights into the model. Adjust weight path accordingly. 
weights_path = os.path.join(root_dir,'VGG_VOC0712Plus_SSD_300x300_iter_240000.h5') 

model.load_weights(weights_path, by_name=True)

# 3: Instantiate an optimizer and the SSD loss function and compile the model. 
#SGD optimizer is used here since it yields better result. Arguments inside SGD don't matter since they will be redefined later.

#adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(optimizer=sgd, loss=ssd_loss.compute_loss)

## 3. Set up the data generators for the training

The code cells below set up the data generators for the training and validation datasets to train the model. The data set used is 2012 `trainval` for reproducing fine-tuned original result, and plus the Caltech-UCSD Birds-200-2011 for extra "birds" data. Comment out parse_csv for not loading birds data set.

In [17]:
# 1: Instantiate two `DataGenerator` objects: One for training, one for validation.

train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
val_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)

# 2: Parse the image and label lists for the training and validation datasets. This can take a while.
VOC_2012_images_dir                  = os.path.join(root_dir,'dataset/VOC2012/JPEGImages')
VOC_2012_annotations_dir             = os.path.join(root_dir,'dataset/VOC2012/Annotations')
VOC_2012_train_image_set_filename    = os.path.join(root_dir,'dataset/VOC2012/ImageSets/Main/train.txt')
VOC_2012_val_image_set_filename      = os.path.join(root_dir,'dataset/VOC2012/ImageSets/Main/val.txt')
bird_dir = os.path.join(root_dir,'dataset/bird_data')

classes = ['background','aeroplane', 'bicycle', 'bird', 'boat','bottle', 'bus', 'car', 'cat','chair','cow', 'diningtable',
           'dog','horse', 'motorbike', 'person', 'pottedplant','sheep', 'sofa', 'train', 'tvmonitor']

train_dataset.parse_xml(images_dirs=[VOC_2012_images_dir],image_set_filenames=[VOC_2012_train_image_set_filename],
                        annotations_dirs=[VOC_2012_annotations_dir],classes=classes,
                        include_classes='all',
                        exclude_truncated=False,
                        exclude_difficult=False,
                        ret=False)

#CSV files are modified such that they all belong to the bird's class
train_dataset.parse_csv(images_dir=bird_dir,
                         labels_filename=os.path.join(root_dir,'dataset/bird_data/train_birds.csv'),
                         input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'])

val_dataset.parse_xml(images_dirs=[VOC_2012_images_dir],
                      image_set_filenames=[VOC_2012_val_image_set_filename],
                      annotations_dirs=[VOC_2012_annotations_dir],
                      classes=classes,
                      include_classes='all',
                      exclude_truncated=False,
                      exclude_difficult=True,
                      ret=False)

val_dataset.parse_csv(images_dir=bird_dir,
                      labels_filename=os.path.join(root_dir,'dataset/bird_data/val_birds.csv'),
                      input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'],
                      include_classes='all')

Processing image set 'train.txt': 100%|██████████| 5717/5717 [00:09<00:00, 618.30it/s]
# before adding birds data:  5717
# after adding birds data:  6460
Processing image set 'val.txt': 100%|██████████| 1106/1106 [00:01<00:00, 597.17it/s]
# before adding birds data:  1106
# after adding birds data:  1478


In [23]:
# 3: Set the batch size.

batch_size = 16 # Change the batch size if you like, or if you run into GPU memory issues.

# 4: Set the image transformations for pre-processing and data augmentation options.

# For the training generator:
ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                            img_width=img_width,
                                            background=mean_color)

# For the validation generator:
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

# 5: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

# The encoder constructor needs the spatial dimensions of the model's predictor layers to create the anchor boxes.
predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                   model.get_layer('fc7_mbox_conf').output_shape[1:3],
                   model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    scales=scales,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    matching_type='multi',
                                    pos_iou_threshold=0.5,
                                    neg_iou_limit=0.5,
                                    normalize_coords=normalize_coords)

# 6: Create the generator handles that will be passed to Keras' `fit_generator()` function.

train_generator = train_dataset.generate(batch_size=batch_size,
                                         shuffle=False,
                                         transformations=[ssd_data_augmentation],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images',
                                                  'encoded_labels'},
                                         keep_images_without_gt=False)

val_generator = val_dataset.generate(batch_size=batch_size,
                                     shuffle=False,
                                     transformations=[convert_to_3_channels,
                                                      resize],
                                     label_encoder=ssd_input_encoder,
                                     returns={'processed_images',
                                              'encoded_labels'},
                                     keep_images_without_gt=False)

train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size   = val_dataset.get_dataset_size()

## 4. Set the remaining training parameters

In [24]:
#Use a constant learning rate since only training for a small nunber of epochs.
    
def lr_schedule(epoch):
    return 0.001

In [25]:
# Define model callbacks.

model_checkpoint = ModelCheckpoint(filepath= os.path.join(root_dir,'VGG_VOC0712COCO_SSD_300x300_witBirds.h5'), 
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto',
                                   period=1)

csv_logger = CSVLogger(filename='Birds_tuning.csv',
                       separator=',',
                       append=True)

learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule,
                                                verbose=1)

terminate_on_nan = TerminateOnNaN()

callbacks = [model_checkpoint,
             csv_logger,
             learning_rate_scheduler,
             terminate_on_nan]

## 5. Train

In [None]:
#Training time around 2 hours on GeForce GTX 1050
#Set `initial_epoch` and `final_epoch` accordingly for resuming.
initial_epoch   = 0
final_epoch     = 5
steps_per_epoch = 20

history = model.fit_generator(generator=train_generator,
                              steps_per_epoch=steps_per_epoch,
                              epochs=final_epoch,
                              callbacks=callbacks,
                              validation_data=val_generator,
                              validation_steps=ceil(val_dataset_size/batch_size)
                              ,initial_epoch=initial_epoch)

## 6. Evaluation

Use the evaluator to test the performance on 2007 test data.

### 6.1 Chaning to inference mode for evaluation. Rebuild model and load the weights just trained to it.

In [29]:
model_mode = 'inference'

K.clear_session() # Clear previous models from memory.

model = ssd_300(image_size=(img_height, img_width, 3),
                n_classes=n_classes,
                mode=model_mode,
                l2_regularization=0.0005,
                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05], # The scales for MS COCO [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]],
                two_boxes_for_ar1=True,
                steps=[8, 16, 32, 64, 100, 300],
                offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                clip_boxes=False,
                variances=[0.1, 0.1, 0.2, 0.2],
                normalize_coords=True,
                subtract_mean=[123, 117, 104],
                swap_channels=[2, 1, 0],
                confidence_thresh=0.01,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400)

# 2: Load the trained weights into the model.

weights_path = os.path.join(root_dir,'VGG_VOC0712COCO_SSD_300x300_witBirds.h5') 

model.load_weights(weights_path, by_name=True)

# 3: Compile the model so that Keras won't complain the next time you load it.

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

### 6.2 Create a data generator for the evaluation dataset

In [30]:
dataset = DataGenerator()

#For faster evaluation, only a subset of VOC2007 test data are used.
Pascal_VOC_dataset_images_dir = os.path.join(root_dir,'dataset/testing07/JPEGImages')
Pascal_VOC_dataset_annotations_dir = os.path.join(root_dir,'dataset/testing07/Annotations')
Pascal_VOC_dataset_image_set_filename = os.path.join(root_dir,'dataset/testing07/ImageSets/Main/test.txt')

classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

dataset.parse_xml(images_dirs=[Pascal_VOC_dataset_images_dir],
                  image_set_filenames=[Pascal_VOC_dataset_image_set_filename],
                  annotations_dirs=[Pascal_VOC_dataset_annotations_dir],
                  classes=classes,
                  include_classes='all',
                  exclude_truncated=False,
                  exclude_difficult=False,
                  ret=False)

Processing image set 'test.txt': 100%|██████████| 614/614 [00:01<00:00, 590.26it/s]


### 6.3 Run the evaluation

In [33]:
evaluator = Evaluator(model=model,
                      n_classes=n_classes,
                      data_generator=dataset,
                      model_mode=model_mode)

results = evaluator(img_height=img_height,
                    img_width=img_width,
                    batch_size=8,
                    data_generator_mode='resize',
                    round_confidences=False,
                    matching_iou_threshold=0.5,
                    border_pixels='include',
                    sorting_algorithm='quicksort',
                    average_precision_mode='sample',
                    num_recall_points=11,
                    ignore_neutral_boxes=True,
                    return_precisions=True,
                    return_recalls=True,
                    return_average_precisions=True,
                    verbose=True)

mean_average_precision, average_precisions, precisions, recalls = results

Number of images in the evaluation dataset: 614

Producing predictions batch-wise: 100%|██████████| 77/77 [03:43<00:00,  2.69s/it]
Matching predictions to ground truth, class 1/20.: 100%|██████████| 3364/3364 [00:00<00:00, 22786.15it/s]
Matching predictions to ground truth, class 2/20.: 100%|██████████| 1369/1369 [00:00<00:00, 35169.61it/s]
Matching predictions to ground truth, class 3/20.: 100%|██████████| 10747/10747 [00:00<00:00, 36042.49it/s]
Matching predictions to ground truth, class 4/20.: 100%|██████████| 5417/5417 [00:00<00:00, 29843.35it/s]
Matching predictions to ground truth, class 5/20.: 100%|██████████| 1380/1380 [00:00<00:00, 46081.71it/s]
Matching predictions to ground truth, class 6/20.: 100%|██████████| 747/747 [00:00<00:00, 31173.40it/s]
Matching predictions to ground truth, class 7/20.: 100%|██████████| 8036/8036 [00:00<00:00, 26678.08it/s]
Matching predictions to ground truth, class 8/20.: 100%|██████████| 1778/1778 [00:00<00:00, 28741.84it/s]
Matching predictions 

In [34]:
for i in range(1, len(average_precisions)):
    print("{:<14}{:<6}{}".format(classes[i], 'AP', round(average_precisions[i], 3)))
print()
print("{:<14}{:<6}{}".format('','mAP', round(mean_average_precision, 3)))

aeroplane     AP    0.859
bicycle       AP    0.693
bird          AP    0.772
boat          AP    0.714
bottle        AP    0.383
bus           AP    0.97
car           AP    0.847
cat           AP    0.696
chair         AP    0.552
cow           AP    0.487
diningtable   AP    0.659
dog           AP    0.657
horse         AP    0.738
motorbike     AP    0.791
person        AP    0.747
pottedplant   AP    0.408
sheep         AP    0.772
sofa          AP    0.659
train         AP    0.825
tvmonitor     AP    0.551

              mAP   0.689
