In [1]:
import sys
sys.path.append('../MaskRCNN/')
import itertools
import numpy as np
import shutil
import cv2
import six
assert six.PY3, "FasterRCNN requires Python 3!"
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import tqdm
import time
import subprocess
import os

import tensorpack_viz as tpviz
from tensorpack_tfutils import get_tf_version_tuple, get_model_loader
from tensorpack_utils import fix_rng_seed
from tensorpack_input_source import QueueInput
from tensorpack_train import TrainConfig
from tensorpack_interface import launch_train_with_config
from tensorpack_callbacks import PeriodicCallback, EnableCallbackIf, ModelSaver,\
                                 GraphProfiler, PeakMemoryTracker, EstimatedTimeLeft, SessionRunTimeout, \
                                 MovingAverageSummary, ProgressBar, MergeAllSummaries, RunUpdateOps, ScheduledHyperParamSetter
import tensorpack_logger as logger


from dataset import DetectionDataset
from config import finalize_configs, config as cfg
from data import get_eval_dataflow, get_train_dataflow, get_batch_train_dataflow
from eval import DetectionResult, predict_image, multithread_predict_dataflow, EvalCallback
from viz import draw_annotation, draw_final_outputs, draw_predictions, draw_proposal_recall
from performance import ThroughputTracker, humanize_float
from model.generalized_rcnn import ResNetFPNModel
import horovod.tensorflow as hvd

# KEY=VALUE override strings, in the form accepted by cfg.update_args().
# The order of the entries is kept exactly as originally written.
config = [
    # Model variant flags.
    'MODE_MASK=True',
    'MODE_FPN=True',
    # Dataset location and the splits used for training / validation.
    'DATA.BASEDIR=/workspace/shared_workspace/data/coco/coco/',
    'DATA.TRAIN=["train2017"]',
    'DATA.VAL=("val2017",)',
    # Training setup. TRAIN.EVAL_PERIOD is assigned again, directly on cfg,
    # in a later cell of this notebook.
    'TRAIN.BATCH_SIZE_PER_GPU=4',
    'TRAIN.LR_EPOCH_SCHEDULE=[(8, 0.1), (10, 0.01), (12, None)]',
    'TRAIN.EVAL_PERIOD=24',
    'TRAIN.BACKBONE_NCHW=False',
    'TRAIN.FPN_NCHW=False',
    'TRAIN.RPN_NCHW=False',
    'TRAIN.MASK_NCHW=False',
    # Proposal generation and preprocessing switches.
    'RPN.TOPK_PER_IMAGE=True',
    'PREPROC.PREDEFINED_PADDING=False',
    # Backbone: pretrained weights file and normalization mode.
    'BACKBONE.WEIGHTS=/workspace/shared_workspace/data/coco/pretrained-models/ImageNet-R50-AlignPadding.npz',
    'BACKBONE.NORM=FreezeBN',
    # Remaining hyper-parameters and trainer selection.
    'TRAIN.WARMUP_INIT_LR=0.000416666666667',
    'FRCNN.BBOX_REG_WEIGHTS=[20., 20., 10., 10.]',
    'TRAINER=horovod',
]

In [2]:
# Apply the KEY=VALUE override strings collected in `config` to the shared
# `cfg` object imported from the project's config module. The config dump
# printed by finalize_configs() further down reflects these values.
cfg.update_args(config)

In [3]:
# Instantiate the model; it is reused below for the evaluation tensor names
# and is passed to TrainConfig.
# NOTE(review): the meaning of the positional `True` is not visible in this
# notebook — check ResNetFPNModel's constructor in model/generalized_rcnn.
MODEL = ResNetFPNModel(True)

In [4]:
# The instance is not kept; as the last expression of the cell it only echoes
# its repr as cell output.
# NOTE(review): presumably constructed for a side effect on the global config
# (e.g. registering class names) — confirm in dataset.py.
DetectionDataset()

<dataset.DetectionDataset at 0x7f9594d90b00>

In [5]:
# True when the configured trainer is Horovod (set via 'TRAINER=horovod' in
# the override list). The flag is not referenced again in the cells below.
is_horovod = cfg.TRAINER == 'horovod'

In [6]:
# Initialize Horovod. This has to run before hvd.rank() is queried in the
# seeding cell further down.
hvd.init()

In [7]:
# Finalize the global config for a training run. In the recorded output this
# call logs a warning about single-machine Horovod use and prints the
# resulting config.
finalize_configs(is_training=True)

[32m[0818 13:31:42 @config.py:264][0m [5m[31mWRN[0m It's not recommended to use horovod for single-machine training. Replicated trainer is more stable and has the same efficiency.
[32m[0818 13:31:42 @config.py:285][0m Config: ------------------------------------------
{'BACKBONE': {'FREEZE_AFFINE': False,
              'FREEZE_AT': 2,
              'NORM': 'FreezeBN',
              'RESNET_NUM_BLOCKS': [3, 4, 6, 3],
              'STRIDE_1X1': False,
              'TF_PAD_MODE': False,
              'WEIGHTS': '/workspace/shared_workspace/data/coco/pretrained-models/ImageNet-R50-AlignPadding.npz'},
 'DATA': {'BASEDIR': '/workspace/shared_workspace/data/coco/coco/',
          'CLASS_NAMES': ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
                          'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
                          'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
                          'el

In [8]:
# Override the evaluation period directly on cfg; this supersedes the
# 'TRAIN.EVAL_PERIOD=24' entry from the `config` override list.
cfg.TRAIN.EVAL_PERIOD = 1

# Seed the three RNG sources used by the pipeline.
# TensorFlow graph-level seed: identical on every worker.
tf.set_random_seed(cfg.TRAIN.SEED)
# Project RNG helper: seed scaled by the Horovod rank so workers differ.
# NOTE(review): the product is 0 on rank 0 regardless of cfg.TRAIN.SEED —
# `SEED + rank` may have been intended; confirm before changing.
fix_rng_seed(hvd.rank() * cfg.TRAIN.SEED)
# NumPy global seed: identical on every worker.
np.random.seed(cfg.TRAIN.SEED)

In [9]:
# Nominal epoch size used for step and throughput bookkeeping. It is a fixed
# constant, not the dataset length (the dataflow log below reports 117266
# training images after filtering).
images_per_epoch = 120000

# Global batch size: images consumed per training step across all GPUs.
images_per_step = cfg.TRAIN.BATCH_SIZE_PER_GPU * cfg.TRAIN.NUM_GPUS
steps_per_epoch = images_per_epoch // images_per_step

# The base LR is defined for a batch size of 1 and is scaled linearly with
# the global batch size.
batch_size_lr_factor = images_per_step
base_lr_adjusted_for_bs = batch_size_lr_factor * cfg.TRAIN.BASE_LR

In [10]:
# The warmup schedule is expressed in global steps, not epochs. It is a
# two-point schedule running from the initial warmup LR to the
# batch-size-adjusted base LR.
warmup_start_step = 0
warmup_end_step = cfg.TRAIN.WARMUP_STEPS

# NOTE(review): the factor 8 is hard-coded — presumably a GPU count; confirm
# whether it should follow cfg.TRAIN.NUM_GPUS instead.
warmup_start_lr = 8 * cfg.TRAIN.WARMUP_INIT_LR
warmup_end_lr = base_lr_adjusted_for_bs

warmup_schedule = [
    (warmup_start_step, warmup_start_lr),
    (warmup_end_step, warmup_end_lr),
]

In [11]:
# Express the end of warmup as a (fractional) epoch count.
warmup_end_epoch = float(cfg.TRAIN.WARMUP_STEPS) / steps_per_epoch
# Adding 0.5 before truncating picks the nearest whole epoch.
training_start_epoch = int(warmup_end_epoch + 0.5)

# Seed the epoch-based schedule with the full (batch-size-adjusted) base LR;
# the next cell appends the decay steps.
lr_schedule = [
    (training_start_epoch, base_lr_adjusted_for_bs),
]

In [12]:
# Expand cfg.TRAIN.LR_EPOCH_SCHEDULE into absolute learning rates and find the
# final epoch. Each schedule item is (epoch, multiplier); a multiplier of None
# marks the end of training, and its epoch becomes max_epoch.

# Keep only the seed entry created in the previous cell so that re-running
# this cell does not append duplicate schedule entries. On a fresh run the
# list already has exactly one element, so this is a no-op.
del lr_schedule[1:]

max_epoch = None
for epoch, scheduled_lr_multiplier in cfg.TRAIN.LR_EPOCH_SCHEDULE:
    if scheduled_lr_multiplier is None:
        max_epoch = epoch  # Training end is indicated by a lr_multiplier of None
        break

    # Multipliers are relative to the batch-size-adjusted base LR.
    absolute_lr = base_lr_adjusted_for_bs * scheduled_lr_multiplier
    lr_schedule.append((epoch, absolute_lr))

In [13]:
# Build the batched training dataflow, passing the per-GPU batch size.
# Per the recorded output below, this loads and filters the train2017
# annotations, which took tens of seconds in that run.
train_dataflow = get_batch_train_dataflow(cfg.TRAIN.BATCH_SIZE_PER_GPU)

In train dataflow
loading annotations into memory...
Done (t=14.65s)
creating index...
index created!
[32m[0818 13:31:58 @dataset.py:50][0m Instances loaded from /workspace/shared_workspace/data/coco/coco/annotations/instances_train2017.json.


100%|██████████| 118287/118287 [00:17<00:00, 6953.26it/s]

[32m[0818 13:32:15 @tensorpack_utils.py:349][0m Load Load annotations for train2017 finished, time:17.0964sec.





Done loading roidbs
[32m[0818 13:32:18 @data.py:618][0m Filtered 1021 images which contain no non-crowd groudtruth boxes. Total #images for training: 117266
Batching roidbs
Done batching roidbs


In [14]:
# Log directory handed to EvalCallback in the callbacks cell below.
# Absolute, environment-specific path; adjust when running elsewhere.
logdir = "/workspace/shared_workspace/logs"

In [15]:
# Training callbacks, assembled in three steps: the core set, one evaluation
# callback per validation split, and a throughput tracker.
callbacks = [
    # Periodic checkpointing.
    # NOTE(review): the period is 20 epochs while the LR schedule override
    # ends training at epoch 12, so this may never fire — confirm intent.
    PeriodicCallback(
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        every_k_epochs=20,
    ),
    # Linear, step-based warmup of the learning rate.
    ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True
    ),
    # Epoch-based learning-rate schedule built in the cells above.
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
    PeakMemoryTracker(),
    EstimatedTimeLeft(median=True),
    # 1 minute timeout per the original note (value presumably in ms).
    SessionRunTimeout(60000).set_chief_only(True),
]

# One EvalCallback per validation split. The literal 1 stands where the
# original had cfg.TRAIN.BATCH_SIZE_PER_GPU commented out.
callbacks += [
    EvalCallback(val_split, *MODEL.get_inference_tensor_names(), logdir, 1, a_sync=True)
    for val_split in cfg.DATA.VAL
]

# Throughput reporting; images_per_step is the per-GPU batch size times the
# number of GPUs, computed in the epoch-sizing cell above.
callbacks.append(
    ThroughputTracker(
        images_per_step,
        images_per_epoch,
        trigger_every_n_steps=2000,
        log_fn=logger.info,
    )
)

# TODO: modify the profiler callback (GraphProfiler is imported above but has not been added to `callbacks`).

In [16]:
# Create the session initializer from the pretrained backbone weights file
# configured via the 'BACKBONE.WEIGHTS=...' override; it is passed to
# TrainConfig below.
session_init = get_model_loader(cfg.BACKBONE.WEIGHTS)

In [17]:
# Bundle the model, input pipeline, callbacks and schedule limits into the
# single TrainConfig consumed by launch_train_with_config().
traincfg = TrainConfig(
    model=MODEL,
    # QueueInput wraps the dataflow; the training log reports it setting up
    # an input queue for CPU prefetching.
    data=QueueInput(train_dataflow),
    callbacks=callbacks,
    # NOTE(review): presumably replaces TrainConfig's default extra
    # callbacks — confirm against the TrainConfig implementation.
    extra_callbacks=[
        MovingAverageSummary(),
        ProgressBar(),
        MergeAllSummaries(period=250),
        RunUpdateOps(),
    ],
    steps_per_epoch=steps_per_epoch,
    # Epoch whose multiplier is None in cfg.TRAIN.LR_EPOCH_SCHEDULE.
    max_epoch=max_epoch,
    session_init=session_init,
    session_config=None,
    starting_epoch=cfg.TRAIN.STARTING_EPOCH,
)

In [18]:
# Start training with the assembled TrainConfig.
# NOTE(review): in the recorded run this call fails, apparently while the
# graph is being built, with "ValueError: Shape must be rank 3 but is rank 2"
# for the GenerateBoundingBoxProposals node (see the traceback below). The
# node's inputs include tensors of shape [?,5] and [338688,4]. The root cause
# cannot be determined from this notebook — possibly the op implementation in
# the installed TensorFlow build differs from the one the model code
# expects. TODO confirm.
launch_train_with_config(traincfg)

[32m[0818 13:32:19 @tensorpack_input_source.py:238][0m Setting up the queue 'QueueInput/input_queue' for CPU prefetching ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
[32m[0818 13:32:19 @tensorpack_models.py:181][0m conv0 input: [None, None, None, 3]
Use channels_last data format
Instructions for updating:
Please use `layer.__call__` method instead.
[32m[0818 13:32:19 @tensorpack_models.py:855][0m [5m[31mWRN[0m [BatchNorm] Using moving_mean/moving_variance in training.
[32m[0818 13:32:19 @tensorpack_models.py:189][0m conv0 output: [None, None, None, 64]
[32m[0818 13:32:19 @tensorpack_models.py:181][0m pool0 input: [None, None, None, 64]
[32m[0818 13:32:19 @tensorpack_models.py:189][0m pool0 output: [None, None, None, 64]
[32m[0818 13:32:19 @tensorpack_models.py:181][0m group0/block0/conv1 input: [None, None, None, 64]
Use channels_last data format
[32m[0818 13:32:19 @tensorpack_models.py:855][0m [5m[31mWRN[0m [BatchNorm] Usin

ValueError: Shape must be rank 3 but is rank 2 for '{{node generate_fpn_proposals_topk_per_image/Lvl0/GenerateBoundingBoxProposals}} = GenerateBoundingBoxProposals[post_nms_topn=2000](generate_fpn_proposals_topk_per_image/Lvl0/strided_slice_1, generate_fpn_proposals_topk_per_image/Lvl0/transpose, generate_fpn_proposals_topk_per_image/Lvl0/Pad, generate_fpn_proposals_topk_per_image/Lvl0/Reshape, generate_fpn_proposals_topk_per_image/Lvl0/GenerateBoundingBoxProposals/nms_threshold, generate_fpn_proposals_topk_per_image/Lvl0/GenerateBoundingBoxProposals/pre_nms_topn, generate_fpn_proposals_topk_per_image/Lvl0/GenerateBoundingBoxProposals/min_size)' with input shapes: [?,?,?,3], [?,?,?,12], [?,5], [338688,4], [], [], [].