In [1]:
import sys
sys.path.append('../MaskRCNN/')
import itertools
import numpy as np
import shutil
import cv2
import six
assert six.PY3, "FasterRCNN requires Python 3!"
import tensorflow.compat.v1 as tf
# Disabled experiment: Keras mixed-precision policy. The three lines below sit inside
# a bare string literal, so they are never executed.
'''from tensorflow.keras.mixed_precision import experimental as mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)'''
# TensorFlow is imported through the `tensorflow.compat.v1` API above; turn eager
# execution off so the rest of the notebook runs in graph mode.
tf.disable_eager_execution()
import tqdm
import time
import subprocess
import os

import tensorpack_viz as tpviz
from tensorpack_tfutils import get_tf_version_tuple, get_model_loader
from tensorpack_utils import fix_rng_seed
from tensorpack_input_source import QueueInput
from tensorpack_train import TrainConfig
from tensorpack_interface import launch_train_with_config
from tensorpack_callbacks import PeriodicCallback, EnableCallbackIf, ModelSaver,\
                                 GraphProfiler, PeakMemoryTracker, EstimatedTimeLeft, SessionRunTimeout, \
                                 MovingAverageSummary, ProgressBar, MergeAllSummaries, RunUpdateOps, ScheduledHyperParamSetter
import tensorpack_logger as logger


from dataset import DetectionDataset
from config import finalize_configs, config as cfg
from data import get_eval_dataflow, get_train_dataflow, get_batch_train_dataflow
from eval import DetectionResult, predict_image, multithread_predict_dataflow, EvalCallback
from viz import draw_annotation, draw_final_outputs, draw_predictions, draw_proposal_recall
from performance import ThroughputTracker, humanize_float
from model.generalized_rcnn import ResNetFPNModel
import horovod.tensorflow as hvd

# "KEY=VALUE" overrides for the global `cfg` object; they are applied with
# cfg.update_args(config) in a later cell. Order is kept as originally written.
config = [
    # Model variant
    'MODE_MASK=True',
    'MODE_FPN=True',

    # Dataset location and splits
    'DATA.BASEDIR=/workspace/shared_workspace/data/coco/coco/',
    'DATA.TRAIN=["train2017"]',
    'DATA.VAL=("val2017",)',

    # Batch size, LR schedule (epoch, multiplier) and evaluation period
    'TRAIN.BATCH_SIZE_PER_GPU=8',
    'TRAIN.LR_EPOCH_SCHEDULE=[(8, 0.1), (10, 0.01), (12, None)]',
    'TRAIN.EVAL_PERIOD=24',

    # Data-layout switches (all NCHW flags off)
    'TRAIN.BACKBONE_NCHW=False',
    'TRAIN.FPN_NCHW=False',
    'TRAIN.RPN_NCHW=False',
    'TRAIN.MASK_NCHW=False',

    # Proposal / preprocessing options
    'RPN.TOPK_PER_IMAGE=True',
    'PREPROC.PREDEFINED_PADDING=False',

    # Backbone initialisation
    'BACKBONE.WEIGHTS=/workspace/shared_workspace/data/coco/pretrained-models/ImageNet-R50-AlignPadding.npz',
    'BACKBONE.NORM=FreezeBN',

    # Optimisation details
    'TRAIN.WARMUP_INIT_LR=0.000416666666667',
    'FRCNN.BBOX_REG_WEIGHTS=[20., 20., 10., 10.]',

    # Trainer backend
    'TRAINER=horovod',
]

In [2]:
# Process-wide environment switches for the training run.
# NOTE(review): these are set after TensorFlow and the project modules have been
# imported; any variable that is read at import time would not be picked up — confirm.
os.environ.update({
    'TENSORPACK_FP16': '1',
    'TF_CUDNN_USE_AUTOTUNE': '0',
    'TF_ENABLE_NHWC': '1',
})

In [3]:
# Apply the "KEY=VALUE" override strings collected in `config` to the global `cfg`.
cfg.update_args(config)

In [4]:
# Create the model object used for training and for the evaluation callbacks.
# NOTE(review): the meaning of the positional `True` is not visible here — confirm
# which ResNetFPNModel flag it sets and consider passing it by keyword.
MODEL = ResNetFPNModel(True)

In [5]:
# Instantiate the dataset wrapper. The instance is not stored (it is only echoed as the
# cell output), so this call matters only for whatever the constructor does internally.
# Presumably it fills dataset-derived fields of `cfg` — TODO confirm.
DetectionDataset()

<dataset.DetectionDataset at 0x7f3d9809aa20>

In [6]:
# True when the configured trainer backend is Horovod.
# NOTE(review): this flag is not referenced again in the visible cells.
is_horovod = cfg.TRAINER == 'horovod'

In [7]:
# Initialise Horovod; hvd.rank() is used when seeding the RNGs in a later cell.
hvd.init()

In [8]:
# Finalise the global config; the call logs the resulting configuration.
# NOTE(review): this notebook launches training, yet is_training=False is passed.
# Confirm this is intended (e.g. that training-only settings such as the GPU count
# are still resolved correctly in this mode).
finalize_configs(is_training=False)

[32m[0819 13:53:22 @config.py:285][0m Config: ------------------------------------------
{'BACKBONE': {'FREEZE_AFFINE': False,
              'FREEZE_AT': 2,
              'NORM': 'FreezeBN',
              'RESNET_NUM_BLOCKS': [3, 4, 6, 3],
              'STRIDE_1X1': False,
              'TF_PAD_MODE': False,
              'WEIGHTS': '/workspace/shared_workspace/data/coco/pretrained-models/ImageNet-R50-AlignPadding.npz'},
 'DATA': {'BASEDIR': '/workspace/shared_workspace/data/coco/coco/',
          'CLASS_NAMES': ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
                          'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
                          'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
                          'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
                          'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
             

In [9]:
# Override the evaluation period for this run (the `config` list requested 24).
cfg.TRAIN.EVAL_PERIOD = 1

# TensorFlow's graph-level seed is the same on every worker.
tf.set_random_seed(cfg.TRAIN.SEED)

# Per-worker seed for the project's RNG helper. The rank is added as an offset
# rather than multiplied in: SEED * rank is always 0 on rank 0 (so the configured
# seed was ignored there, including in single-process runs), and SEED == 0 would
# give every rank the same seed. SEED + rank keeps the workers distinct and makes
# rank 0 honour the configured value.
fix_rng_seed(cfg.TRAIN.SEED + hvd.rank())

# NumPy's global seed is the same on every worker.
np.random.seed(cfg.TRAIN.SEED)

In [10]:
# Nominal number of images per epoch. This is a hard-coded constant, not derived from
# the dataflow (the train2017 loading log reports 117266 usable images).
images_per_epoch = 120000
# Global batch size: images consumed per training step across all GPUs.
images_per_step = cfg.TRAIN.NUM_GPUS * cfg.TRAIN.BATCH_SIZE_PER_GPU
# Integer division: a trailing partial step is dropped.
steps_per_epoch = images_per_epoch // images_per_step
batch_size_lr_factor = images_per_step # The LR is defined for bs=1 and then scaled linearly with the batch size
# Learning rate after linear scaling by the global batch size.
base_lr_adjusted_for_bs = cfg.TRAIN.BASE_LR * batch_size_lr_factor

In [11]:
# Warmup LR schedule is step based
warmup_start_step = 0
warmup_end_step = cfg.TRAIN.WARMUP_STEPS
warmup_start_lr = cfg.TRAIN.WARMUP_INIT_LR*8
warmup_end_lr = base_lr_adjusted_for_bs
warmup_schedule = [(warmup_start_step, warmup_start_lr), (warmup_end_step, warmup_end_lr)]

In [12]:
# Express the step-based warmup length in (fractional) epochs; `* 1.` forces float division.
warmup_end_epoch = cfg.TRAIN.WARMUP_STEPS * 1. / steps_per_epoch
# Round to the nearest whole epoch (add 0.5, then truncate).
training_start_epoch = int(warmup_end_epoch + 0.5)
# Seed entry of the epoch-based schedule: the full batch-size-adjusted LR from the
# first post-warmup epoch. The per-epoch decay entries are added by the schedule loop
# in the following cell.
lr_schedule = [(training_start_epoch, base_lr_adjusted_for_bs)]

In [13]:
# Build the epoch-based LR schedule and find the last training epoch.
#
# The schedule list is rebuilt from its seed entry here instead of appending to a list
# left over from an earlier cell, so re-running this cell gives the same result and
# does not accumulate duplicate entries.
lr_schedule = [(training_start_epoch, base_lr_adjusted_for_bs)]

# Stays None when cfg.TRAIN.LR_EPOCH_SCHEDULE has no (epoch, None) terminator; in that
# case None is passed on to TrainConfig unchanged.
max_epoch = None
for epoch, scheduled_lr_multiplier in cfg.TRAIN.LR_EPOCH_SCHEDULE:
    if scheduled_lr_multiplier is None:
        max_epoch = epoch # Training end is indicated by a lr_multiplier of None
        break

    # Multipliers are relative to the batch-size-adjusted base learning rate.
    absolute_lr = base_lr_adjusted_for_bs * scheduled_lr_multiplier
    lr_schedule.append((epoch, absolute_lr))

In [14]:
# Build the batched training dataflow, using the per-GPU batch size from the config.
# Per the captured log this loads and filters the train2017 annotations (roughly 20 s).
train_dataflow = get_batch_train_dataflow(cfg.TRAIN.BATCH_SIZE_PER_GPU)

In train dataflow
loading annotations into memory...
Done (t=17.47s)
creating index...
index created!
[32m[0819 13:53:40 @dataset.py:50][0m Instances loaded from /workspace/shared_workspace/data/coco/coco/annotations/instances_train2017.json.


100%|██████████| 118287/118287 [00:18<00:00, 6233.73it/s]

[32m[0819 13:53:59 @tensorpack_utils.py:349][0m Load Load annotations for train2017 finished, time:19.0846sec.





Done loading roidbs
[32m[0819 13:54:03 @data.py:618][0m Filtered 1021 images which contain no non-crowd groudtruth boxes. Total #images for training: 117266
Batching roidbs
Done batching roidbs


In [15]:
# Directory handed to the evaluation callbacks below.
# NOTE(review): hard-coded absolute path; consider deriving it from a configurable base.
logdir = '/workspace/shared_workspace/logs'

In [16]:
# Training callbacks, listed in the order they are registered:
#   1. step-based linear LR warmup, 2. epoch-based LR schedule, 3. ETA estimate,
#   4. session-run timeout (chief only), 5. one evaluation callback per validation
#   split, 6. throughput logging.
#
# Checkpointing and memory tracking are currently disabled:
#   PeriodicCallback(
#       ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
#       every_k_epochs=20),
#   PeakMemoryTracker(),
callbacks = [
    # linear warmup
    ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True),
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
    EstimatedTimeLeft(median=True),
    SessionRunTimeout(60000).set_chief_only(True),   # 1 minute timeout

    # The literal 1 stands where cfg.TRAIN.BATCH_SIZE_PER_GPU was passed previously.
    *[
        EvalCallback(dataset, *MODEL.get_inference_tensor_names(), logdir, 1, a_sync=True)
        for dataset in cfg.DATA.VAL
    ],

    ThroughputTracker(
        cfg.TRAIN.BATCH_SIZE_PER_GPU*cfg.TRAIN.NUM_GPUS,
        images_per_epoch,
        trigger_every_n_steps=2000,
        log_fn=logger.info,
    ),
]

# TODO: modify profiler callback

In [17]:
# Session initialiser built from the backbone weights file named in the config;
# it is passed to TrainConfig as `session_init`.
session_init = get_model_loader(cfg.BACKBONE.WEIGHTS)

In [18]:
# Bundle the model, input pipeline, callbacks and schedule bounds for the trainer.
traincfg = TrainConfig(
    model=MODEL,
    # Feed the training dataflow through a queue-based input source.
    data=QueueInput(train_dataflow),
    callbacks=callbacks,
    # Explicit list of the framework-level callbacks to run alongside `callbacks`.
    extra_callbacks=[
        MovingAverageSummary(),
        ProgressBar(),
        MergeAllSummaries(period=250),
        RunUpdateOps(),
    ],
    steps_per_epoch=steps_per_epoch,
    max_epoch=max_epoch,
    session_init=session_init,
    session_config=None,
    starting_epoch=cfg.TRAIN.STARTING_EPOCH,
)

In [19]:
# Start training with the assembled configuration. In the captured run this call ran
# the main loop until it was interrupted manually with Ctrl-C.
launch_train_with_config(traincfg)

[32m[0819 13:54:05 @tensorpack_input_source.py:238][0m Setting up the queue 'QueueInput/input_queue' for CPU prefetching ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
[32m[0819 13:54:05 @tensorpack_models.py:181][0m conv0 input: [None, None, None, 3]
Use channels_last data format
Instructions for updating:
Please use `layer.__call__` method instead.
[32m[0819 13:54:05 @tensorpack_models.py:855][0m [5m[31mWRN[0m [BatchNorm] Using moving_mean/moving_variance in training.
[32m[0819 13:54:06 @tensorpack_models.py:189][0m conv0 output: [None, None, None, 64]
[32m[0819 13:54:06 @tensorpack_models.py:181][0m pool0 input: [None, None, None, 64]
[32m[0819 13:54:06 @tensorpack_models.py:189][0m pool0 output: [None, None, None, 64]
[32m[0819 13:54:06 @tensorpack_models.py:181][0m group0/block0/conv1 input: [None, None, None, 64]
Use channels_last data format
[32m[0819 13:54:06 @tensorpack_models.py:855][0m [5m[31mWRN[0m [BatchNorm] Usin

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[32m[0819 13:54:22 @tensorpack_interface.py:175][0m [36mTrainable Variables: 
[0mname                                   shape                    dim
-------------------------------------  ------------------  --------
group1/block0/conv1/W:0                [1, 1, 256, 128]       32768
group1/block0/conv1/bn/gamma:0         [128]                    128
group1/block0/conv1/bn/beta:0          [128]                    128
group1/block0/conv2/W:0                [3, 3, 128, 128]      147456
group1/block0/conv2/bn/gamma:0         [128]                    128
group1/block0/conv2/bn/beta:0          [128]                    128
group1/block0/conv3/W:0                [1, 1, 128, 512]       65536
group1/block0/conv3/bn/gamma:0         [512]                    512
group1/block0/conv3/bn/beta:0          [512]                    512
group1/block0/convshortcut/W:0         [1, 1, 256, 512]      131072
group1/block0/convshortcut/bn/gamma:0  [512]                    512
group1/block0/convshortcut/bn/b

100%|██████████| 5000/5000 [00:00<00:00, 170791.76it/s]

[32m[0819 13:54:33 @tensorpack_utils.py:349][0m Load Load annotations for val2017 finished, time:0.0358sec.
[32m[0819 13:54:33 @tensorpack_callbacks.py:1586][0m [MovingAverageSummary] 27 operations in collection 'MOVING_SUMMARY_OPS' will be run with session hooks.
[32m[0819 13:54:33 @tensorpack_callbacks.py:1632][0m Summarizing collection 'summaries' of size 30.





[32m[0819 13:54:37 @tensorpack_interface.py:325][0m Creating the session ...
[32m[0819 13:54:48 @tensorpack_interface.py:342][0m Initializing the session ...
[32m[0819 13:54:48 @tensorpack_tfutils.py:891][0m Variables to restore from dict: group2/block3/conv3/bn/beta:0, group2/block5/conv1/W:0, group2/block2/conv1/bn/variance/EMA:0, group3/block2/conv2/bn/beta:0, group3/block0/conv3/bn/mean/EMA:0, group1/block3/conv1/bn/gamma:0, group2/block0/conv3/W:0, group1/block3/conv2/W:0, group3/block0/conv2/bn/gamma:0, group1/block1/conv3/bn/mean/EMA:0, group1/block1/conv3/bn/beta:0, group3/block0/conv3/bn/beta:0, group2/block0/convshortcut/bn/gamma:0, group0/block1/conv2/bn/variance/EMA:0, group2/block0/conv1/bn/variance/EMA:0, group1/block0/conv3/bn/variance/EMA:0, group0/block2/conv1/bn/beta:0, group1/block0/conv2/bn/mean/EMA:0, group0/block2/conv3/bn/beta:0, group3/block2/conv3/bn/variance/EMA:0, group0/block1/conv2/bn/beta:0, group2/block5/conv3/bn/beta:0, group1/block0/conv1/bn/mean/

  0%|          |0/30000[00:00<?,?it/s]

[32m[0819 14:01:46 @tensorpack_callbacks.py:601][0m [HyperParamSetter] At global_step=1, learning_rate changes from 0.003333 to 0.003335


  2%|1         |532/30000[03:18<2:28:45, 3.30it/s] 

[32m[0819 14:04:48 @tensorpack_interface.py:465][0m Detected Ctrl-C and exiting main loop.


  2%|1         |532/30000[03:18<3:03:24, 2.68it/s]


KeyboardInterrupt: 

[32m[0819 14:04:49 @tensorpack_input_source.py:194][0m EnqueueThread QueueInput/input_queue Exited.
