In [1]:
import sys
sys.path.append('..')
from awsdet import models
from awsdet import datasets
from awsdet import core
from awsdet import training
from awsdet.utils.runner import Runner
from awsdet.training.schedulers import WarmupScheduler
from awsdet.datasets.coco import evaluation
from configs.mrcnn_config import config
import tensorflow as tf
from tqdm.notebook import tqdm
from statistics import mean
import threading

import horovod.tensorflow as hvd
hvd.init()

# Pin this process to exactly one GPU. The index has to be the *local* rank
# (the process's position on its own host): the physical device list only
# contains this host's GPUs, so indexing it with the global rank raises
# IndexError (or picks the wrong device) as soon as the job spans more than
# one node.
devices = tf.config.list_physical_devices('GPU')
if devices:
    tf.config.set_visible_devices([devices[hvd.local_rank()]], 'GPU')
logical_devices = tf.config.list_logical_devices('GPU')

# Graph-optimizer switches, both driven by the training config.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.train_config.fp16})
tf.config.optimizer.set_jit(config.train_config.xla)

In [2]:
# Collect the per-component sections of the config, then build the detector
# from them in a single call.
detector_kwargs = dict(
    backbone=config.backbone_cfg,
    neck=config.fpn_cfg,
    rpn_head=config.rpn_head_cfg,
    roi_head=config.roi_head_cfg,
    train_cfg=config.train_config,
    test_cfg=config.test_config,
)
detector = models.TwoStageDetector(**detector_kwargs)

In [3]:
# build_dataset returns a callable; calling it yields the dataset object.
# Each dataset is repeated and wrapped in an iterator so later cells can pull
# batches with next().
train_dataset = datasets.build_dataset(config.train_data)()
train_tdf = iter(train_dataset.repeat())

val_dataset = datasets.build_dataset(config.test_data)()
val_tdf = iter(val_dataset.repeat())

Instructions for updating:
Use fn_output_signature instead


In [4]:
# Pull one training batch and run its first element through the detector in
# inference mode.
# NOTE(review): presumably this forward pass exists to create the model's
# variables before the backbone checkpoint is loaded below - confirm.
first_batch = next(train_tdf)
result = detector(first_batch[0], training=False)

[2021-01-13 12:55:34.120 ip-172-31-38-50:14053 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-01-13 12:55:35.350 ip-172-31-38-50:14053 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [5]:
# Load pretrained values for the first sub-layer of the detector from the
# checkpoint named in the config.
chkp = tf.compat.v1.train.NewCheckpointReader(config.backbone_checkpoint)
backbone = detector.layers[0]

# Checkpoint key for each variable: keep the last two '/'-separated parts of
# the variable name and drop everything from the first ':' onward.
checkpoint_keys = [
    '/'.join(variable.name.split('/')[-2:]).split(':')[0]
    for variable in backbone.weights
]

weights = [chkp.get_tensor(key) for key in checkpoint_keys]
backbone.set_weights(weights)

In [6]:
# Batch size summed over all Horovod workers.
per_worker_batch_size = config.train_data['batch_size']
global_batch_size = per_worker_batch_size * hvd.size()

# Fixed at 1000 steps; the config-derived alternative would be
# config.train_config.images//global_batch_size
steps_per_epoch = 1000

# base_lr is divided by 8 and multiplied by the global batch size.
learning_rate = (config.train_config.base_lr/8) * global_batch_size

# Step-wise decay: the rate drops by 10x after 8 epochs and by 100x after 11.
decay_boundaries = [steps_per_epoch * 8, steps_per_epoch * 11]
decay_values = [learning_rate, learning_rate/10, learning_rate/100]
piecewise_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(decay_boundaries,
                                                                          decay_values)

# Wrap the decay schedule with the project's warmup scheduler.
# NOTE(review): the 2nd/3rd arguments look like the initial warmup rate and
# the number of warmup steps - confirm against WarmupScheduler.
schedule = WarmupScheduler(piecewise_schedule, learning_rate/10, steps_per_epoch//8)

optimizer = tf.keras.optimizers.SGD(learning_rate=schedule, momentum=0.9)

# Mixed precision: wrap the optimizer with dynamic loss scaling.
if config.train_config.fp16:
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

In [7]:
# Bundle the detector, its train/test configs and the optimizer into the
# runner object used for the training and prediction steps below.
runner_kwargs = dict(
    model=detector,
    train_cfg=config.train_config,
    test_cfg=config.test_config,
    optimizer=optimizer,
)
model_runner = Runner(**runner_kwargs)

In [8]:
# Train for one "epoch" of steps_per_epoch steps, tracking the loss per step.
pbar = tqdm(range(steps_per_epoch))
loss_history = []
for step in pbar:
    # The sync flags are only raised on the very first step.
    is_first_step = step==0
    model_outputs = model_runner.train_step(next(train_tdf),
                                            sync_weights=is_first_step,
                                            sync_opt=is_first_step)

    step_loss = model_outputs['total_loss'].numpy()
    loss_history.append(step_loss)

    # Mean over (at most) the 50 most recent losses.
    loss_rolling_mean = mean(loss_history[-50:])
    current_learning_rate = schedule(optimizer.iterations).numpy()

    # Only rank 0 updates the progress-bar text.
    if hvd.rank()!=0:
        continue
    description = "Loss {0:.4f}, LR: {1:.4f}".format(loss_rolling_mean,
                                                     current_learning_rate)
    pbar.set_description(description)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [9]:
# Number of prediction steps: 1000 floor-divided by the global batch size,
# plus one.
# NOTE(review): when 1000 is an exact multiple of global_batch_size the "+ 1"
# runs one batch more than a ceiling division would - confirm that is intended.
num_eval_steps = 1000//global_batch_size + 1
p_bar = tqdm(range(num_eval_steps))

# One predict() call per step, each on the next validation batch.
predictions = [model_runner.predict(next(val_tdf)) for _step in p_bar]

# Positional arguments for evaluation.evaluate_results.
args = [predictions, model_runner.test_cfg.annotations]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=251.0), HTML(value='')))




In [10]:
# Background evaluation happens only when async_eval is set AND this process
# is rank 0; every other combination evaluates synchronously with a progress
# bar.
# NOTE(review): with async_eval enabled, non-zero ranks therefore still run a
# synchronous evaluation - confirm that is intended.
run_in_background = model_runner.test_cfg.async_eval and hvd.rank()==0
if not run_in_background:
    evaluation.evaluate_results(*args, pbar=True)
else:
    eval_thread = threading.Thread(target=evaluation.evaluate_results, name="eval-thread", args=args)
    eval_thread.start()
    

processing eval
100%|██████████| 251/251 [01:31<00:00,  2.75it/s]
loading predictions
[MaskRCNN] INFO    : 0/100400
[MaskRCNN] INFO    : 1000/100400
[MaskRCNN] INFO    : 2000/100400
[MaskRCNN] INFO    : 3000/100400
[MaskRCNN] INFO    : 4000/100400
[MaskRCNN] INFO    : 5000/100400
[MaskRCNN] INFO    : 6000/100400
[MaskRCNN] INFO    : 7000/100400
[MaskRCNN] INFO    : 8000/100400
[MaskRCNN] INFO    : 9000/100400
[MaskRCNN] INFO    : 10000/100400
[MaskRCNN] INFO    : 11000/100400
[MaskRCNN] INFO    : 12000/100400
[MaskRCNN] INFO    : 13000/100400
[MaskRCNN] INFO    : 14000/100400
[MaskRCNN] INFO    : 15000/100400
[MaskRCNN] INFO    : 16000/100400
[MaskRCNN] INFO    : 17000/100400
[MaskRCNN] INFO    : 18000/100400
[MaskRCNN] INFO    : 19000/100400
[MaskRCNN] INFO    : 20000/100400
[MaskRCNN] INFO    : 21000/100400
[MaskRCNN] INFO    : 22000/100400
[MaskRCNN] INFO    : 23000/100400
[MaskRCNN] INFO    : 24000/100400
[MaskRCNN] INFO    : 25000/100400
[MaskRCNN] INFO    : 26000/100400
[MaskRCNN

In [None]:
def to_binary(array, size):
    """Expand each integer in ``array`` into its ``size`` lowest bits.

    A trailing axis is appended to ``array``; along that axis, position ``k``
    holds ``(value >> k) mod 2``, i.e. the bits are ordered least-significant
    first.

    Args:
        array: integer tensor (or tensor-like) to decompose.
            NOTE(review): presumably must share the dtype produced by
            ``tf.range(size)`` for the shift to be accepted - confirm.
        size: number of bits to extract per element.

    Returns:
        The result of ``tf.math.mod`` on the shifted values, with one extra
        trailing axis of length ``size``.
    """
    bit_positions = tf.range(size)
    shifted = tf.bitwise.right_shift(tf.expand_dims(array, -1), bit_positions)
    return tf.math.mod(shifted, 2)