In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%load_ext line_profiler

# Notebook setup: TensorFlow logging, import paths, third-party/project imports,
# and the directory constants used by the cells below.
import tensorflow as tf
# Emit TensorFlow log messages at INFO level and above.
tf.logging.set_verbosity(tf.logging.INFO)
# NOTE(review): sess_config is created with default options and is not
# referenced again in the visible cells — confirm whether it is meant to be
# passed to a session (e.g. to configure GPU memory usage).
sess_config = tf.ConfigProto()

import sys
import os

# Dataset root handed to load_coco() when the COCO splits are loaded.
COCO_DATA = 'data/coco'
# Directory appended to sys.path below. Both paths are relative, so they are
# resolved against the current working directory.
MASK_RCNN_MODEL_PATH = 'lib/Mask_RCNN/'

# Must run before the `samples` / `mrcnn` imports that follow; presumably those
# packages live inside MASK_RCNN_MODEL_PATH — verify against the checkout.
# The membership check keeps re-running this cell from adding duplicates.
if MASK_RCNN_MODEL_PATH not in sys.path:
    sys.path.append(MASK_RCNN_MODEL_PATH)
    
from samples.coco import coco
from mrcnn import utils
from mrcnn import model as modellib
from mrcnn import visualize
    
# Project-local modules, aliased so they do not clash with the `mrcnn` names
# imported above (`utils`, `model`).
from lib import utils as siamese_utils
from lib import model as siamese_model
from lib import config as siamese_config
   
import time
import datetime
import random
import numpy as np
import skimage.io
import imgaug
import pickle
import matplotlib.pyplot as plt
from collections import OrderedDict

# Root directory of the project.
# This is the current working directory, so the notebook is assumed to be
# started from the project root — TODO confirm.
ROOT_DIR = os.getcwd()

# Directory to save logs and trained model (passed as model_dir when the model
# object is created).
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])






Using TensorFlow backend.


### Dataset

In [2]:
# train_classes = coco_nopascal_classes
# Class ids 1..80 inclusive; assigned to ACTIVE_CLASSES on both datasets.
train_classes = np.arange(1, 81)

In [3]:
def _load_indexed_coco(subsubset):
    """Build one indexed COCO 2017 dataset restricted to `train_classes`.

    Both splits are read from subset="train"; `subsubset` ("train" or "val")
    selects which part of it is loaded.

    Returns the prepared siamese_utils.IndexedCocoDataset with its indices
    built and ACTIVE_CLASSES set.
    """
    dataset = siamese_utils.IndexedCocoDataset()
    dataset.load_coco(COCO_DATA, subset="train", subsubset=subsubset, year="2017")
    dataset.prepare()
    dataset.build_indices()
    dataset.ACTIVE_CLASSES = train_classes
    return dataset


# Load COCO/train dataset
coco_train = _load_indexed_coco("train")

# Load COCO/val dataset
coco_val = _load_indexed_coco("val")

loading annotations into memory...
Done (t=14.13s)
creating index...
index created!
loading annotations into memory...
Done (t=20.51s)
creating index...
index created!


### Model

In [7]:
class SmallTrainConfig(siamese_config.Config):
    """Training configuration for the single-GPU ('small') model.

    Only the attributes listed here are overridden; every other setting is
    inherited from siamese_config.Config.
    """
    # Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 1
    # With GPU_COUNT = 1 this gives a batch size of 2.
    # NOTE(review): the earlier comment here said a 16GB GPU is required for a
    # batch_size of 12, which does not match this config — confirm the actual
    # memory requirement for batch size 2.
    IMAGES_PER_GPU = 2
    # presumably background + one foreground class — TODO confirm
    NUM_CLASSES = 1 + 1
    NAME = 'small_coco'
    EXPERIMENT = 'example'
    CHECKPOINT_DIR = 'checkpoints/'
    # Adapt loss weights (one weight per named loss term)
    LOSS_WEIGHTS = {'rpn_class_loss': 2.0, 
                    'rpn_bbox_loss': 0.1, 
                    'mrcnn_class_loss': 2.0, 
                    'mrcnn_bbox_loss': 0.5, 
                    'mrcnn_mask_loss': 1.0}
    
class LargeTrainConfig(siamese_config.Config):
    """Training configuration for the multi-GPU ('large') model.

    Only the attributes listed here are overridden; every other setting is
    inherited from siamese_config.Config.
    """
    # Batch size = GPU_COUNT * IMAGES_PER_GPU
    GPU_COUNT = 4
    IMAGES_PER_GPU = 3 # 4 16GB GPUs are required for a batch_size of 12
    # presumably background + one foreground class — TODO confirm
    NUM_CLASSES = 1 + 1
    NAME = 'large_coco'
    EXPERIMENT = 'example'
    CHECKPOINT_DIR = 'checkpoints/'
    # Image sizes.
    # NOTE(review): this section was labelled "Reduced image sizes", but
    # IMAGE_MIN_DIM / IMAGE_MAX_DIM are larger than the 400 / 512 that
    # config.display() prints for the small config. The TARGET_* defaults are
    # not visible in this notebook — confirm.
    TARGET_MAX_DIM = 192
    TARGET_MIN_DIM = 150
    IMAGE_MIN_DIM = 800
    IMAGE_MAX_DIM = 1024
    # Model size.
    # NOTE(review): previously labelled "Reduce model size", but
    # FPN_CLASSIF_FC_LAYERS_SIZE is larger than the 512 printed for the small
    # config, and FPN_FEATUREMAPS equals the printed 256.
    FPN_CLASSIF_FC_LAYERS_SIZE = 1024
    FPN_FEATUREMAPS = 256
    # Number of anchors / rois / instances at each stage.
    # NOTE(review): previously labelled "Reduce number of rois at all stages",
    # but DETECTION_MAX_INSTANCES is larger than the 30 printed for the small
    # config; the other defaults are not visible here — confirm.
    RPN_ANCHOR_STRIDE = 1
    RPN_TRAIN_ANCHORS_PER_IMAGE = 256
    POST_NMS_ROIS_TRAINING = 2000
    POST_NMS_ROIS_INFERENCE = 1000
    TRAIN_ROIS_PER_IMAGE = 200
    DETECTION_MAX_INSTANCES = 100
    MAX_GT_INSTANCES = 100
    # NMS threshold (same value as the 0.5 printed for the small config)
    DETECTION_NMS_THRESHOLD = 0.5
    # Adapt loss weights (one weight per named loss term)
    LOSS_WEIGHTS = {'rpn_class_loss': 2.0, 
                    'rpn_bbox_loss': 0.1, 
                    'mrcnn_class_loss': 2.0, 
                    'mrcnn_bbox_loss': 0.5, 
                    'mrcnn_mask_loss': 1.0}

#### Decide between small and large model

In [8]:
# The small model trains on a single GPU and runs much faster.
# The large model is the same one we used in our experiments, but it needs
# multiple GPUs and more training time.
# Only 'small' and 'large' are recognised when the config is selected below.
model_size = 'small' # or 'large'

In [9]:
# Pick the training config that matches `model_size`.
# An unrecognised value raises immediately; otherwise `config` would stay
# undefined (or keep a stale value from an earlier run of this cell) and the
# error would only surface later, or not at all.
_config_classes = {'small': SmallTrainConfig, 'large': LargeTrainConfig}
if model_size not in _config_classes:
    raise ValueError(
        "model_size must be one of {}, got {!r}".format(sorted(_config_classes), model_size)
    )
config = _config_classes[model_size]()

# Print the resolved configuration values for the record.
config.display()


Configurations:
BACKBONE                       resnet50
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     2
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
CHECKPOINT_DIR                 checkpoints/
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        30
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.5
EXPERIMENT                     example
FPN_CLASSIF_FC_LAYERS_SIZE     512
FPN_FEATUREMAPS                256
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 2
IMAGE_MAX_DIM                  512
IMAGE_META_SIZE                14
IMAGE_MIN_DIM                  400
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [512 512   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.02
LOSS_WEIGHTS                   {'rpn_class_loss': 2.0, 'rpn_bbox_loss': 0.1, 'mrcnn_class_loss': 2.0, 'mrcnn_bbox_loss': 0.5, 

In [10]:
# Create the model object in training mode (mode="training"); MODEL_DIR is
# passed as model_dir and the config selected above is passed as config.
model = siamese_model.SiameseMaskRCNN(mode="training", model_dir=MODEL_DIR, config=config)

### Training

In [11]:
# Training schedule, in execution order. Each key is the epoch number that the
# training loop passes to model.train() as `epochs` ("until epoch N"); each
# value holds the learning rate and the layer selection for that stage.
# The last stage uses a tenth of the base learning rate.
train_schedule = OrderedDict([
    (1, {"learning_rate": config.LEARNING_RATE, "layers": "heads"}),
    (120, {"learning_rate": config.LEARNING_RATE, "layers": "all"}),
    (160, {"learning_rate": config.LEARNING_RATE/10, "layers": "all"}),
])

In [13]:
# Try to continue from the latest checkpoint for this schedule; if that fails,
# fall back to the ImageNet-pretrained weights.
try:
    model.load_latest_checkpoint(training_schedule=train_schedule)
except Exception as err:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate. The reason is printed so that a real
    # failure is not silently hidden behind the fallback.
    print("could not load latest checkpoint ({!r}); falling back to imagenet weights".format(err))
    model.load_imagenet_weights(pretraining='imagenet-687')

initializing from imagenet weights ...


In [14]:
# Run the stages of train_schedule in order. For each stage the key is passed
# to model.train() as `epochs` and the value supplies the learning rate and
# the layer selection.
for epochs, parameters in train_schedule.items():
    stage_layers = parameters["layers"]
    stage_learning_rate = parameters["learning_rate"]

    # Blank line, then a one-line summary of the stage about to run.
    print("")
    stage_message = "training layers {} until epoch {} with learning_rate {}"
    print(stage_message.format(stage_layers, epochs, stage_learning_rate))

    model.train(
        coco_train,
        coco_val,
        learning_rate=stage_learning_rate,
        epochs=epochs,
        layers=stage_layers,
    )


training layers heads until epoch 1 with learning_rate 0.02

Starting at epoch 0. LR=0.02

Checkpoint Path: /data/lmp/code/siamese-mask-rcnn/logs/siamese_mrcnn_small_coco_example/siamese_mrcnn_{epoch:04d}.h5


ResourceExhaustedError: OOM when allocating tensor with shape[3,3,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training/SGD/Variable_10/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training/SGD/Variable_10, training/SGD/zeros_10)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'training/SGD/Variable_10/Assign', defined at:
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/asyncio/base_events.py", line 442, in run_forever
    self._run_once()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/asyncio/base_events.py", line 1462, in _run_once
    handle._run()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tornado/gen.py", line 162, in _fake_ctx_run
    return f(*args, **kw)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2867, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2895, in _run_cell
    return runner(coro)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3072, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3263, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-fc3776d04725>", line 9, in <module>
    layers=parameters["layers"])
  File "/data/lmp/code/siamese-mask-rcnn/lib/model.py", line 707, in train
    use_multiprocessing=True,
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/engine/training.py", line 2080, in fit_generator
    self._make_train_function()
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/engine/training.py", line 992, in _make_train_function
    loss=self.total_loss)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/optimizers.py", line 182, in get_updates
    moments = [K.zeros(shape) for shape in shapes]
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/optimizers.py", line 182, in <listcomp>
    moments = [K.zeros(shape) for shape in shapes]
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 695, in zeros
    return variable(v, dtype=dtype, name=name)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 396, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 387, in _init_from_args
    validate_shape=validate_shape).op
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 283, in assign
    validate_shape=validate_shape)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
    use_locking=use_locking, name=name)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/data/lmp/anaconda3/envs/siamese-mask-rcnn/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[3,3,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: training/SGD/Variable_10/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training/SGD/Variable_10, training/SGD/zeros_10)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

