# Setup Environment


Import required packages:

In [None]:
import os

# Setup detectron2
import detectron2
from detectron2.utils.logger import setup_logger

import numpy as np
import json, random, shutil

import torch

from detectron2 import model_zoo
from detectron2.engine import DefaultTrainer, DefaultPredictor, HookBase, launch
from detectron2.engine import default_setup, default_argument_parser
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, DatasetCatalog, build_detection_train_loader
from detectron2.data.datasets import register_coco_instances
import detectron2.utils.comm as comm

Global variables and settings:


In [None]:
# Dataset locations: the train and validation annotation files sit in the same
# directory and both point at the same image folder.
path_train = "/thecube/students/jravagli/datasets/train"
path_train_json = os.path.join(path_train, "train.json")
path_train_images = os.path.join(path_train, "image")
path_val_json = os.path.join(path_train, "valid.json")
path_val_images = os.path.join(path_train, "image")
# path_output_dir = "/thecube/students/jravagli/outputs/detectron"

# Dataset facts
n_train_images = 163173  # Approximate number of training images
n_classes = 13  # Number of classes of the training set

# Optimiser settings
lr = 0.02  # Alternative noted by the author: 2.5e-4 (suggested by detectron tutorial)
batch_size = 8
weight_decay = 1e-5

# Training length: enough iterations for the model to see the whole training
# set `epochs` times.
epochs = 12
iterations = (epochs * n_train_images) // batch_size
# LR is reduced by a gamma factor after 8 and 11 epochs (expressed in iterations)
scheduler_steps = tuple(
    epoch * n_train_images // batch_size for epoch in (8, 11)
)

# Run settings
resume_training = True
num_gpus = 1

Clear output directory:

In [None]:
# if not resume_training and os.path.isdir(path_output_dir):
#     shutil.rmtree(path_output_dir)
# os.makedirs(path_output_dir, exist_ok=True)

# Training


Define a hook to monitor the validation loss during training ([GitHub issue](https://github.com/facebookresearch/detectron2/issues/810#issuecomment-596194293)):

In [None]:
class ValidationLoss(HookBase):
    """Trainer hook that records the model's losses on validation batches.

    A private copy of the config is made whose ``DATASETS.TRAIN`` entry is
    replaced by ``DATASETS.VAL``, so that the regular training data loader
    can be used to draw batches from the validation dataset.
    """

    def __init__(self, cfg):
        """Clone ``cfg``, redirect it to the validation set and open a loader.

        Args:
            cfg: Config node that must provide ``DATASETS.VAL``.
        """
        super().__init__()
        val_cfg = cfg.clone()
        val_cfg.DATASETS.TRAIN = cfg.DATASETS.VAL
        self.cfg = val_cfg
        # NOTE(review): presumably the train loader yields batches endlessly,
        # so ``next`` never exhausts it -- confirm against detectron2.
        self._loader = iter(build_detection_train_loader(val_cfg))

    def after_step(self):
        """Compute the losses on one validation batch and log them.

        Each individual loss is stored under its name prefixed with ``val_``
        and their sum under ``total_val_loss``. Only the main process writes
        to the trainer's storage.
        """
        batch = next(self._loader)
        # No gradients are needed: the losses are only monitored.
        with torch.no_grad():
            raw_losses = self.trainer.model(batch)

            total = sum(raw_losses.values())
            assert torch.isfinite(total).all(), raw_losses

            # Combine the losses across processes before converting to floats.
            reduced = comm.reduce_dict(raw_losses)
            val_scalars = {
                "val_" + name: value.item() for name, value in reduced.items()
            }
            total_reduced = sum(val_scalars.values())
            if comm.is_main_process():
                self.trainer.storage.put_scalars(
                    total_val_loss=total_reduced, **val_scalars
                )

Define the training procedure (supports multiple GPUs):

In [None]:
def setup(args):
    """Build the base config from a model-zoo file plus command-line overrides.

    Args:
        args: Namespace providing ``file_config`` (model-zoo config name) and
            ``opts`` (list of overrides merged on top of the file); it is
            also forwarded to ``default_setup``.

    Returns:
        The resulting config node.
    """
    cfg = get_cfg()
    zoo_config_path = model_zoo.get_config_file(args.file_config)
    cfg.merge_from_file(zoo_config_path)
    # Overrides are applied after the file so that they take precedence.
    cfg.merge_from_list(args.opts)
    default_setup(cfg, args)
    return cfg


def main(args):
    """Build the training config, register the datasets and run training.

    Args:
        args: Namespace carrying the arguments consumed by ``setup`` plus the
            attributes read here: ``resume_training``, ``file_config``,
            ``batch_size``, ``iterations``, ``lr``, ``scheduler_steps``,
            ``weight_decay``, ``n_classes``, ``path_train_json``,
            ``path_train_images``, ``path_val_json`` and ``path_val_images``.

    Returns:
        Whatever ``trainer.train()`` returns.
    """
    # Define model configuration
    cfg = setup(args)
    cfg.DATASETS.TRAIN = ("deepfashion_train",)
    # Extra key read by ValidationLoss to locate the validation dataset.
    cfg.DATASETS.VAL = ("deepfashion_val",)
    cfg.DATASETS.TEST = ()
    cfg.DATALOADER.NUM_WORKERS = 8
    if args.resume_training:
        # NOTE(review): no initial weights are configured when resuming;
        # presumably resume_or_load() below restores the latest checkpoint --
        # confirm what happens when no checkpoint exists yet.
        cfg.MODEL.WEIGHTS = None
    else:
        # Let training initialize from model zoo
        cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(args.file_config)

    cfg.SOLVER.IMS_PER_BATCH = args.batch_size
    cfg.SOLVER.MAX_ITER = args.iterations    # Number of batch updates
    cfg.SOLVER.BASE_LR = args.lr
    cfg.SOLVER.MOMENTUM = 0.9
    cfg.SOLVER.GAMMA = 0.1
    # The iteration numbers at which the learning rate is decreased by GAMMA
    cfg.SOLVER.STEPS = args.scheduler_steps
    cfg.SOLVER.WEIGHT_DECAY = args.weight_decay

    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128  # RoI batch size
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = args.n_classes
    # Do NOT set cfg.OUTPUT_DIR here: the author observed an error caused by
    # conflicting access to files when it was specified.
#     cfg.OUTPUT_DIR = args.path_output_dir

    # Register the datasets. A name can only be registered once per process,
    # so names that are already present are skipped; this allows main() to be
    # called again in the same process (e.g. re-running the notebook cell).
    already_registered = set(DatasetCatalog.list())
    datasets = (
        ("deepfashion_train", args.path_train_json, args.path_train_images),
        ("deepfashion_val", args.path_val_json, args.path_val_images),
    )
    for name, json_file, image_root in datasets:
        if name not in already_registered:
            register_coco_instances(name, {}, json_file, image_root)

    # Train
    trainer = DefaultTrainer(cfg)

    val_loss = ValidationLoss(cfg)
    trainer.register_hooks([val_loss])
    # Swap the order of PeriodicWriter and ValidationLoss (the last two hooks)
    # NOTE(review): assumes the last hook before registration is the
    # PeriodicWriter -- confirm for non-main processes.
    trainer._hooks = trainer._hooks[:-2] + trainer._hooks[-2:][::-1]

    trainer.resume_or_load(resume=args.resume_training)
    return trainer.train()


if __name__ == "__main__":
    # Start from detectron2's standard command-line arguments, then attach the
    # notebook-level settings so that main() receives everything through a
    # single namespace. The keyword order below is the order in which the
    # attributes are added to the namespace.
    args = default_argument_parser().parse_args()

    vars(args).update(
        file_config="COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml",
        path_train=path_train,
        path_train_images=path_train_images,
        path_train_json=path_train_json,
        path_val_images=path_val_images,
        path_val_json=path_val_json,
        # path_output_dir=path_output_dir,
        lr=lr,
        batch_size=batch_size,
        n_train_images=n_train_images,
        epochs=epochs,
        iterations=iterations,
        n_classes=n_classes,
        scheduler_steps=scheduler_steps,
        weight_decay=weight_decay,
        resume_training=resume_training,
        # Replaces any --num-gpus value given on the command line.
        num_gpus=num_gpus,
    )
    print("Command Line Args:", args)

    # Hand main() to detectron2's launcher together with the process layout.
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )
