## Import Packages

In [1]:
import _init_path
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as lr_sched
import torch.nn as nn
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
import os
import argparse
import logging
from functools import partial

from lib.net.point_rcnn import PointRCNN
import lib.net.train_functions as train_functions
from lib.datasets.kitti_rcnn_dataset import KittiRCNNDataset
from lib.config import cfg, cfg_from_file, save_config_to_file
import tools.train_utils.train_utils as train_utils
from tools.train_utils.fastai_optim import OptimWrapper
from tools.train_utils import learning_schedules_fastai as lsf

In [2]:
import inspect
inspect.getsourcefile(logging)

'/home/lingling/anaconda3/envs/pytorch/lib/python3.7/logging/__init__.py'

## functions

In [3]:
def create_logger(log_file):
    log_format = '%(asctime)s  %(levelname)5s  %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=log_format, filename=log_file)
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(logging.Formatter(log_format))
    logging.getLogger(__name__).addHandler(console)
    return logging.getLogger(__name__)

In [4]:
def create_dataloader(logger):
    DATA_PATH = os.path.join('../', 'data')

    # create dataloader
    train_set = KittiRCNNDataset(root_dir=DATA_PATH, npoints=cfg.RPN.NUM_POINTS, split=cfg.TRAIN.SPLIT, mode='TRAIN',
                                 logger=logger,
                                 classes=cfg.CLASSES,
                                 rcnn_training_roi_dir=args.rcnn_training_roi_dir,
                                 rcnn_training_feature_dir=args.rcnn_training_feature_dir,
                                 gt_database_dir=args.gt_database)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, pin_memory=True,
                              num_workers=args.workers, shuffle=True, collate_fn=train_set.collate_batch,
                              drop_last=True)

    if args.train_with_eval:
        test_set = KittiRCNNDataset(root_dir=DATA_PATH, npoints=cfg.RPN.NUM_POINTS, split=cfg.TRAIN.VAL_SPLIT, mode='EVAL',
                                    logger=logger,
                                    classes=cfg.CLASSES,
                                    rcnn_eval_roi_dir=args.rcnn_eval_roi_dir,
                                    rcnn_eval_feature_dir=args.rcnn_eval_feature_dir)
        test_loader = DataLoader(test_set, batch_size=1, shuffle=True, pin_memory=True,
                                 num_workers=args.workers, collate_fn=test_set.collate_batch)
    else:
        test_loader = None
    return train_loader, test_loader

In [5]:
def create_optimizer(model):

    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=cfg.TRAIN.LR, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    elif cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                              momentum=cfg.TRAIN.MOMENTUM)
    elif cfg.TRAIN.OPTIMIZER == 'adam_onecycle':
        def children(m: nn.Module):
            return list(m.children())

        def num_children(m: nn.Module) -> int:
            return len(children(m))

        flatten_model = lambda m: sum(map(flatten_model, m.children()), []) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]

        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(
            optimizer_func, 3e-3, get_layer_groups(model), wd=cfg.TRAIN.WEIGHT_DECAY, true_wd=True, bn_wd=True
        )

        # fix rpn: do this since we use costomized optimizer.step
        if cfg.RPN.ENABLED and cfg.RPN.FIXED:
            for param in model.rpn.parameters():
                param.requires_grad = False
    else:
        raise NotImplementedError

    return optimizer

In [6]:

def create_scheduler(optimizer, total_steps, last_epoch):
    def lr_lbmd(cur_epoch):
        cur_decay = 1
        for decay_step in cfg.TRAIN.DECAY_STEP_LIST:
            if cur_epoch >= decay_step:
                cur_decay = cur_decay * cfg.TRAIN.LR_DECAY
        return max(cur_decay, cfg.TRAIN.LR_CLIP / cfg.TRAIN.LR)

    def bnm_lmbd(cur_epoch):
        cur_decay = 1
        for decay_step in cfg.TRAIN.BN_DECAY_STEP_LIST:
            if cur_epoch >= decay_step:
                cur_decay = cur_decay * cfg.TRAIN.BN_DECAY
        return max(cfg.TRAIN.BN_MOMENTUM * cur_decay, cfg.TRAIN.BNM_CLIP)

    if cfg.TRAIN.OPTIMIZER == 'adam_onecycle':
        lr_scheduler = lsf.OneCycle(
            optimizer, total_steps, cfg.TRAIN.LR, list(cfg.TRAIN.MOMS), cfg.TRAIN.DIV_FACTOR, cfg.TRAIN.PCT_START
        )
    else:
        lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lbmd, last_epoch=last_epoch)

    bnm_scheduler = train_utils.BNMomentumScheduler(model, bnm_lmbd, last_epoch=last_epoch)
    return lr_scheduler, bnm_scheduler


## args input

In [7]:
from easydict import EasyDict as edict

args = edict()

args.cfg_file = 'cfgs/default.yaml'
args.train_mode = 'rpn'
args.batch_size = 4
args.epochs = 200


args.workers = 8
args.ckpt_save_interval = 5
args.output_dir = 'train_rcnn2'
args.mgpus = False


args.ckpt = None
args.rpn_ckpt = None

args.gt_database = 'gt_database/train_gt_database_3level_Car.pkl'
args.rcnn_training_roi_dir = None
args.rcnn_training_feature_dir = None


args.train_with_eval = False
args.rcnn_eval_roi_dir = None 
args.rcnn_eval_feature_dir = None

## Main Codes

In [8]:
print("args.cfg_file= " + args.cfg_file)
if args.cfg_file is not None:
    cfg_from_file(args.cfg_file)
cfg.TAG = os.path.splitext(os.path.basename(args.cfg_file))[0]

args.cfg_file= cfgs/default.yaml


  yaml_cfg = edict(yaml.load(f))


In [9]:
print("args.train_mode = " + args.train_mode)
if args.train_mode == 'rpn':
    cfg.RPN.ENABLED = True
    cfg.RCNN.ENABLED = False
    root_result_dir = os.path.join('../', 'output', 'rpn', cfg.TAG)
elif args.train_mode == 'rcnn':
    cfg.RCNN.ENABLED = True
    cfg.RPN.ENABLED = cfg.RPN.FIXED = True
    root_result_dir = os.path.join('../', 'output', 'rcnn', cfg.TAG)
elif args.train_mode == 'rcnn_offline':
    cfg.RCNN.ENABLED = True
    cfg.RPN.ENABLED = False
    root_result_dir = os.path.join('../', 'output', 'rcnn', cfg.TAG)
else:
    raise NotImplementedError

args.train_mode = rpn


In [10]:
print("args.output_dir = " + args.output_dir)
if args.output_dir is not None:
    root_result_dir = args.output_dir
os.makedirs(root_result_dir, exist_ok=True)

args.output_dir = train_rcnn2


In [11]:
log_file = os.path.join(root_result_dir, 'log_train.txt')
logger = create_logger(log_file)
logger.info('**********************Start logging**********************')

# log to file
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys() else 'ALL'
logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

for key, val in vars(args).items():
    logger.info("{:16} {}".format(key, val))
    
save_config_to_file(cfg, logger=logger)

2019-11-05 17:00:51,624   INFO  **********************Start logging**********************
2019-11-05 17:00:51,624   INFO  CUDA_VISIBLE_DEVICES=ALL
2019-11-05 17:00:51,624   INFO  cfg_file         cfgs/default.yaml
2019-11-05 17:00:51,625   INFO  train_mode       rpn
2019-11-05 17:00:51,625   INFO  batch_size       4
2019-11-05 17:00:51,625   INFO  epochs           200
2019-11-05 17:00:51,626   INFO  workers          8
2019-11-05 17:00:51,626   INFO  ckpt_save_interval 5
2019-11-05 17:00:51,626   INFO  output_dir       train_rcnn2
2019-11-05 17:00:51,626   INFO  mgpus            False
2019-11-05 17:00:51,627   INFO  ckpt             None
2019-11-05 17:00:51,627   INFO  rpn_ckpt         None
2019-11-05 17:00:51,627   INFO  gt_database      gt_database/train_gt_database_3level_Car.pkl
2019-11-05 17:00:51,628   INFO  rcnn_training_roi_dir None
2019-11-05 17:00:51,628   INFO  rcnn_training_feature_dir None
2019-11-05 17:00:51,628   INFO  train_with_eval  False
2019-11-05 17:00:51,628   INFO

2019-11-05 17:00:51,664   INFO  
cfg.TEST = edict()
2019-11-05 17:00:51,665   INFO  cfg.TEST.SPLIT: val
2019-11-05 17:00:51,665   INFO  cfg.TEST.RPN_PRE_NMS_TOP_N: 9000
2019-11-05 17:00:51,665   INFO  cfg.TEST.RPN_POST_NMS_TOP_N: 100
2019-11-05 17:00:51,665   INFO  cfg.TEST.RPN_NMS_THRESH: 0.8
2019-11-05 17:00:51,666   INFO  cfg.TEST.RPN_DISTANCE_BASED_PROPOSE: True


In [12]:
# tensorboard log
tb_log = SummaryWriter(log_dir=os.path.join(root_result_dir, 'tensorboard'))

In [13]:
# create dataloader & network & optimizer
train_loader, test_loader = create_dataloader(logger)
model = PointRCNN(num_classes=train_loader.dataset.num_class, use_xyz=True, mode='TRAIN')
optimizer = create_optimizer(model)

2019-11-05 17:00:51,864   INFO  Loading gt_database(easy(pt_num>100): 5651, hard(pt_num<=100): 5366) from gt_database/train_gt_database_3level_Car.pkl
2019-11-05 17:00:51,865   INFO  Loading TRAIN samples from ../data/KITTI/object/training/label_2 ...
2019-11-05 17:00:52,447   INFO  Done: filter TRAIN results: 3265 / 3712



In [14]:
print("args.mgpus = "+ str(args.mgpus))
if args.mgpus:
    model = nn.DataParallel(model)
model.cuda()

args.mgpus = False


PointRCNN(
  (rpn): RPN(
    (backbone_net): Pointnet2MSG(
      (SA_modules): ModuleList(
        (0): PointnetSAModuleMSG(
          (groupers): ModuleList(
            (0): QueryAndGroup()
            (1): QueryAndGroup()
          )
          (mlps): ModuleList(
            (0): SharedMLP(
              (layer0): Conv2d(
                (conv): Conv2d(3, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): BatchNorm2d(
                  (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                )
                (activation): ReLU(inplace=True)
              )
              (layer1): Conv2d(
                (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): BatchNorm2d(
                  (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                )
                (activation): ReLU(inplace=True)
              )
              (layer2): Co

In [15]:
# load checkpoint if it is possible
start_epoch = it = 0
last_epoch = -1
if args.ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    it, start_epoch = train_utils.load_checkpoint(pure_model, optimizer, filename=args.ckpt, logger=logger)
    last_epoch = start_epoch + 1
    
print("args.ckpt = "+ str(args.ckpt))

args.ckpt = None


In [16]:
lr_scheduler, bnm_scheduler = create_scheduler(optimizer, total_steps=len(train_loader) * args.epochs,
                                               last_epoch=last_epoch)

In [17]:
if args.rpn_ckpt is not None:
    pure_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    total_keys = pure_model.state_dict().keys().__len__()
    train_utils.load_part_ckpt(pure_model, filename=args.rpn_ckpt, logger=logger, total_keys=total_keys)
    
    
print("args.rpn_ckpt =  " + str(args.rpn_ckpt))

args.rpn_ckpt =  None


In [18]:
if cfg.TRAIN.LR_WARMUP and cfg.TRAIN.OPTIMIZER != 'adam_onecycle':
    lr_warmup_scheduler = train_utils.CosineWarmupLR(optimizer, T_max=cfg.TRAIN.WARMUP_EPOCH * len(train_loader),
                                                  eta_min=cfg.TRAIN.WARMUP_MIN)
else:
    lr_warmup_scheduler = None
    
print("if = " + str(cfg.TRAIN.LR_WARMUP and cfg.TRAIN.OPTIMIZER != 'adam_onecycle'))

if = False


In [19]:
# start training
logger.info('**********************Start training**********************')
ckpt_dir = os.path.join(root_result_dir, 'ckpt')
os.makedirs(ckpt_dir, exist_ok=True)
trainer = train_utils.Trainer(
    model,
    train_functions.model_joint_fn_decorator(),
    optimizer,
    ckpt_dir=ckpt_dir,
    lr_scheduler=lr_scheduler,
    bnm_scheduler=bnm_scheduler,
    model_fn_eval=train_functions.model_joint_fn_decorator(),
    tb_log=tb_log,
    eval_frequency=1,
    lr_warmup_scheduler=lr_warmup_scheduler,
    warmup_epoch=cfg.TRAIN.WARMUP_EPOCH,
    grad_norm_clip=cfg.TRAIN.GRAD_NORM_CLIP
)

2019-11-05 17:01:20,593   INFO  **********************Start training**********************


In [None]:
trainer.train(
    it,
    start_epoch,
    args.epochs,
    train_loader,
    test_loader,
    ckpt_save_interval=args.ckpt_save_interval,
    lr_scheduler_each_iter=(cfg.TRAIN.OPTIMIZER == 'adam_onecycle')
)

logger.info('**********************End training**********************')