In [1]:
import os
import os.path as osp

from mmengine.config import Config, DictAction
from mmengine.registry import RUNNERS
from mmengine.runner import Runner

from mmdet.utils import setup_cache_size_limit_of_dynamo

from pprint import pprint

### Arguments

In [2]:
def parse_args():
    """Build the training options as a plain dict (notebook stand-in for CLI flags).

    Returns:
        dict: with the keys ``config``, ``work_dir``, ``amp``,
        ``auto_scale_lr``, ``resume``, ``cfg_options``, ``launcher`` and
        ``local_rank``. ``local_rank`` is always returned as an ``int``.

    Side effects:
        Exports ``LOCAL_RANK`` to ``os.environ`` (as a string) when the
        variable is not already defined, mirroring what the command-line
        training script does. An existing value is left untouched.
    """
    args = dict()

    # train config file path
    args['config'] = "/home/dmsai2/mmdetection/my_configs/faster-renn_r101_fpn_1x_coco.py"
    # the dir to save logs and models
    args['work_dir'] = "/home/dmsai2/mmdetection/work_dir/"
    # enable automatic-mixed-precision training
    args['amp'] = False
    # enable automatically scaling LR
    args['auto_scale_lr'] = True
    # None   -> do not resume (start a fresh run)
    # 'auto' -> resume with no explicit checkpoint (auto resume from the
    #           latest checkpoint in the work directory)
    # path   -> resume from that checkpoint file
    args['resume'] = None
    # override some settings in the used config (dict or None)
    args['cfg_options'] = None
    # job launcher; change the index to pick another one
    args['launcher'] = ['none', 'pytorch', 'slurm', 'mpi'][0]
    # local rank of the process
    args['local_rank'] = 0

    # Make the rank visible through the environment when no launcher has set
    # it already; never overwrite a value provided by a launcher.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args['local_rank'])

    return args

In [3]:
# Build the option dict once and print it so the settings used for this run
# are recorded in the cell output.
args = parse_args()
pprint(args, indent=2)

# Optional (currently disabled): reduce the number of repeated compilations
# and improve training speed. Uncomment the call below to enable it.
# setup_cache_size_limit_of_dynamo()

{ 'amp': False,
  'auto_scale_lr': True,
  'cfg_options': None,
  'config': '/home/dmsai2/mmdetection/my_configs/faster-renn_r101_fpn_1x_coco.py',
  'launcher': 'none',
  'local_rank': 0,
  'resume': None,
  'work_dir': '/home/dmsai2/mmdetection/work_dir/'}


### Load Config

In [4]:
# Load the config file named in args['config'], record the chosen launcher on
# it, and merge the optional overrides on top (skipped when cfg_options is None).
cfg = Config.fromfile(args['config'])
cfg.launcher = args['launcher']
if args['cfg_options'] is not None:
    cfg.merge_from_dict(args['cfg_options'])

In [5]:
# Pretty-print the loaded config, converted to a plain dict, for inspection.
pprint(cfg.to_dict(), indent=2)

{ 'auto_scale_lr': {'base_batch_size': 16, 'enable': False},
  'backend_args': None,
  'classes': ('tooth',),
  'data_root': '/home/dmsai2/mmdetection/data/tooth/',
  'dataset_type': 'CocoDataset',
  'default_hooks': { 'checkpoint': { 'by_epoch': True,
                                     'interval': 8,
                                     'type': 'CheckpointHook'},
                     'logger': {'interval': 50, 'type': 'LoggerHook'},
                     'param_scheduler': {'type': 'ParamSchedulerHook'},
                     'sampler_seed': {'type': 'DistSamplerSeedHook'},
                     'timer': {'type': 'IterTimerHook'},
                     'visualization': {'type': 'DetVisualizationHook'}},
  'default_scope': 'mmdet',
  'env_cfg': { 'cudnn_benchmark': False,
               'dist_cfg': {'backend': 'nccl'},
               'mp_cfg': {'mp_start_method': 'fork', 'opencv_num_threads': 0}},
  'launcher': 'none',
  'load_from': None,
  'log_level': 'INFO',
  'log_processor': { 'by_

### Set Work Directory

In [6]:
# Resolve the output directory, in decreasing priority:
#   1. args['work_dir'] when it is set,
#   2. the work_dir already present in the config,
#   3. ./work_dirs/<config file name without extension>.
if args['work_dir'] is None:
    if cfg.get('work_dir', None) is None:
        # nothing configured anywhere: derive a folder from the config filename
        cfg.work_dir = osp.join(
            './work_dirs', osp.splitext(osp.basename(args['config']))[0])
else:
    # an explicit argument always wins over the config file
    cfg.work_dir = args['work_dir']

print("work_dir:", cfg.work_dir)

work_dir: /home/dmsai2/mmdetection/work_dir/


### Set Dataset

In [7]:
# Optional override of the dataset root (disabled; example of a Windows path):
# cfg.data_root = "C:\\Users\\uamdt3\\Desktop\\mmdetection\\data\\coco\\" # the root of the dataset
# Show the dataset root that the config currently points to.
print("data_root:", cfg.data_root)

data_root: /home/dmsai2/mmdetection/data/tooth/


### Set AMP (automatic mixed precision)

In [8]:
# enable automatic-mixed-precision training
# Only an exact boolean True switches the optimizer wrapper to the AMP variant
# with dynamic loss scaling; any other value leaves the config untouched.
if args['amp'] is True:
    cfg.optim_wrapper.type = 'AmpOptimWrapper'
    cfg.optim_wrapper.loss_scale = 'dynamic'

### Set Auto-Scale-LR Scheduler

In [9]:
# Turn on automatic learning-rate scaling when requested. The config must
# already carry an `auto_scale_lr` section with both `enable` and
# `base_batch_size`; otherwise fail loudly instead of training silently
# without scaling.
if args['auto_scale_lr']:
    if ('auto_scale_lr' not in cfg
            or 'enable' not in cfg.auto_scale_lr
            or 'base_batch_size' not in cfg.auto_scale_lr):
        raise RuntimeError('Can not find "auto_scale_lr" or '
                           '"auto_scale_lr.enable" or '
                           '"auto_scale_lr.base_batch_size" in your'
                           ' configuration file.')
    cfg.auto_scale_lr.enable = True

print("auto_scale_lr:", cfg.auto_scale_lr)

auto_scale_lr: {'enable': True, 'base_batch_size': 16}


### Resume from checkpoint

In [10]:
# Resume handling: any non-None value of args['resume'] enables resuming.
# The special value 'auto' clears load_from (no explicit checkpoint); any
# other value is used as the checkpoint to load.
if args['resume'] is not None:
    cfg.resume = True
    cfg.load_from = None if args['resume'] == 'auto' else args['resume']

### Set Runner type (default is None)

In [11]:
# Create the runner. A config that declares `runner_type` is built through
# the RUNNERS registry (customized runner); every other config gets the
# default Runner.
if 'runner_type' in cfg:
    runner = RUNNERS.build(cfg)
else:
    runner = Runner.from_cfg(cfg)

pprint(runner, indent=2)

06/07 12:49:01 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.8.19 (default, Mar 20 2024, 19:58:24) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 1896563543
    GPU 0: NVIDIA GeForce RTX 3070
    CUDA_HOME: /usr/local/cuda
    NVCC: Cuda compilation tools, release 11.8, V11.8.89
    GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
    PyTorch: 2.3.0+cu118
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.3.6 (Git Hash 86e6af5974177e513fd3fee58425e1063e7f1361)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 11.8
  - NVCC architecture flags: -gencode;arch=comput

### Start Training

In [12]:
# Launch training with the runner built above.
# NOTE(review): the log recorded for this run reports `loss: nan` and
# `loss_bbox: nan` at the first logged iterations — verify the bbox
# annotations and learning-rate settings before trusting the checkpoints.
runner.train()

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!
06/07 12:49:03 - mmengine - [4m[97mINFO[0m - LR is set based on batch size of 16 and the current batch size is 16. Scaling the original LR by 1.0.
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
06/07 12:49:03 - mmengine - [4m[97mINFO[0m - load model from: torchvision://resnet101
06/07 12:49:03 - mmengine - [4m[97mINFO[0m - Loads checkpoint by torchvision backend from path: torchvision://resnet101


Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /home/dmsai2/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth



unexpected key in source state_dict: fc.weight, fc.bias

06/07 12:49:09 - mmengine - [4m[97mINFO[0m - Checkpoints will be saved to /home/dmsai2/mmdetection/work_dir.


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


06/07 12:49:28 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][ 50/129]  lr: 9.9098e-05  eta: 0:40:11  time: 0.3926  data_time: 0.0186  memory: 3197  loss: nan  loss_rpn_cls: 0.4221  loss_rpn_bbox: 0.0869  loss_cls: 0.3334  acc: 89.5752  loss_bbox: nan
06/07 12:49:46 - mmengine - [4m[97mINFO[0m - Epoch(train)  [1][100/129]  lr: 1.9920e-04  eta: 0:38:18  time: 0.3621  data_time: 0.0105  memory: 3197  loss: nan  loss_rpn_cls: 0.1899  loss_rpn_bbox: 0.0725  loss_cls: 0.3461  acc: 83.2520  loss_bbox: nan
06/07 12:49:57 - mmengine - [4m[97mINFO[0m - Exp name: faster-renn_r101_fpn_1x_coco_20240607_124900
06/07 12:50:01 - mmengine - [4m[97mINFO[0m - Epoch(val)  [1][50/68]    eta: 0:00:01  time: 0.0760  data_time: 0.0234  memory: 3197  
06/07 12:50:02 - mmengine - [4m[97mINFO[0m - Evaluating bbox...
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=2.99s).
Accumulating evaluati

FasterRCNN(
  (data_preprocessor): DetDataPreprocessor()
  (backbone): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): ResLayer(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=Tru