In [1]:
import argparse
import json
import os
import sys
import time

from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning, NumbaWarning
import warnings
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaWarning)

import numpy as np
import torch
import yaml
from det3d.datasets import build_dataset
from det3d.models import build_detector
from det3d.torchie import Config
from det3d.torchie.apis import (
    build_optimizer,
    get_root_logger,
    init_dist,
    set_random_seed,
    train_detector,
)
import torch.distributed as dist
import subprocess

# Notebook path setup: move the working directory up one level.
# NOTE(review): the relative config path used by parse_args presumably resolves
# from that directory - confirm.
# Caution: the path is relative, so re-running this cell moves up one more
# level every time it is executed.
os.chdir('../')

def parse_args():
    """Build the training options namespace from the declared defaults.

    The parser is fed an empty argument list, so the process command line is
    never read and every option takes its declared default value.

    Side effect: if ``LOCAL_RANK`` is not present in the environment, it is set
    to the string form of the parsed ``local_rank``; an existing value is kept.

    Returns:
        argparse.Namespace: the parsed options.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        (
            "--config",
            dict(
                default="configs/etriInfra/pp/etriInfra_centerpoint_pp_02voxel_two_pfn_10sweep.py",
                help="train config file path",
            ),
        ),
        ("--work_dir", dict(help="the dir to save logs and models")),
        ("--resume_from", dict(help="the checkpoint file to resume from")),
        (
            "--validate",
            dict(
                action="store_true",
                help="whether to evaluate the checkpoint during training",
            ),
        ),
        (
            "--gpus",
            dict(
                type=int,
                default=1,
                help="number of gpus to use " "(only applicable to non-distributed training)",
            ),
        ),
        ("--seed", dict(type=int, default=None, help="random seed")),
        (
            "--launcher",
            dict(choices=["pytorch", "slurm"], default="pytorch", help="job launcher"),
        ),
        ("--local_rank", dict(type=int, default=0)),
        (
            "--autoscale-lr",
            dict(
                action="store_true",
                help="automatically scale lr with the number of gpus",
            ),
        ),
    )

    cli = argparse.ArgumentParser(description="Train a detector")
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    # Empty list on purpose: inside a notebook the kernel's own argv must not be parsed.
    parsed = cli.parse_args(args=[])
    os.environ.setdefault("LOCAL_RANK", str(parsed.local_rank))
    return parsed
# Parse the default options, then load the training config from the file they name.
args = parse_args()
cfg = Config.fromfile(args.config)

no apex
No Tensorflow
Deformable Convolution not built!
No APEX!


In [2]:
# Distributed setup is skipped; only the local rank is copied onto the config.
cfg.local_rank = args.local_rank 

In [3]:
# init logger before other steps
# `distributed` is hard-coded to False here: this notebook does not initialise
# a distributed process group.
distributed = False
logger = get_root_logger(cfg.log_level)
logger.info("Distributed training: {}".format(distributed))
# Logged for reference only; the flag is read, not changed.
logger.info(f"torch.backends.cudnn.benchmark: {torch.backends.cudnn.benchmark}")

2024-04-09 10:29:33,810 - INFO - Distributed training: False
2024-04-09 10:29:33,811 - INFO - torch.backends.cudnn.benchmark: False


In [4]:
# Build the detector from the model section of the config together with its
# train-time and test-time settings.
model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

2024-04-09 10:29:33,882 - INFO - Finish RPN Initialization
2024-04-09 10:29:33,882 - INFO - num_classes: [2, 2, 1, 1, 2, 1]
2024-04-09 10:29:33,899 - INFO - Finish CenterHead Initialization


Use HM Bias:  -2.19


In [5]:
# Inspect the dataset registry and the dataset type requested by the config.
from det3d.datasets.registry import DATASETS
# A bare name that is not the last expression of the cell is evaluated but not
# displayed; only the final expression below shows up in the output.
DATASETS
cfg.data.train['type']

'etrInfraDataset'

In [6]:
DATASETS

Registry(name=dataset, items=['ConcatDataset', 'RepeatDataset', 'PointCloudDataset', 'NuScenesDataset', 'WaymoDataset', 'etrInfraDataset'])

In [7]:
# Build the training dataset. It is kept in a one-element list because later
# cells index it (`datasets[0]`) and iterate over it to create the data loaders.
datasets = [build_dataset(cfg.data.train)]

2024-04-09 10:29:38,245 - INFO - {'car': 5, 'personal_mobility': 5, 'truck': 5, 'construction_vehicle': 5, 'bus': 5, 'ground_animal': 5, 'motorcycle': 5, 'bicycle': 5, 'pedestrian': 5}
2024-04-09 10:29:38,245 - INFO - [-1]
2024-04-09 10:29:39,589 - INFO - load 474176 car database infos
2024-04-09 10:29:39,589 - INFO - load 18209 truck database infos
2024-04-09 10:29:39,589 - INFO - load 10857 bus database infos
2024-04-09 10:29:39,590 - INFO - load 5070 motorcycle database infos
2024-04-09 10:29:39,590 - INFO - load 6526 construction_vehicle database infos
2024-04-09 10:29:39,590 - INFO - load 28750 pedestrian database infos
2024-04-09 10:29:39,591 - INFO - load 11105 personal_mobility database infos
2024-04-09 10:29:39,591 - INFO - load 554 bicycle database infos
2024-04-09 10:29:39,591 - INFO - load 298 ground_animal database infos
2024-04-09 10:29:40,080 - INFO - After filter database:
2024-04-09 10:29:40,081 - INFO - load 338483 car database infos
2024-04-09 10:29:40,081 - INFO - l

In [8]:
if cfg.checkpoint_config is not None:
    # Attach metadata to the checkpoint config: the config file text and the
    # dataset's class names. (No version string is stored by this code.)
    # NOTE(review): the following cell prints `datasets[0].CLASSES` as None, so
    # the CLASSES entry recorded here was None in the captured run - confirm
    # whether that is intended.
    cfg.checkpoint_config.meta = dict(
        config=cfg.text, CLASSES=datasets[0].CLASSES
    )


In [9]:
# add an attribute for visualization convenience
# The captured output of this cell is `None`: the dataset exposed no CLASSES
# value when the notebook was run.
model.CLASSES = datasets[0].CLASSES
print(datasets[0].CLASSES)

None


## train detector

In [10]:
from torch.utils.data import DataLoader
from det3d.datasets import DATASETS, build_dataloader
# cfg.data.samples_per_gpu: batch_size
# cfg.data.workers_per_gpu: workers_per_gpu, num_workers


# One loader per dataset in `datasets`; `dist=distributed` forwards the
# non-distributed flag set earlier in the notebook.
data_loaders = [
        build_dataloader(
            ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed
        )
        for ds in datasets
    ]

In [11]:
data_loaders[0]

<torch.utils.data.dataloader.DataLoader at 0x7f6da86311c0>

In [12]:
logger.info(f"total epochs: {cfg.total_epochs}" )
logger.info(f" length of dataloader elems: {len(data_loaders[0])}")

# Total number of optimisation steps: epochs times the number of batches that
# the first loader yields per pass. Used below to build the LR scheduler.
total_steps = cfg.total_epochs * len(data_loaders[0])
logger.info(f"total_steps: {total_steps}")

2024-04-09 10:29:45,182 - INFO - total epochs: 20
2024-04-09 10:29:45,184 - INFO -  length of dataloader elems: 6320
2024-04-09 10:29:45,185 - INFO - total_steps: 126400


In [13]:
cfg.lr_config

{'type': 'one_cycle',
 'lr_max': 0.001,
 'moms': [0.95, 0.85],
 'div_factor': 10.0,
 'pct_start': 0.4}

In [14]:
from functools import partial
from det3d.solver.fastai_optim import OptimWrapper
from det3d.builder import _create_learning_rate_scheduler
from torch import nn


def flatten_model(m):
    """Return the leaf modules of ``m`` as a flat list.

    A module without children is its own single leaf. Otherwise the leaves of
    each child are concatenated in ``m.children()`` order.
    """
    kids = list(m.children())
    if not kids:
        return [m]

    leaves = []
    for kid in kids:
        leaves.extend(flatten_model(kid))
    return leaves

def get_layer_groups(m):
    """Return a one-element list holding an ``nn.Sequential`` of ``m``'s leaf modules.

    The leaves come from ``flatten_model(m)`` and are passed to
    ``nn.Sequential`` in that order.
    """
    leaf_modules = flatten_model(m)
    single_group = nn.Sequential(*leaf_modules)
    return [single_group]

def build_one_cycle_optimizer(model, optimizer_config):
    """Create the wrapped Adam optimizer used with the one-cycle schedule.

    Args:
        model: module whose leaf layers are collected by ``get_layer_groups``.
        optimizer_config: config object; the attributes read here are
            ``fixed_wd``, ``amsgrad`` and ``wd``.

    Returns:
        The object returned by ``OptimWrapper.create``.
    """
    if optimizer_config.fixed_wd:
        optimizer_func = partial(
            torch.optim.Adam, betas=(0.9, 0.99), amsgrad=optimizer_config.amsgrad
        )
    else:
        # Without this branch `optimizer_func` was never bound when `fixed_wd`
        # is falsy and the call below raised UnboundLocalError. Fall back to
        # Adam with its default betas.
        optimizer_func = partial(torch.optim.Adam, amsgrad=optimizer_config.amsgrad)

    optimizer = OptimWrapper.create(
        optimizer_func,
        3e-3,   # TODO: CHECKING LR HERE !!!
        get_layer_groups(model),
        wd=optimizer_config.wd,
        true_wd=optimizer_config.fixed_wd,
        bn_wd=True,
    )

    return optimizer

if cfg.lr_config.type == "one_cycle":
    # One-cycle schedule: wrapped Adam plus a scheduler spanning `total_steps`.
    optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
    lr_scheduler = _create_learning_rate_scheduler(
        optimizer, cfg.lr_config, total_steps
    )
    # Cleared so that `register_training_hooks` later receives None for the
    # LR config. Side effect: re-running this cell afterwards fails on
    # `None.type`; reload `cfg` before re-running.
    cfg.lr_config = None
else:
    # Any other schedule: without this branch `optimizer` and `lr_scheduler`
    # stayed undefined and the later `Trainer(...)` call raised NameError.
    # The LR config is left in place so the training hooks can consume it.
    optimizer = build_optimizer(model, cfg.optimizer)
    lr_scheduler = None

In [15]:
# Move the model to the GPU, then log its full module structure.
model = model.cuda()
logger.info(f"model structure: {model}")

2024-04-09 10:29:45,994 - INFO - model structure: PointPillars(
  (reader): PillarFeatureNet(
    (pfn_layers): ModuleList(
      (0): PFNLayer(
        (linear): Linear(in_features=8, out_features=32, bias=False)
        (norm): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      )
      (1): PFNLayer(
        (linear): Linear(in_features=64, out_features=64, bias=False)
        (norm): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      )
    )
  )
  (backbone): PointPillarsScatter()
  (neck): RPN(
    (blocks): ModuleList(
      (0): Sequential(
        (0): ZeroPad2d((1, 1, 1, 1))
        (1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (2): BatchNorm2d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (3): ReLU()
        (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (5): BatchNorm2d(64, eps=0.001, momentum=0.01, aff

In [16]:
from det3d.torchie.trainer import DistSamplerSeedHook, Trainer, obj_from_dict
from collections import OrderedDict

def example_to_device(example, device, non_blocking=False) -> dict:
    """Copy the tensor entries of a batch dict onto ``device``.

    Entries are handled according to their key:

    * list-valued keys (e.g. "labels", "hm", "points"): every element is moved
      with ``.to`` and collected into a new list;
    * tensor-valued keys (e.g. "voxels", "coordinates"): the value is moved;
    * "calib": a dict whose values are each moved into a new dict;
    * any other key: the value is passed through unchanged.

    Args:
        example: mapping from key to batch entry.
        device: target passed to ``.to``.
        non_blocking: forwarded to ``.to``.

    Returns:
        dict: a new dict with the same keys in the same order.
    """
    list_keys = {
        "anchors", "anchors_mask", "reg_targets", "reg_weights", "labels", "hm",
        "anno_box", "ind", "mask", "cat", "points",
    }
    tensor_keys = {
        "voxels", "bev_map", "coordinates", "num_points", "num_voxels",
        "cyv_voxels", "cyv_num_voxels", "cyv_coordinates", "cyv_num_points",
        "gt_boxes_and_cls",
    }

    def move(tensor):
        return tensor.to(device, non_blocking=non_blocking)

    moved = {}
    for key, value in example.items():
        if key in list_keys:
            moved[key] = [move(item) for item in value]
        elif key in tensor_keys:
            moved[key] = move(value)
        elif key == "calib":
            moved[key] = {name: move(entry) for name, entry in value.items()}
        else:
            moved[key] = value

    return moved


def parse_second_losses(losses):
    """Reduce a loss dict to a total loss and loggable Python numbers.

    Args:
        losses: mapping from loss name to a sequence of scalar tensors. The
            entry "loc_loss_elem" is a sequence of sequences instead. The
            entry "loss" must be present.

    Returns:
        tuple: ``(loss, log_vars)`` where ``loss`` is ``sum(losses["loss"])``
        and ``log_vars`` is an ``OrderedDict`` holding, for every entry, the
        ``.item()`` values in the same nesting and order.
    """
    total = sum(losses["loss"])

    def to_numbers(name, values):
        # "loc_loss_elem" is nested one level deeper than the other entries.
        if name == "loc_loss_elem":
            return [[elem.item() for elem in row] for row in values]
        return [elem.item() for elem in values]

    log_vars = OrderedDict(
        (name, to_numbers(name, values)) for name, values in losses.items()
    )
    return total, log_vars

def batch_processor(model, data, train_mode, **kwargs):
    """Move one batch to the target device and run it through ``model``.

    Args:
        model: callable invoked as ``model(example, return_loss=...)``.
        data: batch dict accepted by ``example_to_device``.
        train_mode: when true, compute losses; otherwise return the model's
            output for ``return_loss=False``.
        **kwargs: if ``local_rank`` is given it selects the device via
            ``torch.device``; otherwise ``None`` is passed as the device.

    Returns:
        In training mode, a dict with ``loss``, ``log_vars`` and
        ``num_samples``. Otherwise whatever the model returns.
    """
    device = torch.device(kwargs["local_rank"]) if "local_rank" in kwargs else None

    # data = example_convert_to_torch(data, device=device)
    example = example_to_device(data, device, non_blocking=False)

    # Drop this function's own reference to the incoming batch.
    del data

    if not train_mode:
        return model(example, return_loss=False)

    losses = model(example, return_loss=True)
    loss, log_vars = parse_second_losses(losses)

    # Batches without an "anchors" entry used to raise KeyError here. Fall back
    # to the same -1 placeholder that batch_processor_inline reports.
    if "anchors" in example:
        num_samples = len(example["anchors"][0])
    else:
        num_samples = -1

    return dict(loss=loss, log_vars=log_vars, num_samples=num_samples)


# Assemble the trainer from the pieces built in earlier cells: the model, the
# batch processor defined above, the optimizer and LR scheduler, plus the work
# directory and log level taken from the config.
trainer = Trainer(
        model, batch_processor, optimizer, lr_scheduler, cfg.work_dir, cfg.log_level
    )

In [17]:
# Register the training hooks from the LR, optimizer, checkpoint and logging
# sections of the config.
# `cfg.lr_config` is None at this point when the one-cycle branch ran earlier.
logger.info(f"optimizer_config: {cfg.optimizer_config}")
optimizer_config = cfg.optimizer_config
trainer.register_training_hooks(
        cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config
    )


2024-04-09 10:29:46,026 - INFO - optimizer_config: {'grad_clip': {'max_norm': 35, 'norm_type': 2}}


## trainer.run

In [18]:
# trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)

In [19]:
# Parameters that would be passed to trainer.run
logger.info(f"num of dataloader: {len(data_loaders)}, dataloader type: {type(data_loaders[0])}")
logger.info(f"workflow information: {cfg.workflow}")
logger.info(f"total_epochs: {cfg.total_epochs}")
logger.info(f"local_rank: {cfg.local_rank}")

2024-04-09 10:29:46,065 - INFO - num of dataloader: 1, dataloader type: <class 'torch.utils.data.dataloader.DataLoader'>
2024-04-09 10:29:46,065 - INFO - workflow information: [('train', 1)]
2024-04-09 10:29:46,066 - INFO - total_epochs: 20
2024-04-09 10:29:46,066 - INFO - local_rank: 0


In [20]:
# Run settings for the manual training walk-through below.
_max_epochs = cfg.total_epochs
# Taken from the config instead of a hard-coded absolute path on one machine:
# the previous literal named a different experiment's directory and disagreed
# with the `cfg.work_dir` that the Trainer was constructed with.
work_dir = cfg.work_dir
workflow = cfg.workflow

In [21]:
# Unpack the first workflow entry: a (mode, epochs) pair.
idx = 0
mode, epochs = workflow[idx]
logger.info(f"mode: {mode}")
logger.info(f"epochs: {epochs}")

2024-04-09 10:29:46,098 - INFO - mode: train
2024-04-09 10:29:46,099 - INFO - epochs: 1


## train

In [22]:
# train(data_loaders[idx], epochs, **kwargs)

In [23]:
def batch_processor_inline(self, model, data, train_mode, **kwargs):
    """Run one batch through ``model`` on the current CUDA device, firing hooks.

    Args:
        self: object providing ``call_hook(name)``.
        model: callable invoked as ``model(example, return_loss=...)``.
        data: batch dict accepted by ``example_to_device``.
        train_mode: when true, compute and parse the losses; otherwise return
            the model's output for ``return_loss=False``.
        **kwargs: accepted for interface compatibility; not used. The batch is
            always placed on ``torch.cuda.current_device()``. (A ``device``
            value used to be derived from ``local_rank`` here but was never
            read, so that dead computation has been removed.)

    Returns:
        In training mode, a dict with ``loss``, ``log_vars`` and
        ``num_samples`` (currently the placeholder -1). Otherwise whatever the
        model returns.
    """
    # data = example_convert_to_torch(data, device=device)
    example = example_to_device(
        data, torch.cuda.current_device(), non_blocking=False
    )
    self.call_hook("after_data_to_device")

    if not train_mode:
        return model(example, return_loss=False)

    losses = model(example, return_loss=True)
    self.call_hook("after_forward")
    loss, log_vars = parse_second_losses(losses)
    del losses

    outputs = dict(
        loss=loss, log_vars=log_vars, num_samples=-1  # TODO: FIX THIS
    )
    self.call_hook("after_parse_loss")

    return outputs

# Put the model in training mode and precompute the step bookkeeping used by
# the manual loop further down.
model.train()
mode = "train"
data_loader = data_loaders[idx]
length = len(data_loader)  # number of batches the loader yields per pass
_max_iters = _max_epochs * length
# NOTE(review): `epochs` is the count unpacked from the workflow tuple, not a
# current-epoch index, so base_step starts at one full pass worth of steps -
# confirm that this offset is intended.
base_step = epochs * length
# outputs = batch_processor_inline(model, data_loader, train_mode=True, local_rank = 0)

In [24]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f6da86311c0>

In [25]:
len(data_loader)

6320

In [26]:
# Fetch sample 0 straight from the loader's underlying dataset (not through
# the DataLoader iterator) so that its fields can be inspected below.
ex = data_loader.dataset.__getitem__(0)

2024-04-09 10:29:48,241 - INFO - finding looplift candidates


In [27]:
type(ex['points'])

numpy.ndarray

In [28]:
ex['points'].shape

(54277, 3)

In [29]:
ex['voxels'].shape

(8679, 20, 3)

In [30]:
ex['num_voxels']

array([8679])

In [31]:
ex['num_points'].shape

(8679,)

In [32]:
ex['num_points']

array([ 9, 16, 10, ...,  1,  1,  1], dtype=int32)

In [33]:
ex['shape']

array([512, 512,   1])

In [34]:
ex['points'][0].dtype

dtype('float32')

In [35]:
device = torch.device(0)

In [36]:
# example = example_to_device(
#     data_loader.dataset.__getitem__(0), device, non_blocking=False)

In [37]:
# Time the forward pass on the first batches of the loader and summarise the
# per-batch timings. The first batch is treated as warm-up and excluded from
# the statistics.
device = torch.device(0)
LAST_TIMED_BATCH = 30  # index of the final batch to time (inclusive)
lst = []
for i, data_batch in enumerate(data_loader):
    # CUDA work is queued asynchronously: drain anything still pending so it
    # is not billed to this batch.
    torch.cuda.synchronize(device)
    s = time.perf_counter()
    global_step = base_step + i
    example = example_to_device(data_batch, device, non_blocking=False)
    # Forward pass only; the returned losses are discarded on purpose.
    model(example, return_loss=True)
    # Wait for the forward kernels to finish before reading the clock.
    torch.cuda.synchronize(device)
    e = time.perf_counter() - s
    print(f"model train time: {e}")
    lst.append(e)
    if i == LAST_TIMED_BATCH:
        break

# Drop the warm-up measurement exactly once. Mean, variance and the reported
# sample count are all computed over the same remaining measurements.
lst = lst[1:]
if lst:
    avg = float(np.mean(lst))
    var = float(np.var(lst))  # population variance around `avg`
    print(f"{len(lst)} samples avg train time: {avg}")
    print(f"{len(lst)} samples var: {var}")
    print(f"max: {max(lst)}")
else:
    # Fewer than two batches were available: nothing is left after warm-up.
    avg = var = None
    print("not enough batches to compute timing statistics")

{'reg': tensor([[[[  0.8518,  10.7399,  -0.1462,  ...,  -1.0375,   0.7519,   0.9834],
          [ -0.5541,   3.2839,   1.9285,  ...,   2.3342,  -3.5838,   1.9402],
          [-13.5517,   4.0384,   2.1009,  ...,   0.0501,   0.8825,   1.7452],
          ...,
          [  0.9202,  -1.4432,  -0.6092,  ...,   3.5803,  -4.0323,   2.3171],
          [  1.3471,  -0.2561,   2.0087,  ...,  -0.5488,  -3.3884,  -0.4932],
          [ -0.5161,   0.8185,  -1.6769,  ...,   0.6834,  -3.4454,   1.1490]],

         [[ -9.2998,   2.5025,   0.1324,  ...,  -1.7688,  -3.8809,   0.4572],
          [  1.9634,   4.2041,  -2.3061,  ...,  -2.9818,  -4.7885,  -1.9009],
          [  3.1908,  -0.1999,  -4.1439,  ...,  -0.1152,  -5.9227,  -0.8801],
          ...,
          [ -1.3066,  -0.1212,  -2.9800,  ...,  -1.0237,  -4.6293,  -1.0945],
          [ -1.0689,  -1.3659,  -4.4531,  ...,  -3.6779,  -4.7578,  -1.8029],
          [ -1.6555,  -0.9278,  -3.1643,  ...,  -1.3479,  -2.0412,   0.2939]]],


        [[[  0.6580,