In [1]:
import argparse
import os
import random
import time
import numpy as np
import torch
import torch.backends.cudnn as cudnn
# Switch the notebook's working directory to the parent of the directory it was
# started in. The starting directory is recorded the first time this cell runs,
# so re-running the cell does not climb one more level each time (a bare
# relative chdir would).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [10]:
from dataset.VOC_dataset import VOCDataset
from dataset.augment import Transforms
import torch.nn as nn
from model.backbone.resnet import resnet50
from model.fcos import FCOS
from model.loss import GenTargets, LOSS, coords_fmap2orig
from model.fpn_neck import FPN
from model.config import DefaultConfig

## 参数设定

In [3]:
# Command-line style options for the training run. They are parsed from an
# empty argument list below, so inside the notebook only the defaults apply.
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=30, help="number of epochs")
parser.add_argument("--batch_size", type=int, default=1, help="size of each image batch")
parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
# The value is written verbatim into CUDA_VISIBLE_DEVICES, so it is a
# comma-separated list of device ids rather than a count.
parser.add_argument("--n_gpu", type=str, default='0,1',
                    help="comma-separated ids of the GPUs to expose via CUDA_VISIBLE_DEVICES")
# Inside a notebook parse_args() must be given an explicit (empty) list;
# otherwise argparse would try to parse the Jupyter kernel's own sys.argv.
opt = parser.parse_args([])

## GPU环境设定

In [4]:
# Restrict the GPUs visible to this process to the ids given in the options.
os.environ["CUDA_VISIBLE_DEVICES"] = opt.n_gpu

# Seed every random number generator used here (PyTorch CPU, PyTorch CUDA for
# the current device and for all devices, NumPy, Python's `random`) with 0.
for seed_fn in (torch.manual_seed,
                torch.cuda.manual_seed,
                torch.cuda.manual_seed_all,
                np.random.seed,
                random.seed):
    seed_fn(0)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
cudnn.deterministic = True
cudnn.benchmark = False

## 数据集设置

In [5]:
BATCH_SIZE = opt.batch_size


def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    PyTorch gives every worker its own torch seed; deriving the NumPy / Python
    seeds from it keeps runs reproducible while avoiding identical random
    streams in all workers.

    :param worker_id: index of the worker, supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# Training split of the VOC2007 dataset with the project's augmentation
# pipeline applied.
transform = Transforms()
train_dataset = VOCDataset(root_dir='../datasets/VOCdevkit/VOC2007', resize_size=[800, 1333],
                           split='trainval', use_difficult=False, is_train=True, augment=transform)

# Re-seed NumPy in the main process right before the loader is built. This was
# previously done as a side effect of this cell and is kept so that
# single-process runs (n_cpu=0) draw the same random numbers as before.
np.random.seed(0)

# worker_init_fn must be a callable taking the worker id. Passing the result of
# np.random.seed(0) (which is None) would silently install no init function.
# NOTE(review): with a "spawn" start method, worker functions defined in a
# notebook may not be importable by the workers; confirm before raising n_cpu
# on such platforms.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                                           collate_fn=train_dataset.collate_fn,
                                           num_workers=opt.n_cpu, worker_init_fn=seed_worker)

print("total_images : {}".format(len(train_dataset)))

INFO=====>voc dataset init finished  ! !
total_images : 5011


## 模型设置

In [8]:
class FCOSDetector(nn.Module):
    """Training wrapper that chains the FCOS network, target encoding and loss.

    :param config: configuration object passed to ``FCOS`` and read for
                   ``strides`` and ``limit_range``; ``DefaultConfig`` is used
                   when it is ``None``.
    """

    def __init__(self, config=None):
        super().__init__()
        cfg = DefaultConfig if config is None else config

        self.fcos_body = FCOS(config=cfg)
        self.target_layer = GenTargets(strides=cfg.strides, limit_range=cfg.limit_range)
        self.loss_layer = LOSS()

    def forward(self, inputs):
        """
        Run one training forward pass of the FCOS detector.

        :param inputs:
                [training] list  batch_images, batch_boxes, batch_classes
        :return:
                [training] losses
        """
        images, gt_boxes, gt_classes = inputs
        # Raw network predictions.
        predictions = self.fcos_body(images)
        # Encode the ground-truth annotations into training targets.
        encoded_targets = self.target_layer([predictions, gt_boxes, gt_classes])
        # Loss between the predictions and the encoded targets.
        return self.loss_layer([predictions, encoded_targets])

In [11]:
# Build the detector, then move it onto the GPU.
model = FCOSDetector()
model = model.cuda()
# model = torch.nn.DataParallel(model)  # enable when training on multiple GPUs
# Switch to training mode; as the cell's last expression this also displays the model.
model.train()

INFO===>success frozen BN
INFO===>success frozen backbone stage1


FCOSDetector(
  (fcos_body): FCOS(
    (backbone): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace)

## 优化器设置

In [12]:
# SGD with momentum and weight decay over all parameters of the model.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=2e-3,
    momentum=0.9,
    weight_decay=0.0001,
)

## 开始训练

In [13]:
EPOCHS = opt.epochs
# Number of batches the loader actually yields per epoch. The loader keeps the
# final partial batch, so this can be larger than len(dataset) // BATCH_SIZE.
steps_per_epoch = len(train_loader)
TOTAL_STEPS = steps_per_epoch * EPOCHS
WARMPUP_STEPS = 501

GLOBAL_STEPS = 1
LR_INIT = 2e-3
LR_END = 2e-5

# Global steps at which the learning rate drops to 0.1x and then 0.01x of LR_INIT.
LR_DECAY_STEPS = (20001, 27001)
CHECKPOINT_DIR = "./checkpoint"


def _lr_for_step(step):
    """Return the learning rate to use at global step `step` (1-based).

    Linear warmup towards LR_INIT while step < WARMPUP_STEPS, then LR_INIT,
    then LR_INIT * 0.1 from the first decay step and LR_INIT * 0.01 from the
    second one. Being a pure function of the step, the rate reaches exactly
    LR_INIT once warmup is over instead of staying at the last warmup value.
    """
    if step < WARMPUP_STEPS:
        return float(step / WARMPUP_STEPS * LR_INIT)
    if step < LR_DECAY_STEPS[0]:
        return LR_INIT
    if step < LR_DECAY_STEPS[1]:
        return LR_INIT * 0.1
    return LR_INIT * 0.01


# Create the checkpoint directory up front so the first save, which only
# happens after a full epoch of training, cannot fail because it is missing.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(EPOCHS):  # one pass over the training set per epoch
    for epoch_step, data in enumerate(train_loader):  # one optimisation step per batch

        # ============================== fetch the batch ==============================
        batch_imgs, batch_boxes, batch_classes = data
        batch_imgs = batch_imgs.cuda()
        batch_boxes = batch_boxes.cuda()
        batch_classes = batch_classes.cuda()

        # ============================== learning-rate schedule =======================
        lr = _lr_for_step(GLOBAL_STEPS)
        for param in optimizer.param_groups:
            param['lr'] = lr

        # ============================== parameter update =============================
        start_time = time.time()
        # 1. clear the gradients left from the previous step
        optimizer.zero_grad()
        # 2. forward pass; the last entry of `losses` is used as the training loss
        losses = model([batch_imgs, batch_boxes, batch_classes])
        loss = losses[-1]
        total_loss = loss.mean()
        total_loss.backward()
        # 3. apply the gradients
        optimizer.step()
        # CUDA kernels are launched asynchronously; wait for them so that the
        # measured time covers the work of this step.
        torch.cuda.synchronize()

        # ============================== progress report ==============================
        end_time = time.time()
        cost_time = int((end_time - start_time) * 1000)
        print(
            "global_steps:%d epoch:%d steps:%d/%d cls_loss:%.4f cnt_loss:%.4f reg_loss:%.4f cost_time:%dms lr=%.4e total_loss:%.4f"
            % (GLOBAL_STEPS, epoch + 1, epoch_step + 1, steps_per_epoch,
               losses[0].mean(), losses[1].mean(), losses[2].mean(),
               cost_time, lr, total_loss))

        GLOBAL_STEPS += 1

    # Save the weights at the end of every epoch.
    torch.save(model.state_dict(),
               "{}/model_{}.pth".format(CHECKPOINT_DIR, epoch + 1))

global_steps:1 epoch:1 steps:1/5011 cls_loss:1.2560 cnt_loss:0.7801 reg_loss:0.9997 cost_time:14171ms lr=3.9920e-06 total_loss:3.0358
global_steps:2 epoch:1 steps:2/5011 cls_loss:1.1750 cnt_loss:0.7514 reg_loss:0.9999 cost_time:534ms lr=7.9840e-06 total_loss:2.9263
global_steps:3 epoch:1 steps:3/5011 cls_loss:1.2949 cnt_loss:0.7793 reg_loss:0.9997 cost_time:397ms lr=1.1976e-05 total_loss:3.0740
global_steps:4 epoch:1 steps:4/5011 cls_loss:1.3756 cnt_loss:0.9157 reg_loss:1.0000 cost_time:399ms lr=1.5968e-05 total_loss:3.2913
global_steps:5 epoch:1 steps:5/5011 cls_loss:1.0905 cnt_loss:0.7375 reg_loss:0.9995 cost_time:398ms lr=1.9960e-05 total_loss:2.8276
global_steps:6 epoch:1 steps:6/5011 cls_loss:1.1989 cnt_loss:0.8049 reg_loss:1.0000 cost_time:428ms lr=2.3952e-05 total_loss:3.0039
global_steps:7 epoch:1 steps:7/5011 cls_loss:1.0979 cnt_loss:0.7145 reg_loss:1.0000 cost_time:390ms lr=2.7944e-05 total_loss:2.8123
global_steps:8 epoch:1 steps:8/5011 cls_loss:2.7303 cnt_loss:0.0000 reg_lo

KeyboardInterrupt: 