# 作业：实现目标检测模型的设计

以下代码被拆分为三个文件：`yolo_v0_model.py`、`PennFudanDataset_main.py` 以及 `yolo_v0_train.py`。

```
project_directory/
|── 参考 29. yolo_v1_train.py
|── HDModule/
|   |── yolo_v0_model.py
|   |── PennFudanDataset_main.py
|   |── __init__.py
```

In [20]:
# from HDModule.PennFudanDataset_main import *
# from HDModule.yolo_v0_model import *
import HDModule.transforms as T

import torch
import torch.nn as nn
import torch.optim as optim

import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F

from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import DataLoader

from PIL import Image

import math
import cv2
import numpy as np
import time
import sys
import os

## YOLO V0 Model 

In [21]:
class VGG(nn.Module):
    """VGG-16 style network: conv feature stack, 7x7 adaptive pooling, classifier.

    Args:
        batch_norm: When True, a ``BatchNorm2d`` layer is inserted after every
            convolution. Defaults to False, which reproduces the plain layer
            layout this class always built.
    """

    # Layer plan: an integer is a 3x3 conv with that many output channels,
    # 'M' is a 2x2 max-pool with stride 2.
    _CFG = (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
            512, 512, 512, 'M', 512, 512, 512, 'M')

    def __init__(self, batch_norm=False):
        super().__init__()
        # Convolutional feature extractor built from the layer plan.
        self.features = nn.Sequential(*self._make_layers(self._CFG, batch_norm))
        # Pool whatever spatial size the features have down to 7x7 so the
        # classifier input width is fixed at 512*7*7.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        # Decision layers: 1000-way classification head.
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 1000),
        )
        self._init_weights()

    @staticmethod
    def _make_layers(cfg, batch_norm=False, in_channels=3):
        """Translate a layer plan into a flat list of modules."""
        layers = []
        for v in cfg:
            if v == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            layers.append(nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
            if batch_norm:
                # Was spelled `nn.Batchnorm2d`, which does not exist.
                layers.append(nn.BatchNorm2d(v))
            layers.append(nn.ReLU(inplace=True))
            in_channels = v
        return layers

    def _init_weights(self):
        """Initialise conv, batch-norm and linear layers in place."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                # Identity affine transform: scale 1, shift 0
                # (the shift was previously initialised to 1).
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run the full network.

        Returns:
            A tuple ``(logits, x_fea, x_avg)``: the classifier output, the raw
            output of ``self.features`` and the 7x7 pooled features.
        """
        x = self.features(x)
        x_fea = x
        x = self.avgpool(x)
        x_avg = x
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x, x_fea, x_avg

    def extractor(self, x):
        """Return only the convolutional features of ``x``."""
        x = self.features(x)
        return x

class YOLOV0(nn.Module):
    """Single-box detector: VGG conv features, 7x7 pooling, 5-value linear head.

    ``forward`` returns a tensor of shape ``(batch, 1, 1, 5)``. The training
    code in this file treats the 5 values as one confidence followed by four
    box numbers.
    """

    def __init__(self):
        super().__init__()
        # Keep the VGG conv stack as a registered submodule. Storing the bound
        # method `vgg.extractor` (as before) hid the backbone from
        # state_dict(), .to(device), .eval() and parameters(), so its randomly
        # initialised weights could be neither saved nor moved with the model.
        # Calling `self.extractor(x)` still returns the conv features.
        self.extractor = VGG().features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        # Decision layers: detection head.
        self.detector = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            # nn.Linear(4096, 1470),
            nn.Linear(4096, 5),
        )
        # Only the head is initialised here; VGG() has already initialised
        # its own conv layers, so they are not re-drawn.
        for m in self.detector.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map an image batch to a ``(batch, 1, 1, 5)`` prediction tensor."""
        x = self.extractor(x)
        x = self.avgpool(x)
        # Flatten everything except the batch dimension for the linear head.
        x = x.view(x.size(0), -1)
        x = self.detector(x)
        b = x.size(0)
        # x = x.view(b, 7, 7, 30)
        x = x.view(b, 1, 1, 5)
        return x

In [22]:
def v0_model_main():
    """Smoke-test both networks on one random 512x512 RGB input.

    Prints the shapes of the three tensors returned by ``VGG.forward`` and
    then the shape of the ``YOLOV0`` output.
    """
    backbone = VGG()
    sample = torch.randn(1, 3, 512, 512)
    # VGG.forward returns (classifier output, conv features, pooled features).
    for tensor in backbone(sample):
        print(tensor.shape)

    detector = YOLOV0()
    # YOLOV0.forward reshapes its output to (batch, 1, 1, 5).
    prediction = detector(sample)
    print(prediction.shape)

## PennFudanDataset

In [23]:
class PennFudanDataset(object):
    """Image / instance-mask pairs read from ``PNGImages`` and ``PedMasks``.

    Args:
        root: Directory that contains the ``PNGImages`` and ``PedMasks``
            sub-directories.
        transforms: Optional callable ``(img, target) -> (img, target)``
            applied to every sample; ``None`` disables it.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Sort both listings so that position i in one list refers to the same
        # sample as position i in the other.
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    @staticmethod
    def _instances_from_mask(mask):
        """Split a colour-encoded mask into per-instance masks and boxes.

        Args:
            mask: 2-D integer array in which 0 is background and every other
                value marks the pixels of one instance.

        Returns:
            ``(masks, boxes)``: a boolean array of shape ``(N, H, W)`` and a
            float32 tensor of shape ``(N, 4)`` holding
            ``[xmin, ymin, xmax, ymax]`` per instance. ``N`` may be 0.
        """
        obj_ids = np.unique(mask)
        # Drop the background id by value rather than by position, so a mask
        # without any background pixel does not lose a real instance.
        obj_ids = obj_ids[obj_ids != 0]

        # One binary mask per instance id.
        masks = mask == obj_ids[:, None, None]

        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])

        if boxes:
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
        else:
            # Keep the (N, 4) layout even when there is nothing to detect, so
            # column indexing on the boxes still works.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
        return masks, boxes

    # dataset[0]
    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``target`` is a dict with the keys ``boxes``, ``labels``, ``masks``,
        ``image_id``, ``area`` and ``iscrowd``.
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        # Close the underlying files as soon as the pixel data is loaded.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # The mask is deliberately not converted to RGB: each colour value
        # identifies a different instance, with 0 being background.
        with Image.open(mask_path) as raw_mask:
            mask = np.array(raw_mask)

        masks, boxes = self._instances_from_mask(mask)
        num_objs = boxes.shape[0]

        # There is only one class, so every instance gets label 1.
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # All instances are treated as non-crowd.
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    # len(dataset)
    def __len__(self):
        """Return the number of image files found under ``PNGImages``."""
        return len(self.imgs)

In [24]:
import HDModule.transforms as T
def get_transform(train):
    """Build the sample transform pipeline.

    The pipeline always starts with ``T.ToTensor()``; when ``train`` is truthy
    a ``T.RandomHorizontalFlip(0.5)`` step is added after it.
    """
    pipeline = [T.ToTensor()]
    if train:
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

## YOLO V0 Train

In [25]:
# Dataset root: <DATA_PATH>/PennFudanPed/.
# os.path.join inserts the separator that plain string concatenation dropped
# when DATA_PATH had no trailing slash, and the '' default avoids the
# `None + str` TypeError when DATA_PATH is unset (the path is then relative to
# the current working directory). The trailing '' keeps the final separator.
dp = os.path.join(os.environ.get('DATA_PATH', ''), 'PennFudanPed', '')

In [26]:
# Both splits are built with get_transform(train=False), i.e. without the
# random horizontal flip.
dataset = PennFudanDataset(dp, get_transform(train=False))
dataset_test = PennFudanDataset(dp, get_transform(train=False))

# Random permutation of all sample indices; only the test split draws from it.
indices = torch.randperm(len(dataset)).tolist()
# The training split is pinned to the single sample at index 0
# (presumably an overfit-one-image sanity check -- confirm intent).
dataset = torch.utils.data.Subset(dataset, [0])
# The test split is the first two indices of the permutation. Nothing here
# excludes index 0, so it may overlap with the training sample.
dataset_test = torch.utils.data.Subset(dataset_test, indices[0:2])

In [27]:
def collate_fn(batch):
    """Transpose a sequence of per-sample tuples into a tuple of per-field tuples.

    For ``(img, target)`` samples the result is ``(images, targets)``.
    """
    columns = zip(*batch)
    return tuple(columns)

# Training and validation data loaders. Both keep the dataset order
# (shuffle=False) and use collate_fn to batch samples as tuples.
train_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)

val_loader = DataLoader(
    dataset_test,
    batch_size=2,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn,
)


def input_process(batch, size=(448, 448)):
    """Resize every image of a collated batch and stack them into one tensor.

    Args:
        batch: Collated batch whose first element is a sequence of image
            tensors in channel-first ``(3, H, W)`` layout.
        size: Target ``(height, width)`` of the resized images. Defaults to
            ``(448, 448)``, the size that used to be hard-coded.

    Returns:
        A tensor of shape ``(len(batch[0]), 3, height, width)`` created with
        ``torch.zeros`` and filled image by image.
    """
    images = batch[0]
    height, width = size
    input_batch = torch.zeros(len(images), 3, height, width)
    for i, image in enumerate(images):
        # cv2 works on channel-last arrays, so go (C, H, W) -> (H, W, C).
        # The deprecated autograd `Variable` wrapper is no longer applied.
        channel_last = image.permute([1, 2, 0]).numpy()
        # cv2.resize takes its target size as (width, height).
        resized = cv2.resize(channel_last, (width, height))
        # Back to channel-first before storing into the batch tensor.
        input_batch[i] = torch.tensor(resized).permute([2, 0, 1])
    return input_batch

#batch[1][0]['boxes'][0]
def target_process(batch):
    """Build the ``(batch, 1, 1, 5)`` regression target for a collated batch.

    For every image only its FIRST box is used. The row written for it is
    ``[1, xmin / W, ymin / H, xmax / W, ymax / H]``, where ``W`` and ``H`` are
    the image width and height. An image without any box keeps an all-zero row
    (confidence 0) instead of raising ``IndexError``.

    Args:
        batch: Collated batch. ``batch[0]`` holds the image tensors (the last
            two dimensions are height and width) and ``batch[1]`` the target
            dicts, each with a ``'boxes'`` tensor.

    Returns:
        A float tensor of shape ``(len(batch[0]), 1, 1, 5)``.
    """
    images, targets = batch[0], batch[1]
    target_batch = torch.zeros(len(images), 1, 1, 5)
    for i, image in enumerate(images):
        boxes = targets[i]['boxes']
        if boxes.numel() == 0:
            # Nothing to regress for this image: leave the row at zero.
            continue
        # Read the size from the tensor shape directly; no numpy round trip.
        hi, wi = image.shape[-2:]
        bbox = boxes[0] / torch.tensor([wi, hi, wi, hi])
        # Prepend the confidence target of 1 to the normalised box.
        target_batch[i, 0, 0] = torch.cat([torch.ones(1), bbox])
    return target_batch
    

In [28]:
# Training hyper-parameters.
# NOTE(review): num_classes, n_class and batch_size are not referenced anywhere
# else in the visible part of this file (the data loaders hard-code their own
# batch sizes) -- confirm whether they are still needed.
num_classes = 2
n_class    = 2
batch_size = 6
epochs     = 500   # number of passes over train_loader in train()
lr         = 1e-3  # initial learning rate passed to optim.SGD
momentum   = 0     # SGD momentum
w_decay    = 1e-5  # SGD weight_decay
step_size  = 50    # StepLR step_size
gamma      = 0.5   # StepLR gamma (multiplicative LR decay factor)

In [None]:
# Build the model.
yolov0_model = YOLOV0()
# Optimizer: SGD (stochastic gradient descent). Only the detector head's
# parameters are handed to it, so the optimizer never updates anything else.
optimizer = optim.SGD(yolov0_model.detector.parameters(), lr=lr, momentum=momentum, weight_decay=w_decay)

# Learning-rate schedule: StepLR configured with `step_size` and `gamma`
# (50 and 0.5 with the values defined in this file, not 30 as the old comment said).
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)  # step_size=50, gamma=0.5 with the values above

In [None]:
# Matrix-form loss: short to write, but not very readable.
def lossfunc(outputs,labels):
    """Weighted sum of squared errors, computed as a matrix product.

    The squared error is summed over dimension 0, viewed as a ``(1, 5)`` row
    and multiplied by a ``(5, 1)`` weight column (10 for the first channel,
    0.0001 for the other four). Returns a ``(1, 1)`` tensor.
    """
    channel_weights = torch.tensor([10,0.0001,0.0001,0.0001,0.0001]).view(5,1)
    squared_error = (outputs-labels)**2
    per_channel = torch.sum(squared_error,0).view(1,5)
    return per_channel.mm(channel_weights)

# Direct-regression loss, written out per component for readability.
# prediction vector: c   x   y   w   h
# target vector:     c_g x_g y_g w_g h_g
def lossfunc_details(outputs, labels, geo_weight=0.3):
    """Sum of squared errors: confidence term plus weighted geometry term.

    The last dimension is read as ``[confidence, x, y, w, h]``. Every cell
    contributes ``(c - c_g)**2 + geo_weight * sum((g - g_gt)**2 over x, y, w, h)``
    and the contributions of all cells are summed. Values beyond index 4 of
    the last dimension are ignored.

    Args:
        outputs: Predictions whose last dimension has at least 5 entries,
            e.g. shape ``(batch, w, h, 5)``.
        labels: Targets with exactly the same shape as ``outputs``.
        geo_weight: Weight of the geometry term. Defaults to 0.3, the value
            that used to be hard-coded.

    Returns:
        A 0-dim tensor, also for an empty batch, so ``.backward()`` and
        ``.item()`` always work on the result.

    Raises:
        ValueError: If the shapes differ or the last dimension is shorter
            than 5.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if outputs.shape != labels.shape:
        raise ValueError("outputs shape[%s] not equal labels shape[%s]" % (outputs.shape, labels.shape))
    if outputs.dim() == 0 or outputs.shape[-1] < 5:
        raise ValueError("last dimension must hold at least 5 values, got shape %s" % (tuple(outputs.shape),))

    # One vectorised pass replaces the Python loop over batch and grid cells.
    squared_error = (outputs - labels) ** 2
    loss_confidence = squared_error[..., 0].sum()
    loss_geo = squared_error[..., 1:5].sum()
    return loss_confidence + geo_weight * loss_geo
# train
def train():
    """Run the training loop over ``train_loader`` for ``epochs`` epochs.

    Uses the module-level ``yolov0_model``, ``optimizer``, ``scheduler``,
    ``train_loader`` and ``epochs``. The scheduler is stepped once per epoch,
    after all batches of that epoch.
    """
    yolov0_model.train()
    for epoch in range(epochs):
        # `step` instead of `iter`, which shadowed the builtin.
        for step, batch in enumerate(train_loader):
            # NOTE(review): dumps the whole batch on every iteration --
            # looks like debug output; consider removing.
            print(f"Batch structure: {batch}")
            optimizer.zero_grad()
            # Images -> resized input tensor.
            inputs = input_process(batch)
            # Annotations -> regression targets.
            labels = target_process(batch)
            # Forward pass.
            outputs = yolov0_model(inputs)
            loss = lossfunc_details(outputs, labels)
            loss.backward()
            optimizer.step()
            if step % 10 == 0:
                current_lr = optimizer.param_groups[0]['lr']
                print("epoch{}, iter{}, loss: {}, lr: {}".format(epoch, step, loss.item(), current_lr))

        scheduler.step()

In [None]:
# Print the smoke-test shapes first, then run the training loop.
v0_model_main()
train()

torch.Size([1, 1000])
torch.Size([1, 512, 16, 16])
torch.Size([1, 512, 7, 7])
torch.Size([1, 1, 1, 5])
Batch structure: ((tensor([[[0.8275, 0.8235, 0.8314,  ..., 0.5608, 0.5725, 0.5804],
         [0.7020, 0.6784, 0.6667,  ..., 0.4941, 0.5020, 0.5020],
         [0.8000, 0.7608, 0.7373,  ..., 0.5098, 0.5137, 0.5098],
         ...,
         [0.8863, 0.8510, 0.8275,  ..., 0.7176, 0.7216, 0.7255],
         [0.9059, 0.8588, 0.8235,  ..., 0.7333, 0.7333, 0.7294],
         [0.8824, 0.8902, 0.8431,  ..., 0.7451, 0.7451, 0.7333]],

        [[0.7843, 0.7804, 0.7882,  ..., 0.3765, 0.3882, 0.3961],
         [0.6588, 0.6353, 0.6235,  ..., 0.3098, 0.3176, 0.3176],
         [0.7569, 0.7176, 0.6941,  ..., 0.3255, 0.3294, 0.3255],
         ...,
         [0.8627, 0.8275, 0.8039,  ..., 0.7176, 0.7216, 0.7255],
         [0.8824, 0.8353, 0.8000,  ..., 0.7333, 0.7333, 0.7294],
         [0.8588, 0.8667, 0.8196,  ..., 0.7451, 0.7451, 0.7333]],

        [[0.7137, 0.7098, 0.7176,  ..., 0.3059, 0.3176, 0.3255],
 