# 改进的YOLO v1

backbone: ResNet_18

neck: SPP

head: Detection Head

In [30]:
# backbone: ResNet_18
import torch
import torch.nn as nn

class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus a shortcut connection.

    Args:
        conv1_in: number of channels entering the first convolution.
        conv1_out: number of channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the block input before it is
            added to the convolution branch. It is needed whenever the
            convolution branch changes the channel count or resolution, since
            the raw input could then no longer be added to it. ``None`` means
            the input is added unchanged.
    """

    def __init__(self, conv1_in, conv1_out, stride=1, downsample=None):
        super().__init__()

        # First stage: conv -> BN -> ReLU. Only this convolution is strided.
        self.conv1 = nn.Conv2d(
            conv1_in,
            conv1_out,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(conv1_out)
        self.relu = nn.ReLU(inplace=True)

        # Second stage: conv -> BN, with no activation of its own.
        self.conv2 = nn.Conv2d(
            conv1_out,
            conv1_out,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(conv1_out)

        # Projection for the shortcut branch (may be None).
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))`` for the input tensor ``x``."""
        # Convolution branch F(x); ReLU follows the first BN only.
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))

        # Shortcut branch: the raw input, or its projection when one is set.
        shortcut = x if self.downsample is None else self.downsample(x)

        # The final ReLU is applied after the two branches are merged.
        branch += shortcut
        return self.relu(branch)


class Backbone(nn.Module):
    """ResNet-18 feature extractor that returns the last (stride-32) feature map.

    Attribute names (``conv1``, ``bn1``, ``layer1`` .. ``layer4``) follow the
    reference ResNet-18 naming so that a pretrained ``resnet18`` state dict can
    be loaded with ``strict=False``.

    For a 416x416 input the returned tensor has shape ``(N, 512, 13, 13)``.
    """

    def __init__(self):
        super().__init__()

        # ResNet stem. A 7x7 convolution needs padding=3 (not 1) to match the
        # reference ResNet-18 geometry; with padding=1 the stem output is
        # cropped and no longer lines up with the pretrained weights.
        # NOTE: the reference stem also uses bias=False; the bias is left
        # enabled here only so that existing state-dict keys stay unchanged.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages of two BasicBlocks each. Stages 2-4 halve the
        # resolution and double the channel count.
        self.layer1 = self._make_stage(64, 64, stride=1)
        self.layer2 = self._make_stage(64, 128, stride=2)
        self.layer3 = self._make_stage(128, 256, stride=2)
        self.layer4 = self._make_stage(256, 512, stride=2)

    @staticmethod
    def _make_stage(in_channels, out_channels, stride):
        """Build one residual stage: a (possibly strided) block plus a plain block."""
        downsample = None
        if stride != 1 or in_channels != out_channels:
            # 1x1 projection so the shortcut matches the new shape.
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        return nn.Sequential(
            BasicBlock(in_channels, out_channels, stride, downsample=downsample),
            BasicBlock(out_channels, out_channels),
        )

    def forward(self, x):
        """Run the stem and the four stages on ``x`` and return the C5 feature map."""
        C_1 = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        C_2 = self.layer1(C_1)
        C_3 = self.layer2(C_2)
        C_4 = self.layer3(C_3)
        C_5 = self.layer4(C_4)

        return C_5



# model = Backbone()
# #model.load_state_dict(torch.load('resnet18.pth'), strict=False)
# # print(model) # 打印模型的网络结构
# pre = torch.load('resnet18.pth') # 下载地址：https://download.pytorch.org/models/resnet18-5c106cde.pth
# pre = [k for k, v in pre.items()]
# # print(pre)
# model.state_dict().keys()
# model.load_state_dict(torch.load('resnet18.pth'), strict=False)

In [31]:
# neck: SPP
import torch
import torch.nn as nn

class Neck(nn.Module):
    """Spatial pyramid pooling (SPP) neck.

    The input is max-pooled with several kernel sizes at stride 1 (so the
    spatial size is preserved), the results are concatenated with the input
    along the channel axis, and a 1x1 conv + BN + LeakyReLU reduces the
    channel count again before the detection head.

    Args:
        in_channels: channels of the incoming feature map (512 for the
            backbone output).
        out_channels: channels handed to the detection head.
        pool_sizes: kernel sizes of the pooling branches. They must be odd so
            that ``padding = k // 2`` keeps the spatial size unchanged.
    """

    def __init__(self, in_channels=512, out_channels=512, pool_sizes=(5, 9, 13)):
        super().__init__()
        self.pool_sizes = tuple(pool_sizes)

        # The reduction layers have learnable weights, so they must be created
        # once and registered here. Building them inside forward() would give
        # a new, randomly initialised and untrained layer on every call.
        concat_channels = in_channels * (len(self.pool_sizes) + 1)
        self.spp_to_head = nn.Sequential(
            nn.Conv2d(concat_channels, out_channels, 1),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1, inplace=True),
        )

    def forward(self, x):
        """Return the channel-reduced SPP features; the spatial size equals that of ``x``."""
        # Max pooling has no learnable parameters. With stride 1 and
        # padding k // 2 the output size is (H - k + 2 * (k // 2)) + 1 == H.
        pooled = [
            torch.nn.functional.max_pool2d(x, k, stride=1, padding=k // 2)
            for k in self.pool_sizes
        ]
        # With the defaults: 512 + 3 * 512 = 2048 channels.
        x = torch.cat([x] + pooled, dim=1)
        return self.spp_to_head(x)

In [32]:
# Detection Head
class Head(nn.Module):
    """Detection head: four conv-BN-LeakyReLU stages followed by a 1x1 predictor.

    The input is expected to have 512 channels. The spatial size is preserved
    by every layer, and the output has ``1 + 20 + 4 = 25`` channels per cell.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size) of the four stages. They
        # alternate between a 1x1 squeeze to 256 channels and a 3x3 expansion
        # back to 512. padding = k // 2 keeps the spatial size (0 for the
        # 1x1 kernels, 1 for the 3x3 kernels).
        stage_specs = (
            (512, 256, 1),
            (256, 512, 3),
            (512, 256, 1),
            (256, 512, 3),
        )

        layers = []
        for c_in, c_out, k in stage_specs:
            layers.append(
                nn.Sequential(
                    nn.Conv2d(c_in, c_out, k, stride=1, padding=k // 2),
                    nn.BatchNorm2d(c_out),
                    nn.LeakyReLU(0.1, inplace=True),
                )
            )

        # Final 1x1 prediction layer, e.g. [1, 25, 13, 13] for a 13x13 input.
        layers.append(nn.Conv2d(512, 1 + 20 + 4, 1))

        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked head layers to ``x``."""
        return self.head(x)

In [33]:
# YOLOv1 Model
class YOLOv1(nn.Module):
    """Complete detector: backbone -> neck -> head, applied in that order."""

    def __init__(self):
        super().__init__()

        # The three stages are kept as separate attributes so each one can
        # also be called on its own.
        self.backbone = Backbone()
        self.neck = Neck()
        self.head = Head()

    def forward(self, x):
        """Feed ``x`` through the backbone, the neck and the head; return the head output."""
        features = self.backbone(x)
        features = self.neck(features)
        return self.head(features)

# Build the detector once at module level; later cells call `model` and its
# `backbone` / `neck` / `head` sub-modules directly.
model = YOLOv1()
# print(model)

In [49]:
# Load Data
import torch.utils.data as data
import xml.etree.ElementTree as ET # 读取XML文件需要导入的模块（Python 3.X）
import cv2
import Augmentations
import numpy as np

class Dataset(data.Dataset):
    """PASCAL VOC detection dataset: augmented images with their box annotations.

    Args:
        root: directory of one VOC release; it must contain
            ``ImageSets/Main``, ``JPEGImages`` and ``Annotations``.
        image_set: name of the index file in ``ImageSets/Main`` (without the
            ``.txt`` suffix) listing the image ids to use.
        input_size: size passed to ``Augmentations.SSDAugmentation`` (416 by
            default; 640 is mentioned as an alternative).

    The most recently loaded sample is also kept on the instance
    (``self.image``, ``self.ground_truth``, ...), which interactive code relies
    on. NOTE(review): this per-item state is shared by every call on the same
    instance, so do not read it while another item is being fetched.
    """

    # The 20 VOC object categories; a name's position is its integer label.
    VOC_CLASSES_NAME = ('aeroplane', 'bicycle', 'bird', 'boat',
                        'bottle', 'bus', 'car', 'cat', 'chair',
                        'cow', 'diningtable', 'dog', 'horse',
                        'motorbike', 'person', 'pottedplant',
                        'sheep', 'sofa', 'train', 'tvmonitor')

    def __init__(self, root='/Users/lan/Downloads/VOCdevkit/VOC2007',
                 image_set='train', input_size=416):
        self.input_size = input_size

        # The image/annotation paths are '%s' templates filled in with the
        # image id, so a literal '%' in the root has to be escaped.
        template_root = root.replace('%', '%%')
        self.path_train_txt = root + '/ImageSets/Main/' + image_set + '.txt'
        self.path_train_images = template_root + '/JPEGImages/%s.jpg'
        self.path_train_annotations = template_root + '/Annotations/%s.xml'

        # Built once instead of on every __getitem__ call.
        self.class_name_to_index = dict(zip(self.VOC_CLASSES_NAME,
                                            range(len(self.VOC_CLASSES_NAME))))

        # Image ids, e.g. ['index_1', 'index_2', ...]. The file is closed
        # deterministically, and blank lines are skipped so they cannot turn
        # into empty image ids.
        with open(self.path_train_txt) as index_file:
            self.index_images = [line.strip() for line in index_file if line.strip()]

    def __len__(self):
        """Return the number of image ids listed in the index file."""
        return len(self.index_images)

    def __getitem__(self, index):
        """Load, annotate and augment the sample at ``index``.

        Returns:
            tuple: ``(image, ground_truth, height, width)`` where

            * ``image`` is a tensor with the channel axis first; channels are
              reordered from OpenCV's BGR to RGB,
            * ``ground_truth`` is a float ``np.ndarray`` with one row
              ``[xmin, ymin, xmax, ymax, class_index]`` per object, as returned
              by the augmentation (the class index is a float as well),
            * ``height`` and ``width`` are the size of the image as read from
              disk, before augmentation.

        Raises:
            FileNotFoundError: if the image file cannot be read.
            KeyError: if an annotation uses a class name that is not in
                ``VOC_CLASSES_NAME``.
        """
        self.image_index = self.index_images[index]
        self.ground_truth_xml = ET.parse(self.path_train_annotations % self.image_index).getroot()

        image_path = self.path_train_images % self.image_index
        self.image = cv2.imread(image_path)
        if self.image is None:
            # cv2.imread reports a missing or unreadable file by returning None
            # rather than raising, which would otherwise fail later with an
            # unrelated-looking AttributeError.
            raise FileNotFoundError('cannot read image: %s' % image_path)
        height, width = self.image.shape[:2]

        self.xmin, self.ymin, self.xmax, self.ymax, self.index_class_name = 0, 0, 0, 0, 0

        # Collect [[xmin, ymin, xmax, ymax, class_index], ...] from the XML.
        boxes = []
        for item in self.ground_truth_xml.iter('object'):
            class_name = item.find('name').text.lower().strip()
            self.index_class_name = self.class_name_to_index[class_name]

            bounding_box = item.find('bndbox')
            # Coordinates are normalised by the image size so they follow the
            # image through resizing. The -1 converts the 1-based XML pixel
            # coordinates to 0-based image coordinates. `/` is true division
            # in Python 3, so no explicit float conversion is needed.
            self.xmin = (int(bounding_box.find('xmin').text) - 1) / width
            self.ymin = (int(bounding_box.find('ymin').text) - 1) / height
            self.xmax = (int(bounding_box.find('xmax').text) - 1) / width
            self.ymax = (int(bounding_box.find('ymax').text) - 1) / height

            boxes.append([self.xmin, self.ymin, self.xmax, self.ymax, self.index_class_name])

        # An image without objects gets a single all-zero row so the slicing
        # below still works.
        self.ground_truth = np.array(boxes) if boxes else np.zeros([1, 5])

        # Data augmentation on the image together with its boxes and labels.
        transform = Augmentations.SSDAugmentation(self.input_size)
        self.image, self.bounding_box, self.index_class_name = transform(self.image,
                                                                         self.ground_truth[:, :4],
                                                                         self.ground_truth[:, 4])

        # cv2.imread yields BGR; reorder to RGB (same as img[:, :, ::-1]).
        self.image = self.image[:, :, (2, 1, 0)]

        # bounding_box: (N, 4), index_class_name: (N,) -> expand to (N, 1),
        # then stack side by side into (N, 5).
        self.ground_truth = np.hstack((self.bounding_box,
                                       np.expand_dims(self.index_class_name, axis=1)))

        # Move the channel axis to the front: (H, W, C) -> (C, H, W),
        # equivalent to img.transpose(2, 0, 1).
        self.image = torch.from_numpy(self.image).permute(2, 0, 1)

        # Example of self.ground_truth after this call (labels are floats too):
        # array([[ 0.43127962,  0.20664207,  0.65402844,  0.71217712, 14.        ],
        #        [ 0.13744076,  0.26568266,  0.87914692,  1.        , 12.        ]])
        return self.image, self.ground_truth, height, width


def collate_fn(data_batch):
    """Merge dataset samples into one batch for the DataLoader.

    Args:
        data_batch: the samples of one batch, i.e.
            ``[dataset[0], dataset[1], ..., dataset[batch_size - 1]]``. Each
            sample is indexable, with the image tensor at position 0 and the
            ground-truth rows at position 1; further entries are ignored.

    Returns:
        tuple: ``(images, ground_truth)``. ``images`` is all image tensors
        stacked along a new leading batch axis, giving (batch_size, C, H, W)
        for (C, H, W) images. ``ground_truth`` is a plain list with one float
        tensor per sample; it stays a list because the number of boxes differs
        from image to image.
    """
    # Single pass over the batch: keep the image, convert the labels.
    pairs = [(sample[0], torch.FloatTensor(sample[1])) for sample in data_batch]

    images = [image for image, _ in pairs]
    ground_truth = [target for _, target in pairs]

    return torch.stack(images, 0), ground_truth



In [50]:
# Smoke test: check that the DataLoader can deliver one batch.
# NOTE(review): `data` rebinds the name under which `torch.utils.data` was
# imported, so re-running the cell that defines `class Dataset(data.Dataset)`
# after this one will fail until the import is executed again.
data = Dataset()
dataloader = torch.utils.data.DataLoader(
    dataset=data,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
    # num_workers=2,  # loading works when this argument is left out
    pin_memory=True,
)
i = 0
for iter_i, (images, ground_truth) in enumerate(dataloader):
    # Print the stacked image batch shape, then the number of label tensors.
    print(images.shape, len(ground_truth), sep='\n')
    # `i` is never changed, so the loop stops after the first batch.
    if not i:
        break

torch.Size([1, 3, 416, 416])
1


In [35]:
# Check that the model and each of its stages produce the expected output shape.
data = Dataset()
a = data.__getitem__(1)  # sample at index 1: (image, ground_truth, height, width)
data.ground_truth  # bare expression: only shown when it is the last line of an interactive cell
img = a[0]
img = torch.unsqueeze(img, dim=0)  # add a leading batch axis of size 1
img.shape  # bare expression, no output here
model(img).shape  # result is discarded, but the full forward pass still runs
# From here on `img` is rebound to the output of each stage in turn, so the
# order of the following statements matters.
img = model.backbone(img)
print(img.shape)
img = model.neck(img)
print(img.shape)
img = model.head(img)
print(img.shape)


torch.Size([1, 512, 13, 13])
torch.Size([1, 512, 13, 13])
torch.Size([1, 25, 13, 13])
