## 1.数据准备

### 1.0 加载库、定义函数

In [1]:
import numpy as np
import cv2
from PIL import Image, ImageDraw
import time
import os
import math
import random 
# 导入pytorch相关的库
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
def get_filePath_list(dirPath, partOfFileName=''):
    """Return the paths of the files located directly inside ``dirPath``.

    Only the entries that ``os.walk`` reports as files of the top-level
    directory are considered; sub-directories are not descended into.

    Args:
        dirPath: directory to list.
        partOfFileName: substring that must occur in a file name for the
            file to be kept. The default empty string keeps every file.

    Returns:
        list: ``dirPath`` joined with each matching file name.
    """
    # The first item produced by os.walk describes dirPath itself.
    _, _, fileNames = next(os.walk(dirPath))
    return [
        os.path.join(dirPath, fileName)
        for fileName in fileNames
        if partOfFileName in fileName
    ]

In [6]:
import sys
# 实时更新进度条
def print_flush(print_string):
    """Print ``print_string`` terminated by a carriage return and flush stdout.

    Ending the output with a carriage return instead of a newline lets the
    next call overwrite the same console line, which gives a live progress
    display.
    """
    print(print_string, end='\r', flush=True)

### 1.1 定义数据加载器

In [4]:
from sklearn.model_selection import train_test_split

# Directory that holds the .jpg images of the dataset.
dirPath = '../resources/modified_jpgs/'
# Sort the paths so that a given index always refers to the same file,
# independently of the order in which the file system lists the directory.
imageFilePath_list = sorted(get_filePath_list(dirPath, '.jpg'))
N = len(imageFilePath_list)
index_1dArray = np.arange(N)
# Fixed random_state: the train/test membership must be identical on every
# run, otherwise a model restored from a checkpoint could be evaluated on
# images it was trained on.
trainIndex_1dArray, testIndex_1dArray = train_test_split(
    index_1dArray, test_size=0.2, random_state=0)
len(trainIndex_1dArray), len(testIndex_1dArray)

(1172, 293)

In [5]:
# Class names; the position of a name in this list is its integer class id.
category_list = ['backgroud', 'keyPoint_1', 'keyPoint_2']
# Lookup tables in both directions: class id -> name and name -> class id.
id2name_dict = dict(enumerate(category_list))
name2id_dict = {name: classId for classId, name in enumerate(category_list)}

from xml.etree import ElementTree as ET
def get_label(xmlFilePath):
    """Read the object annotations stored in an XML file.

    Every ``<object>`` element contributes one class id (looked up from its
    ``<name>`` text in ``name2id_dict``) and one box read from the
    ``xmin``/``ymin``/``xmax``/``ymax`` children of its ``<bndbox>``.

    Args:
        xmlFilePath: path of the annotation file.

    Returns:
        tuple: ``(box_list, classId_list)`` where ``box_list`` holds
        ``(xmin, ymin, xmax, ymax)`` integer tuples and ``classId_list`` the
        matching class ids. Both lists are empty when the file does not
        exist or contains no ``<object>`` element.

    Raises:
        KeyError: if an object name is not a key of ``name2id_dict``.
    """
    if not os.path.exists(xmlFilePath):
        # Keep the same 2-tuple shape as the normal path so that callers
        # which unpack ``box_list, classId_list = label`` also work for
        # images that have no annotation file.
        return ([], [])
    with open(xmlFilePath, encoding='utf8') as file:
        root = ET.XML(file.read())
    box_list = []
    classId_list = []
    for object_item in root.findall('object'):
        className = object_item.find('name').text
        classId_list.append(name2id_dict[className])
        bndbox = object_item.find('bndbox')
        box = tuple(
            int(bndbox.find(tag).text)
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        box_list.append(box)
    return (box_list, classId_list)

In [6]:
def get_one_sample(imageFilePath):
    """Load one image together with the annotation stored next to it.

    The annotation path is the image path with its extension replaced by
    ``.xml``.

    Args:
        imageFilePath: path of the image file.

    Returns:
        tuple: ``(image_3dArray, label)`` where ``image_3dArray`` is the
        result of ``np.array`` on the opened image and ``label`` is whatever
        ``get_label`` returns for the annotation path.
    """
    # The context manager releases the file handle deterministically;
    # np.array() forces the pixel data to be read before that happens.
    with Image.open(imageFilePath) as image:
        image_3dArray = np.array(image)
    # splitext copes with extensions of any length ('.jpg', '.jpeg', ...).
    xmlFilePath = os.path.splitext(imageFilePath)[0] + '.xml'
    label = get_label(xmlFilePath)
    return image_3dArray, label

# Read every image into one pre-allocated uint8 array so that batches can
# later be cut out of RAM. For the 1465 images of the recorded run this
# takes 1465 * 1920 * 320 * 3 bytes / 2**30 = 2.5148 GiB.
startTime = time.time()
all_images_4dArray = np.zeros(shape=(N, 1920, 320, 3), dtype=np.uint8)
all_label_list = []
for i, imageFilePath in enumerate(imageFilePath_list):
    image_3dArray, label = get_one_sample(imageFilePath)
    all_images_4dArray[i, ...] = image_3dArray
    all_label_list += [label]
usedTime = time.time() - startTime
print('读取全部图片到内存中, 总共耗时%.4f秒' % (usedTime,))

读取全部图片到内存中, 总共耗时23.0888秒


In [7]:
def get_batch_sample(batchIndex_1dArray):
    """Assemble one batch from the images and labels held in memory.

    Args:
        batchIndex_1dArray: indices into ``all_images_4dArray`` /
            ``all_label_list`` that make up the batch.

    Returns:
        tuple: ``(x, batch_label_list)`` where ``x`` is a float tensor on the
        CUDA device with the last axis of the image array moved to axis 1,
        and ``batch_label_list`` holds the label of each selected image.
    """
    # Indexing with an index array yields a fresh copy, so wrapping it with
    # from_numpy (no second copy) cannot alias the dataset array.
    batch_images_4dArray = all_images_4dArray[batchIndex_1dArray]
    batch_label_list = [all_label_list[k] for k in batchIndex_1dArray]
    # Transfer the compact uint8 data first and convert to float on the GPU.
    x = torch.from_numpy(batch_images_4dArray).to('cuda')
    x = x.permute(0,3,1,2).float()
    return x, batch_label_list

In [8]:
import threading
import queue


train_N = len(trainIndex_1dArray)


class DataLoader(threading.Thread):
    """Background thread that keeps a few training batches ready in a queue.

    The thread starts itself on construction. It walks over
    ``trainIndex_1dArray`` in chunks of ``batch_size`` and, whenever fewer
    than 3 batches are waiting, builds the next one with
    ``get_batch_sample`` and puts it on ``self.queue``. After ``epoch_size``
    batches it starts over from the beginning, reshuffling the index array
    in place first when ``shuffle`` is true.

    Args:
        batch_size: number of samples per batch.
        shuffle: reshuffle ``trainIndex_1dArray`` at the start of each epoch.
        N_worker: accepted for interface compatibility; not used.
    """
    def __init__(self, batch_size=32, shuffle=True, N_worker=2):
        super(DataLoader, self).__init__()
        # Daemon thread: an endless producer must not keep the interpreter
        # alive once the main thread has finished.
        self.daemon = True
        # Output queue holding the prefetched (x, batch_label_list) tuples.
        self.queue = queue.Queue()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.is_stopped = False
        self.batch_index = 0
        self.epoch_size = math.ceil(train_N/batch_size)
        self.start()

    def run(self):
        """Producer loop; runs until ``is_stopped`` is set."""
        while not self.is_stopped:
            if self.queue.qsize() < 3:
                if self.batch_index % self.epoch_size == 0:
                    # Wrap around at every epoch boundary, also when not
                    # shuffling; otherwise the slice below runs past the end
                    # of the index array and yields empty batches.
                    self.batch_index = 0
                    if self.shuffle:
                        random.shuffle(trainIndex_1dArray)
                start_index = self.batch_index * self.batch_size
                end_index = (self.batch_index + 1) * self.batch_size
                batchIndex_1dArray = trainIndex_1dArray[start_index: end_index]
                self.queue.put(get_batch_sample(batchIndex_1dArray))
                self.batch_index += 1
            time.sleep(0.001)

    def get_batch(self):
        """Return the next prefetched batch, blocking until one is available."""
        return self.queue.get()

    def stop(self):
        """Ask the producer loop to finish and discard the queued batches.

        A running thread keeps a reference to this object, so ``__del__``
        cannot be relied on to stop it; call this method explicitly.
        """
        self.is_stopped = True
        if self.is_alive() and threading.current_thread() is not self:
            self.join(timeout=5)
        self._drain()

    def _drain(self):
        """Remove every item currently in the queue without blocking."""
        while True:
            try:
                self.queue.get_nowait()
            except queue.Empty:
                break

    def __del__(self):
        self.is_stopped = True
        self._drain()

# Creating the loader starts its thread, so prefetching begins right here.
batch_size = 32
train_loader = DataLoader(batch_size=batch_size, shuffle=True)

In [12]:
# Number of batches currently waiting in the loader's queue.
train_loader.queue.qsize()

3

## 2.搭建神经网络

In [5]:
class BasicConv(nn.Module):
    """Basic convolution unit applied in the order BatchNorm -> ReLU -> Conv2d.

    Args:
        in_channels: channels of the input (also the BatchNorm width).
        out_channels: channels produced by the convolution.
        kernel_size: square kernel size; the padding is ``(kernel_size-1)//2``.
        stride: stride of the convolution.
        relu: apply a ReLU before the convolution.
        bn: apply BatchNorm2d to the input first.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, relu=True, bn=True):
        super().__init__()
        # Disabled stages are stored as None and skipped in forward().
        self.bn = nn.BatchNorm2d(in_channels) if bn else None
        self.relu = nn.ReLU() if relu else None
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=stride, padding=(kernel_size - 1) // 2)

    def forward(self, x):
        for layer in (self.bn, self.relu):
            if layer is not None:
                x = layer(x)
        return self.conv(x)

    
class Backbone(nn.Module):
    """Backbone network: three stride-2 BasicConv units with a 2x2 max-pool
    after each of the first two, mapping 3 input channels to 32."""
    def __init__(self):
        super().__init__()
        # The first unit skips the ReLU; it still normalises the raw input.
        self.conv1_1 = BasicConv(3, 8, kernel_size=3, stride=2, relu=False)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2_1 = BasicConv(8, 16, kernel_size=3, stride=2)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3_1 = BasicConv(16, 32, kernel_size=3, stride=2)

    def forward(self, x):
        stages = (self.conv1_1, self.pool1, self.conv2_1, self.pool2, self.conv3_1)
        for stage in stages:
            x = stage(x)
        return x
    

class Net(nn.Module):
    """Detection network: the backbone followed by a 3x3 prediction convolution.

    The prediction convolution outputs ``2 + class_quantity`` channels per
    feature-map cell. ``forward`` returns them with the cells flattened into
    one axis: ``(batch, cells, 2 + class_quantity)``.
    """
    def __init__(self, class_quantity=3):
        super().__init__()
        self.class_quantity = class_quantity
        self.backbone = Backbone()
        self.prediction_conv = nn.Conv2d(32, 2+class_quantity, kernel_size=3, padding=1)
        # Registered on the module but not applied in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        featureMap = self.backbone(x)
        # Channels last, so that each cell's values are contiguous in the
        # final axis after flattening.
        prediction = self.prediction_conv(featureMap).permute(0, 2, 3, 1)
        batch, channels = prediction.size(0), prediction.size(-1)
        return prediction.reshape(batch, -1, channels)


net = Net()

In [10]:
def get_priorBox_2d(grid_rows=60, grid_cols=10, cell_size=32):
    """Return the centre point of every cell of a regular grid.

    Cells are enumerated row by row, so the cell in row ``i`` and column
    ``j`` has index ``i * grid_cols + j``.

    Args:
        grid_rows: number of cell rows.
        grid_cols: number of cell columns.
        cell_size: edge length of one square cell in pixels.

    Returns:
        torch.Tensor: float32 tensor of shape ``(grid_rows * grid_cols, 2)``
        whose rows are ``(x, y)`` cell centres.
    """
    box_list = [
        ((col + 0.5) * cell_size, (row + 0.5) * cell_size)
        for row in range(grid_rows)
        for col in range(grid_cols)
    ]
    # reshape keeps the (n, 2) layout even for an empty grid.
    return torch.tensor(box_list, dtype=torch.float32).reshape(-1, 2)


priorBox_2d = get_priorBox_2d()
priorBox_2d[:5]

tensor([[ 16.,  16.],
        [ 48.,  16.],
        [ 80.,  16.],
        [112.,  16.],
        [144.,  16.]])

## 3.定义损失函数

In [17]:
class MultiBoxLoss(nn.Module):
    """Location (MSE) and confidence (cross-entropy) loss for the detector.

    Args:
        priorBox_2d: tensor with one ``(x, y)`` centre per prior cell, in the
            same order as the cells of the network output.
    """
    def __init__(self, priorBox_2d):
        super(MultiBoxLoss, self).__init__()
        self.priorBox_2d = priorBox_2d
        self.locationLoss_function = nn.MSELoss(reduction='mean')
        self.confidenceLoss_function = nn.CrossEntropyLoss(reduction='mean')

    def match(self, label_list, device='cuda'):
        """Build the per-prior regression and classification targets.

        Each ground-truth box is assigned to the prior whose 32-pixel cell
        contains the box centre (the index computation assumes 10 cells per
        row).

        Args:
            label_list: one ``(box_list, classId_list)`` tuple per image.
            device: device the returned tensors are moved to.

        Returns:
            tuple: ``(gtOffset_3d, gtClassId_2d)``; the offset of the box
            centre from the prior centre, and the class id (0 where no box
            was assigned), for every image and prior.
        """
        N = len(label_list)
        N_prior = len(self.priorBox_2d)
        # zeros instead of an uninitialised buffer: unassigned priors must
        # not carry arbitrary memory contents.
        gtOffset_3d = torch.zeros(N, N_prior, 2)
        gtClassId_2d = torch.zeros(N, N_prior).long()
        for index, label in enumerate(label_list):
            box_list, classId_list = label
            for box, classId in zip(box_list, classId_list):
                xmin, ymin, xmax, ymax = box
                center_x = (xmin+xmax) // 2
                center_y = (ymin+ymax) // 2
                i = center_y // 32
                j = center_x // 32
                k = i * 10 + j
                priorBox_1d = self.priorBox_2d[k]
                gtBox_1d = torch.Tensor([center_x, center_y])
                gtOffset_3d[index][k] = gtBox_1d - priorBox_1d
                gtClassId_2d[index][k] = classId
        return gtOffset_3d.to(device), gtClassId_2d.to(device)

    def forward(self, predictions_3d, label_list):
        """Compute the two loss terms for one batch.

        Args:
            predictions_3d: network output of shape
                ``(batch, priors, 2 + classes)``; the first two values of
                each prior are the offset, the rest are class logits.
            label_list: one ``(box_list, classId_list)`` tuple per image.

        Returns:
            tuple: ``(location_loss, confidence_loss)`` scalar tensors.
        """
        pOffset_3d = predictions_3d[:, :, :2]
        pConfidence_3d = predictions_3d[:, :, 2:]
        # Build the targets on the same device as the predictions.
        gtOffset_3d, gtClassId_2d = self.match(
            label_list, device=predictions_3d.device)

        # The location error is computed on positive priors only.
        positive_2d = gtClassId_2d > 0
        if positive_2d.any():
            location_loss = self.locationLoss_function(
                pOffset_3d[positive_2d], gtOffset_3d[positive_2d])
        else:
            # The mean over an empty selection would be NaN; use a zero that
            # is still connected to the graph so backward() keeps working.
            location_loss = pOffset_3d.sum() * 0.0

        # The confidence error is computed on the positives plus, per image,
        # the 2 negatives with the lowest predicted background probability.
        N_negtive = 2
        backgroundProb_2d = F.softmax(pConfidence_3d.detach(), dim=2)[..., 0]
        # Push the positives to the end of the ranking so that they cannot
        # occupy the slots reserved for negatives.
        backgroundProb_2d = backgroundProb_2d.masked_fill(
            positive_2d, float('inf'))
        index_2d = backgroundProb_2d.sort(1)[1]
        # Sorting the sort indices gives the rank of every prior.
        rank_2d = index_2d.sort(1)[1]
        negtive_2d = rank_2d < N_negtive
        isSelected_2d = positive_2d | negtive_2d
        pConfidence_2d = pConfidence_3d[isSelected_2d]
        gtClassId_1d = gtClassId_2d[isSelected_2d]
        confidence_loss = self.confidenceLoss_function(
            pConfidence_2d, gtClassId_1d)

        return location_loss, confidence_loss
    
# Loss object shared by the training loop; it is built on the prior centres.
criterion = MultiBoxLoss(priorBox_2d)

## 4.在训练集上训练模型

In [39]:
epochs = 60
device = torch.device('cuda')
net.to(device)
# Explicitly switch to training mode: another cell may have left the network
# in eval mode, in which BatchNorm would not use batch statistics.
net.train()
train_N = len(trainIndex_1dArray)
epoch_size = math.ceil(train_N / batch_size)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)
# Halve the learning rate every 10 epochs (at epochs 10, 20, ...).
milestone_list = [10 * k for k in range(1, epochs//10)]
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestone_list, gamma=0.5)

startTime = time.time()
for epoch in range(epochs):
    # Advance the schedule once per finished epoch.
    if epoch > 0:
        scheduler.step()
    for step in range(1, epoch_size+1):
        x, label_list = train_loader.get_batch()
        predictions_3d = net(x)
        location_loss, confidence_loss = criterion(predictions_3d, label_list)
        # The confidence term is weighted 20x against the location term.
        loss = location_loss + confidence_loss * 20
        loss_value = loss.item()
        locationLoss_value = location_loss.item()
        confidenceLoss_value = confidence_loss.item()
        print_string = 'epoch: %d step: %d/%d loss:%.6f location_loss:%.6f confidence_loss:%.6f' %(
            epoch, step, epoch_size, loss_value, locationLoss_value, confidenceLoss_value)
        print_flush(print_string)
        # Back-propagation and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
usedTime = time.time() - startTime
print('\n训练过程总共耗时%.4f秒' %usedTime)

epoch: 59 step: 37/37 loss:0.161226 location_loss:0.160827 confidence_loss:0.000020
训练过程总共耗时233.8997秒


In [17]:
# Number of batches still waiting in the loader's queue after training.
train_loader.queue.qsize()

3

## 5.在测试集上测试模型

### 5.1 对比单张图片的标注数据、预测数据

In [14]:
def detect(image, verbose=True):
    """Run the network on one image and return at most one box per class.

    For every foreground class the prior with the highest softmax confidence
    is taken; if that confidence exceeds 0.5, a box reaching 15 pixels in
    each direction from the predicted centre (clamped to the image) is
    reported.

    Args:
        image: a single image in height x width x channel layout (anything
            ``np.array`` accepts).
        verbose: print the elapsed time.

    Returns:
        tuple: ``(pBox_list, pClassId_list)`` with ``(min_x, min_y, max_x,
        max_y)`` integer boxes and their class ids.
    """
    startTime = time.time()
    images_4dArray = np.expand_dims(np.array(image), 0)
    image_height, image_width = images_4dArray.shape[1:3]
    # Inference must run in eval mode: in train mode BatchNorm would
    # normalise with the statistics of this single image and overwrite its
    # running averages. The previous mode is restored afterwards.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            x = torch.ByteTensor(images_4dArray).to(device)
            x = x.permute(0,3,1,2).float()
            predictions_3d = net(x)
    finally:
        net.train(was_training)
    # Post-processing happens on the CPU, where priorBox_2d lives.
    prediction_2d = predictions_3d[0].to('cpu')
    pOffset_2d = prediction_2d[:, :2]
    pConfidence_2d = F.softmax(prediction_2d[:, 2:], 1)
    pBox_list = []
    pClassId_list = []
    # Column 0 is the background; every other column is a foreground class.
    for classId in range(1, pConfidence_2d.size(1)):
        classConfidence_1d = pConfidence_2d[:, classId]
        maxConfidence = classConfidence_1d.max().item()
        if maxConfidence > 0.5:
            pClassId_list.append(classId)
            index = classConfidence_1d.argmax().item()
            pCenter = priorBox_2d[index] + pOffset_2d[index]
            center_x, center_y = int(pCenter[0]), int(pCenter[1])
            min_x = max(center_x - 15, 0)
            max_x = min(center_x + 15, image_width-1)
            min_y = max(center_y - 15, 0)
            max_y = min(center_y + 15, image_height-1)
            box = min_x, min_y, max_x, max_y
            pBox_list.append(box)

    usedTime = time.time() - startTime
    if verbose:
        print('预测耗时%.4f秒' %usedTime)
    return pBox_list, pClassId_list

In [20]:
def test_1():
    """Print the annotation and the prediction of one random test image."""
    sampleIndex = random.choice(testIndex_1dArray)
    image_3dArray, label = get_one_sample(imageFilePath_list[sampleIndex])
    print('标注数据:', label)
    # detect() prints its timing before the result line is written.
    print('预测数据:', detect(image_3dArray))


test_1()

标注数据: ([(97, 1344, 127, 1374)], [2])
预测耗时0.0060秒
预测数据: ([(96, 1343, 126, 1373)], [2])


### 5.2 查看模型预测画框后的图

In [19]:
def test_2():
    """Draw the predicted boxes on one random test image and display it."""
    sampleIndex = random.choice(testIndex_1dArray)
    image_3dArray = np.array(Image.open(imageFilePath_list[sampleIndex]))
    pBox_list, pClassId_list = detect(image_3dArray)
    for (x1, y1, x2, y2), classId in zip(pBox_list, pClassId_list):
        # One colour for class 1, another for every other class.
        color = [0, 255, 0] if classId == 1 else [255, 0, 0]
        # Rectangle from the top-left to the bottom-right corner, 3 px thick.
        cv2.rectangle(image_3dArray, (x1, y1), (x2, y2), color, 3)
    Image.fromarray(image_3dArray).show()


test_2()

预测耗时0.0070秒


### 5.3 模型评价: 整个测试集上的准确率、召回率

#### 5.3.1 通过例子理解准确率、召回率

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

def eval_model(true_y, predicted_y, category_list):
    """Build a per-class precision / recall / F1 / support table.

    Args:
        true_y: ground-truth class ids.
        predicted_y: predicted class ids, aligned with ``true_y``.
        category_list: class names; the position of a name is its class id.

    Returns:
        pandas.DataFrame: one row per class id that occurs in ``true_y`` or
        ``predicted_y``, plus a support-weighted summary row with index 999.
        The Precision, Recall and F1 columns are formatted as strings with
        4 decimals.
    """
    p, r, f1, s = precision_recall_fscore_support(true_y, predicted_y)
    # Without an explicit `labels` argument the metrics are reported for the
    # sorted set of ids present in either input. Pick the names by those ids
    # rather than assuming which class is missing.
    present_label_1dArray = np.unique(
        np.concatenate([np.asarray(true_y), np.asarray(predicted_y)])
    ).astype(int)
    category_1dArray = np.array(category_list)[present_label_1dArray]
    df = pd.DataFrame([category_1dArray, p, r, f1, s]).T
    df.columns = ['Label', 'Precision', 'Recall', 'F1', 'Support']
    # Overall Precision, Recall and F1: averages weighted by support.
    all_label = '总体'
    all_p = np.average(p, weights=s)
    all_r = np.average(r, weights=s)
    all_f1 = np.average(f1, weights=s)
    all_s = np.sum(s)
    row = [all_label, all_p, all_r, all_f1, all_s]
    df.loc[999] = row
    # Show Precision, Recall and F1 with 4 decimals. Series.map is used per
    # column because DataFrame.applymap is deprecated in recent pandas.
    column_list = ['Precision', 'Recall', 'F1']
    df[column_list] = df[column_list].apply(
        lambda column: column.map(lambda x: '%.4f' %x))
    return df

In [22]:
# Toy example with 7 samples: ground-truth ids, predicted ids, class names.
gt_y = [1, 1, 1, 0, 2, 2, 0] 
p_y = [1, 0, 0, 1, 2, 0, 2]
# Note: this rebinds the notebook-level name `category_list`.
category_list = ['background', 'keyPoint_1', 'keyPoint_2']
eval_model(gt_y, p_y, category_list)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,background,0.0,0.0,0.0,2
1,keyPoint_1,0.5,0.3333,0.4,3
2,keyPoint_2,0.5,0.5,0.5,2
999,总体,0.3571,0.2857,0.3143,7


#### 5.3.2 获取测试集所有样本的真实值、预测值

In [23]:
# Gather the annotation and the prediction of every test image, aligned by
# position, from the data already held in memory.
gtLabel_list, pLabel_list = [], []
for index in testIndex_1dArray:
    gtLabel_list.append(all_label_list[index])
    pLabel_list.append(detect(all_images_4dArray[index], verbose=False))

In [24]:
def test_3(gtLabel_list, pLabel_list):
    """Turn box-level results into class-id pairs and evaluate them.

    A ground-truth box counts as detected when a prediction of the same
    class exists and the summed absolute difference of the four box
    coordinates is below 20; otherwise it is recorded as predicted
    background (0). Predictions that matched no ground-truth box are
    recorded as background predicted as that class.

    Args:
        gtLabel_list: one ``(box_list, classId_list)`` tuple per image.
        pLabel_list: the predictions in the same layout and order.

    Returns:
        The table produced by ``eval_model``.
    """
    gt_y, p_y = [], []
    for (gtBox_list, gtClassId_list), (pBox_list, pClassId_list) in zip(gtLabel_list, pLabel_list):
        pMatched_list = [False] * len(pBox_list)
        # First pass: every ground-truth box.
        for gtBox, gtClassId in zip(gtBox_list, gtClassId_list):
            predictedId = 0
            if gtClassId in pClassId_list:
                index = pClassId_list.index(gtClassId)
                diffSum = np.abs(np.subtract(gtBox, pBox_list[index])).sum()
                if diffSum < 20:
                    predictedId = gtClassId
                    pMatched_list[index] = True
            gt_y.append(gtClassId)
            p_y.append(predictedId)
        # Second pass: predictions left unmatched, i.e. background that was
        # predicted as a foreground class.
        for index, matched in enumerate(pMatched_list):
            if not matched:
                gt_y.append(0)
                p_y.append(pClassId_list[index])
    category_list = ['background', 'keyPoint_1', 'keyPoint_2']
    return eval_model(gt_y, p_y, category_list)


test_3(gtLabel_list, pLabel_list)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,keyPoint_1,1.0,1.0,1.0,152
1,keyPoint_2,1.0,1.0,1.0,175
999,总体,1.0,1.0,1.0,327


## 6.模型使用

### 6.1 模型保存

In [35]:
# Store only the parameters (state_dict) of the network in a .pth file.
# Note: this rebinds the notebook-level name `dirPath`.
dirPath = '../resources/trained_weights'
os.makedirs(dirPath, exist_ok=True)
pthFileName = 'ckpt.pth'
pthFilePath = os.path.join(dirPath, pthFileName)
torch.save(net.state_dict(), pthFilePath)

### 6.2  模型加载

In [12]:
pthFilePath = '../resources/trained_weights/ckpt.pth'
# Map the stored tensors to the CPU while reading: a checkpoint written from
# CUDA tensors could otherwise not be opened on a machine without a GPU.
# load_state_dict copies the values into the network's own parameters.
state_dict = torch.load(pthFilePath, map_location='cpu')

In [13]:
# Copy the loaded parameters into the network, move it to the GPU and switch
# it to evaluation mode; the last expression displays the module structure.
net.load_state_dict(state_dict)
device = torch.device('cuda')
net.to(device)
net.eval()

Net(
  (backbone): Backbone(
    (conv1_1): BasicConv(
      (bn): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv): Conv2d(3, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2_1): BasicConv(
      (bn): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
      (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv3_1): BasicConv(
      (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
  (prediction_conv): Conv2d(32, 5, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (softmax): Softmax(dim=1)
)

### 6.3显存占用分析

In [14]:
from torchsummary import summary
# Per-layer output shapes and parameter counts for one 3 x 1920 x 320 input.
summary(net, (3,1920,320))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
       BatchNorm2d-1         [-1, 3, 1920, 320]               6
            Conv2d-2          [-1, 8, 960, 160]             224
         BasicConv-3          [-1, 8, 960, 160]               0
         MaxPool2d-4           [-1, 8, 480, 80]               0
       BatchNorm2d-5           [-1, 8, 480, 80]              16
              ReLU-6           [-1, 8, 480, 80]               0
            Conv2d-7          [-1, 16, 240, 40]           1,168
         BasicConv-8          [-1, 16, 240, 40]               0
         MaxPool2d-9          [-1, 16, 120, 20]               0
      BatchNorm2d-10          [-1, 16, 120, 20]              32
             ReLU-11          [-1, 16, 120, 20]               0
           Conv2d-12           [-1, 32, 60, 10]           4,640
        BasicConv-13           [-1, 32, 60, 10]               0
         Backbone-14           [-1, 32,

In [1]:
# Size in MiB of one 1920 x 320 x 3 input at 4 bytes per value.
1 * 1920 * 320 * 3 * 4 / (2**20)

7.03125

In [205]:
# Element count of the input plus a selection of the layer output shapes
# listed by summary() above, ending with the 5-channel prediction map.
a = 1920 * 320 * 3  + 8 * 960 * 160  + 8 * 480 * 80  + 16 * 240 * 40  + 16 * 120 * 20 + 32 * 60 * 10  + 5 * 60 * 10

In [206]:
# The same element count expressed in MiB at 4 bytes per value.
(a * 4) / (2**20)

13.707733154296875