In [None]:
import torch
import torch.nn as nn
import torchvision

In [None]:
def vgg16(batch_norm=False) -> nn.ModuleList:
    """Build the VGG-16 feature extractor with convolutional fc6/fc7.

    The 3x3 convolution stack is followed by a stride-1 max-pool, a dilated
    3x3 convolution (conv6) and a 1x1 convolution (conv7), which replace the
    original fully connected fc6/fc7 layers.

    Parameters
    ----------
    batch_norm: bool
        Whether to insert a ``BatchNorm2d`` layer after every 3x3 convolution
        of the base stack. Enabling it shifts the index of every later layer.

    Returns
    -------
    nn.ModuleList
        The layers in forward order (35 modules without batch norm).
    """
    # Integers are output channel counts of 3x3 convolutions; the letters are
    # 2x2/stride-2 max-pools ('C' additionally rounds the output size up).
    plan = (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
            512, 512, 512, 'M', 512, 512, 512)
    pool_ceil_mode = {'M': False, 'C': True}

    modules = []
    channels = 3
    for item in plan:
        if item in pool_ceil_mode:
            modules.append(nn.MaxPool2d(2, 2, ceil_mode=pool_ceil_mode[item]))
            continue

        modules.append(nn.Conv2d(channels, item, 3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(item))
        modules.append(nn.ReLU(True))
        channels = item

    # Convolutional replacement of fc6/fc7; conv6 is dilated to enlarge the
    # receptive field without reducing the resolution.
    pool5 = nn.MaxPool2d(3, 1, 1)
    conv6 = nn.Conv2d(512, 1024, 3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, 1)
    modules += [pool5, conv6, nn.ReLU(True), conv7, nn.ReLU(True)]

    return nn.ModuleList(modules)


# Prior (anchor) box generation
# coding:utf-8
from itertools import product
from math import sqrt

import torch


class PriorBox:
    """Generator of prior (anchor) boxes in normalised centre form."""

    def __init__(self, image_size=300, feature_maps: list = None, min_sizes: list = None,
                 max_sizes: list = None, aspect_ratios: list = None, steps: list = None, **kwargs):
        """
        Parameters
        ----------
        image_size: int
            Side length of the (square) input image.

        feature_maps: list
            Side length of every feature map that predicts boxes.

        min_sizes: list
            Size of the smaller square prior box of each feature map, in
            input-image pixels.

        max_sizes: list
            Size of the smaller square prior box of the next feature map;
            combined with ``min_sizes`` to size the larger square box.

        aspect_ratios: list
            Extra aspect ratios per feature map. Every ratio ``ar`` adds two
            boxes (``ar`` and ``1 / ar``).

        steps: list
            Spacing between neighbouring box centres of each feature map, in
            input-image pixels.

        **kwargs:
            Accepted and ignored, so a larger configuration dict can be
            unpacked into the constructor.

        Notes
        -----
        Any list argument that is ``None`` or empty falls back to the built-in
        default for a 300x300 input.
        """
        self.image_size = image_size
        self.feature_maps = feature_maps or [38, 19, 10, 5, 3, 1]
        self.min_sizes = min_sizes or [30, 60, 111, 162, 213, 264]
        self.max_sizes = max_sizes or [60, 111, 162, 213, 264, 315]
        self.steps = steps or [8, 16, 32, 64, 100, 300]
        self.aspect_ratios = aspect_ratios or [
            [2], [2, 3], [2, 3], [2, 3], [2], [2]]

    def __call__(self):
        """Generate every prior box.

        Returns
        -------
        boxes: Tensor of shape (n_priors, 4)
            Boxes as ``(cx, cy, w, h)``, each value clamped to ``[0, 1]``.
            Boxes are ordered by feature map, then row by row over the cells,
            then: small square, large square, and the ``ar`` / ``1 / ar``
            pair of every aspect ratio.
        """
        boxes = []

        for k, f in enumerate(self.feature_maps):
            # Everything below depends only on the feature map, not on the
            # cell, so it is computed once per map instead of once per cell.
            f_k = self.image_size / self.steps[k]
            s_k = self.min_sizes[k] / self.image_size
            s_k_prime = sqrt(s_k * self.max_sizes[k] / self.image_size)

            shapes = [(s_k, s_k), (s_k_prime, s_k_prime)]
            for ar in self.aspect_ratios[k]:
                root = sqrt(ar)
                shapes.append((s_k * root, s_k / root))
                shapes.append((s_k / root, s_k * root))

            for i, j in product(range(f), repeat=2):
                # Cell centre; x grows to the right, y grows downwards.
                cx = (j + 0.5) / f_k
                cy = (i + 0.5) / f_k
                boxes.extend([cx, cy, w, h] for w, h in shapes)

        # reshape keeps the documented (n_priors, 4) shape even when no box
        # was generated.
        boxes = torch.tensor(boxes, dtype=torch.get_default_dtype()).reshape(-1, 4)
        return boxes.clamp(min=0, max=1)


# L2 normalization
from torch import Tensor


class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale."""

    def __init__(self, n_channels: int, scale=20):
        """
        Parameters
        ----------
        n_channels: int
            Number of channels of the input feature map.

        scale: float
            Initial value of every per-channel scale weight.
        """
        super().__init__()
        self.n_channels = n_channels
        self.gamma = scale
        # Added to the norm so an all-zero feature vector does not divide by zero.
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.empty(n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Reset every scale weight to the initial ``scale`` value."""
        nn.init.constant_(self.weight, self.gamma)

    def forward(self, x: Tensor):
        """Normalise ``x`` along dim 1 and rescale every channel.

        ``x`` is indexed as a 4-D tensor with channels on dim 1; the result
        has the same shape.
        """
        squared_sum = x.pow(2).sum(dim=1, keepdim=True)
        magnitude = squared_sum.sqrt() + self.eps
        # Broadcast the weight as [1, n_channels, 1, 1].
        per_channel_scale = self.weight.view(1, -1, 1, 1)
        return (x / magnitude) * per_channel_scale


# SSD model implementation


class SSD(nn.Module):
    """SSD detection network: VGG-16 backbone, extra layers and box heads."""

    def __init__(self, n_classes: int, variance=(0.1, 0.2), top_k=200, conf_thresh=0.01,
                 nms_thresh=0.45, image_size=300, **config):
        """
        Parameters
        ----------
        n_classes: int
            Number of classes to predict, background included.

        variance: Tuple[float, float]
            Variances of the prior boxes. Only its length is validated here.

        top_k: int
            Upper bound of boxes kept per class. Not used by this class itself.

        conf_thresh: float
            Confidence threshold. Not used by this class itself.

        nms_thresh: float
            IoU threshold for NMS. Not used by this class itself.

        image_size: int
            Side length of the input image.

        **config:
            Prior-box configuration, forwarded to ``PriorBox``.

        Raises
        ------
        ValueError
            If ``variance`` does not contain exactly two elements.
        """
        super().__init__()

        if len(variance) != 2:
            raise ValueError("variance 只能有 2 元素")

        self.n_classes = n_classes
        self.image_size = image_size
        config['image_size'] = image_size

        # Prior boxes are constant. A non-persistent buffer follows the model
        # through .to()/.cuda() without adding a key to the state dict, so
        # existing checkpoints still load.
        self.priorbox_generator = PriorBox(**config)
        self.register_buffer('prior', self.priorbox_generator(), persistent=False)

        # Backbone and the L2 normalisation applied to the conv4_3 feature map.
        self.vgg = vgg16()
        self.l2norm = L2Norm(512, 20)

        # Extra feature layers: pairs of a 1x1 reduction and a 3x3 convolution.
        self.extras = nn.ModuleList([
            nn.Conv2d(1024, 256, 1),  # conv8
            nn.Conv2d(256, 512, 3, stride=2, padding=1),
            nn.Conv2d(512, 128, 1),  # conv9
            nn.Conv2d(128, 256, 3, stride=2, padding=1),
            nn.Conv2d(256, 128, 1),  # conv10
            nn.Conv2d(128, 256, 3),
            nn.Conv2d(256, 128, 1),  # conv11
            nn.Conv2d(128, 256, 3),
        ])

        # Channels of the six source feature maps, in forward order.
        source_channels = [512, 1024, 512, 256, 256, 256]
        # PriorBox emits two square boxes plus two boxes per extra aspect
        # ratio for every cell; deriving the head width from the generator
        # keeps the heads consistent with a custom ``aspect_ratios`` config
        # (the defaults give 4, 6, 6, 6, 4, 4).
        boxes_per_cell = [
            2 + 2 * len(ratios) for ratios in self.priorbox_generator.aspect_ratios]

        self.confs = nn.ModuleList([
            nn.Conv2d(channels, n_classes * n_boxes, 3, padding=1)
            for channels, n_boxes in zip(source_channels, boxes_per_cell)
        ])
        self.locs = nn.ModuleList([
            nn.Conv2d(channels, 4 * n_boxes, 3, padding=1)
            for channels, n_boxes in zip(source_channels, boxes_per_cell)
        ])

        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Parameters
        ----------
        x: Tensor of shape (N, 3, H, W)
            Image batch.

        Returns
        -------
        loc: Tensor of shape (N, n_priors, 4)
            Predicted box offsets.

        conf: Tensor of shape (N, n_priors, n_classes)
            Predicted class scores (raw head outputs, no softmax applied).

        prior: Tensor of shape (n_priors, 4)
            Prior boxes.
        """
        loc = []
        conf = []
        sources = []

        # Batch size
        N = x.size(0)

        # Layers 0..22 of the backbone built without batch norm end with the
        # ReLU after conv4_3.
        for layer in self.vgg[:23]:
            x = layer(x)

        # First source: L2-normalised conv4_3 feature map.
        sources.append(self.l2norm(x))

        # Remaining backbone layers up to conv7.
        for layer in self.vgg[23:]:
            x = layer(x)

        # Second source: conv7 feature map.
        sources.append(x)

        # Extra layers; the output of every second one (the 3x3 convolution)
        # is a source feature map.
        for i, layer in enumerate(self.extras):
            x = self.relu(layer(x))
            if i % 2 == 1:
                sources.append(x)

        # Apply the heads and move channels last so that the predictions of
        # all feature maps can be flattened and concatenated.
        for feature, conf_layer, loc_layer in zip(sources, self.confs, self.locs):
            loc.append(loc_layer(feature).permute(0, 2, 3, 1).contiguous())
            conf.append(conf_layer(feature).permute(0, 2, 3, 1).contiguous())

        conf = torch.cat([pred.view(N, -1) for pred in conf], dim=1)
        loc = torch.cat([pred.view(N, -1) for pred in loc], dim=1)

        return loc.view(N, -1, 4), conf.view(N, -1, self.n_classes), self.prior

In [None]:
class MultiBoxLoss(nn.Module):
    """SSD multi-box loss: class cross-entropy plus Smooth-L1 box regression.

    Priors are matched to ground-truth boxes per image by IoU threshold.
    The regression loss is taken over the matched (positive) priors only. The
    classification loss is taken over the positives plus the hardest
    negatives, at most ``neg_pos_ratio`` negatives per positive. The sum is
    divided by the number of positives in the batch.

    Priors are expected in centre form ``(cx, cy, w, h)``, as produced by
    ``PriorBox``. Ground-truth rows are ``[label, x1, y1, x2, y2]`` with label
    0 reserved for background.
    NOTE(review): the boxes are assumed to be corner form, normalised to the
    same [0, 1] range as the priors -- confirm against the data loader.
    """

    def __init__(self, priors, overlap_thresh=0.5, neg_pos_ratio=3, alpha=1.0,
                 variance=(0.1, 0.2)):
        """
        Parameters:
        - priors: Tensor (num_priors, 4), prior boxes in centre form; used by
          ``match_priors_with_gt`` when no priors are passed explicitly
        - overlap_thresh: float, a prior whose best IoU exceeds this value is
          a positive sample
        - neg_pos_ratio: int, maximum number of negatives kept per positive
        - alpha: float, weight of the regression loss
        - variance: Tuple[float, float], scaling of the encoded centre and
          size offsets

        Raises:
        - ValueError: if ``variance`` does not contain exactly two elements
        """
        super().__init__()

        if len(variance) != 2:
            raise ValueError("variance must contain exactly 2 elements")

        self.priors = priors
        self.overlap_thresh = overlap_thresh
        self.neg_pos_ratio = neg_pos_ratio
        self.alpha = alpha
        self.variance = tuple(variance)
        # Guards divisions and logarithms against degenerate (zero-size) boxes.
        self.eps = 1e-10

        # Retained so code that reads this attribute keeps working; forward()
        # needs the per-prior loss and therefore uses the functional form.
        self.cross_entropy = nn.CrossEntropyLoss(reduction='sum')
        self.smooth_l1 = nn.SmoothL1Loss(reduction='sum')

    def forward(self, loc_preds, conf_preds, priors, targets):
        """
        Compute the combined classification and regression loss.

        Parameters:
        - loc_preds: Tensor (batch_size, num_priors, 4), predicted offsets
        - conf_preds: Tensor (batch_size, num_priors, num_classes), class logits
        - priors: Tensor (num_priors, 4), prior boxes in centre form
        - targets: Tensor (batch_size, num_objects, 5) or a sequence with one
          (num_objects, 5) tensor per image; rows are [label, x1, y1, x2, y2].
          Zero-area padding rows never match a prior.

        Returns:
        - scalar Tensor, (conf_loss + alpha * loc_loss) / max(num_positives, 1)
        """
        batch_size, n_priors, n_classes = conf_preds.shape
        device, dtype = loc_preds.device, loc_preds.dtype
        priors = priors.to(device=device, dtype=dtype)

        # Step 1 and 2: match the priors of every image to its own objects.
        conf_labels = torch.zeros(batch_size, n_priors, dtype=torch.long, device=device)
        loc_targets = torch.zeros(batch_size, n_priors, 4, dtype=dtype, device=device)
        for b in range(batch_size):
            gt = torch.as_tensor(targets[b], dtype=dtype, device=device)
            iou_matrix = self.calculate_iou(priors, gt)
            conf_labels[b], loc_targets[b] = self.match_priors_with_gt(iou_matrix, gt, priors)

        pos = conf_labels > 0
        num_pos = pos.sum(dim=1, keepdim=True)

        # Step 3: regression loss over the positive priors only.
        loc_loss = self.smooth_l1(loc_preds[pos], loc_targets[pos])

        # Step 4: classification loss with hard negative mining.
        per_prior_ce = nn.functional.cross_entropy(
            conf_preds.reshape(-1, n_classes), conf_labels.reshape(-1), reduction='none'
        ).view(batch_size, n_priors)

        # Rank the negatives of every image by their loss; positives are
        # zeroed so they sort behind the negatives.
        mining_loss = per_prior_ce.detach().clone()
        mining_loss[pos] = 0
        _, order = mining_loss.sort(dim=1, descending=True)
        _, rank = order.sort(dim=1)
        neg = rank < self.neg_pos_ratio * num_pos

        conf_loss = per_prior_ce[pos | neg].sum()

        # Normalise by the number of positives; clamp so that a batch without
        # any match yields 0 instead of NaN.
        normalizer = num_pos.sum().clamp(min=1)
        total_loss = (conf_loss + self.alpha * loc_loss) / normalizer
        return total_loss

    def calculate_iou(self, priors, targets):
        """Compute the IoU between every prior and every ground-truth box.

        Parameters:
        - priors: Tensor (num_priors, 4) in centre form (cx, cy, w, h)
        - targets: Tensor (num_objects, 4) of corner-form boxes, or
          (num_objects, 5) with the label in column 0

        Returns:
        - Tensor (num_priors, num_objects)
        """
        # The last four columns are the box in both accepted layouts.
        boxes = targets[:, -4:]

        half_wh = priors[:, 2:] / 2
        prior_lt = priors[:, :2] - half_wh
        prior_rb = priors[:, :2] + half_wh

        lt = torch.max(prior_lt[:, None, :], boxes[None, :, :2])
        rb = torch.min(prior_rb[:, None, :], boxes[None, :, 2:])
        inter_wh = (rb - lt).clamp(min=0)
        inter = inter_wh[..., 0] * inter_wh[..., 1]

        prior_area = priors[:, 2] * priors[:, 3]
        box_wh = (boxes[:, 2:] - boxes[:, :2]).clamp(min=0)
        box_area = box_wh[:, 0] * box_wh[:, 1]

        union = prior_area[:, None] + box_area[None, :] - inter
        return inter / union.clamp(min=self.eps)

    def match_priors_with_gt(self, iou_matrix, targets, priors=None):
        """
        Assign a class label and an encoded regression target to every prior.

        Parameters:
        - iou_matrix: Tensor (num_priors, num_objects)
        - targets: Tensor (num_objects, 5), rows [label, x1, y1, x2, y2]
        - priors: optional Tensor (num_priors, 4) in centre form; defaults to
          the priors given to the constructor

        Returns:
        - conf_labels: LongTensor (num_priors,), 0 for background
        - loc_targets: Tensor (num_priors, 4), encoded offsets for positive
          priors and zeros elsewhere
        """
        device = iou_matrix.device
        if priors is None:
            priors = self.priors
        priors = priors.to(device)
        targets = targets.to(device)

        n_priors = iou_matrix.size(0)
        conf_labels = torch.zeros(n_priors, dtype=torch.long, device=device)
        loc_targets = torch.zeros(n_priors, 4, dtype=priors.dtype, device=device)

        # An image without objects leaves every prior as background.
        if iou_matrix.size(1) == 0:
            return conf_labels, loc_targets

        # Best ground-truth box for every prior.
        best_iou, best_idx = iou_matrix.max(dim=1)

        # Priors above the threshold are positives; all others keep the
        # background label 0.
        pos_mask = best_iou > self.overlap_thresh
        matched = targets[best_idx[pos_mask]]
        conf_labels[pos_mask] = matched[:, 0].long()
        loc_targets[pos_mask] = self._encode(matched[:, 1:5], priors[pos_mask]).to(priors.dtype)

        return conf_labels, loc_targets

    def _encode(self, boxes, priors):
        """Encode corner-form boxes as variance-scaled offsets from their priors."""
        prior_wh = priors[:, 2:].clamp(min=self.eps)
        box_cxcy = (boxes[:, :2] + boxes[:, 2:]) / 2
        box_wh = (boxes[:, 2:] - boxes[:, :2]).clamp(min=self.eps)

        d_cxcy = (box_cxcy - priors[:, :2]) / (self.variance[0] * prior_wh)
        d_wh = torch.log(box_wh / prior_wh) / self.variance[1]
        return torch.cat([d_cxcy, d_wh], dim=1)