# Task-1:深度模型对抗鲁棒性评估
## Task 1.1:构建和训练神经网络
> 模型一：SimpleCNN
> 模型二：DeepCNN
> 模型三：RobustCNN

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import torchvision
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# ========================
# 数据预处理部分
# ========================

# 定义图像变换：将图像转为张量，并进行归一化处理
# MNIST 原始灰度值范围为 [0, 255]，ToTensor 会转换为 [0, 1]
# Normalize 使其分布变为均值为 0，标准差为 1（对抗训练常用）
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])]
)

# 加载训练集
train_dataset = torchvision.datasets.MNIST(
    'dataset/mnist-pytorch', # 存储路径
    train=True, # 训练集
    download=True, # 自动下载
    transform=transform # 应用预处理
)

# 加载测试集
test_dataset = torchvision.datasets.MNIST(
    'dataset/mnist-pytorch',
    train=False,
    transform=transform,
)

# 封装为数据加载器，设定 batch 大小为 128
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [5]:
# ========================
# 模型定义部分
# ========================

# -------- SimpleCNN --------
# 简单的两层卷积神经网络，适合基础分类任务
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), # 卷积层：输入1通道，输出16通道，3x3卷积核
            nn.ReLU(),
            nn.MaxPool2d(2), # 下采样：2x2窗口
            nn.Conv2d(16, 32, 3, padding=1), # 第二个卷积层
            nn.ReLU(),
            nn.MaxPool2d(2), # 再次下采样
            nn.Flatten(), # 展平为向量
            nn.Linear(32 * 7 * 7, 128), # 全连接层
            nn.ReLU(),
            nn.Linear(128, 10) # 输出层：10 类
        )

    def forward(self, x):
        return self.net(x)

In [6]:
# -------- DeepCNN --------
# 更深层次的 CNN，具有更多卷积层，提取更复杂的特征
class DeepCNN(nn.Module):
    def __init__(self):
        super(DeepCNN, self).__init__()
        
        # 特征提取部分（卷积层 + 激活函数 + 池化层）
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), # 输入通道数为1（灰度图像），输出32个特征图，3x3卷积核，padding=1保持尺寸
            nn.ReLU(), # 非线性激活函数
            nn.Conv2d(32, 64, 3, padding=1), # 将特征图从32变为64
            nn.ReLU(),
            nn.MaxPool2d(2), # 2x2最大池化，尺寸减半（28x28 -> 14x14）
            nn.Conv2d(64, 128, 3, padding=1), # 将特征图从64变为128
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, padding=1), # 继续提取更复杂的特征
            nn.ReLU(),
            nn.MaxPool2d(2) # 再次进行池化（14x14 -> 7x7）
        )
        self.classifier = nn.Sequential(
            nn.Flatten(), # 展平为一维向量，大小为128 * 7 * 7
            nn.Linear(128 * 7 * 7, 256), # 全连接层，输出维度为256
            nn.ReLU(),
            nn.Linear(256, 10) # 最终输出10类（对应MNIST的数字0-9）
        )

    def forward(self, x):
        x = self.features(x) # 提取特征
        x = self.classifier(x)  # 分类输出
        return x

In [7]:
# 残差块定义
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        
        # 第一个卷积层：调整通道数或空间尺寸（根据 stride
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.gn1 = nn.GroupNorm(4, out_channels) # 使用 GroupNorm（每组4个通道）
        self.relu = nn.ReLU(inplace=True)
        
        # 第二个卷积层：保持尺寸和通道不变
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.gn2 = nn.GroupNorm(4, out_channels)

        # 下采样模块：如果输入输出通道不同或尺寸变化，使用1x1卷积调整
        self.downsample = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.GroupNorm(4, out_channels)
            )

    def forward(self, x):
        identity = self.downsample(x) # 残差连接的分支（shortcut）
        out = self.conv1(x)
        out = self.gn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.gn2(out)
        out += identity # 残差连接：输出加上shortcut
        return self.relu(out) # 激活后返回

class DefenseCNN(nn.Module):
    def __init__(self):
        super(DefenseCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1), # 初始卷积层，输入通道为1（灰度图）
            nn.GroupNorm(4, 32),
            nn.ReLU(),

            ResidualBlock(32, 64, stride=1), # 第一个残差块
            nn.MaxPool2d(2), # 池化，减小空间尺寸
            nn.Dropout(0.2), # Dropout防止过拟合

            ResidualBlock(64, 128, stride=1), # 第二个残差块
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
        )

        self.classifier = nn.Sequential(
            nn.Flatten(), # 展平，输入为 128 x 7 x 7
            nn.Linear(128 * 7 * 7, 256), # 全连接层输出 256 维
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 10) # 最终输出10类
        )

    def forward(self, x):
        x = self.features(x)
        return self.classifier(x)

In [8]:
# ========================
# 模型训练函数
# ========================
def train(model, train_loader, test_loader, epochs, device, model_name='Model'):
    model.to(device)
    criterion = nn.CrossEntropyLoss() # 使用交叉熵损失函数
    optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam 优化器

    total_acc = 0.0
    
    for epoch in range(1, epochs + 1):
        model.train()  # 进入训练模式
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"{model_name} Epoch {epoch}/{epochs}", ncols=100)
        
        # -------- 训练每个 batch --------
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward() # 反向传播
            optimizer.step() # 参数更新

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        # -------- 评估模型准确率 --------
        model.eval() # 进入评估模式
        correct = 0
        total = 0
        with torch.no_grad(): # 关闭梯度以加速
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1) # 获取预测类别
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        total_acc += accuracy

        print(f"epoch={epoch} accuracy={accuracy:.2f}%")
        
    final_acc = total_acc / epochs
    print(f"===> Final Accuracy= {final_acc:.2f}%")

    torch.save(model.state_dict(), f"{model_name}.pth")
    print(f"===> {model_name} model saved.")
        
# ========================
# 主函数：依次训练三个模型
# ========================
def test_all_models():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    epochs = 4 # 可以根据实际训练需求增加

    model1 = SimpleCNN()
    model2 = DeepCNN()
    model3 = DefenseCNN()

     # 分别训练并保存每个模型
    train(model1, train_loader, test_loader, epochs, device, model_name="M1")
    train(model2, train_loader, test_loader, epochs, device, model_name="M2")
    train(model3, train_loader, test_loader, epochs, device, model_name="M3")

# 调用主函数，开始训练
test_all_models()

M1 Epoch 1/4: 100%|███████████████████████████████████| 469/469 [00:11<00:00, 39.11it/s, loss=0.153]


epoch=1 accuracy=97.66%


M1 Epoch 2/4: 100%|██████████████████████████████████| 469/469 [00:08<00:00, 56.16it/s, loss=0.0977]


epoch=2 accuracy=98.39%


M1 Epoch 3/4: 100%|██████████████████████████████████| 469/469 [00:08<00:00, 52.44it/s, loss=0.0431]


epoch=3 accuracy=98.56%


M1 Epoch 4/4: 100%|██████████████████████████████████| 469/469 [00:08<00:00, 54.51it/s, loss=0.0101]


epoch=4 accuracy=98.77%
===> Final Accuracy= 98.34%
===> M1 model saved.


M2 Epoch 1/4: 100%|██████████████████████████████████| 469/469 [00:16<00:00, 28.08it/s, loss=0.0451]


epoch=1 accuracy=98.55%


M2 Epoch 2/4: 100%|█████████████████████████████████| 469/469 [00:16<00:00, 28.58it/s, loss=0.00262]


epoch=2 accuracy=99.08%


M2 Epoch 3/4: 100%|██████████████████████████████████| 469/469 [00:17<00:00, 26.74it/s, loss=0.0058]


epoch=3 accuracy=99.12%


M2 Epoch 4/4: 100%|█████████████████████████████████| 469/469 [00:17<00:00, 26.16it/s, loss=0.00526]


epoch=4 accuracy=99.19%
===> Final Accuracy= 98.98%
===> M2 model saved.


M3 Epoch 1/4: 100%|███████████████████████████████████| 469/469 [00:43<00:00, 10.68it/s, loss=0.122]


epoch=1 accuracy=98.18%


M3 Epoch 2/4: 100%|██████████████████████████████████| 469/469 [01:03<00:00,  7.38it/s, loss=0.0511]


epoch=2 accuracy=98.69%


M3 Epoch 3/4: 100%|██████████████████████████████████| 469/469 [01:03<00:00,  7.38it/s, loss=0.0364]


epoch=3 accuracy=98.98%


M3 Epoch 4/4: 100%|███████████████████████████████████| 469/469 [01:06<00:00,  7.05it/s, loss=0.132]


epoch=4 accuracy=99.04%
===> Final Accuracy= 98.72%
===> M3 model saved.


## Task 1.2:评估神经网络的鲁棒性
### 步骤一：选择三种对抗攻击方式，并报告平均对抗距离与对抗攻击成功率
> FGSM

In [9]:
import random
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def fast_gradient_sign_method(model, imgs, labels, epsilon=0.02):
    model.eval() # 模型处于评估模式
    inp_imgs = imgs.clone().to(device).requires_grad_() # 复制一份imgs，标记为需要求梯度的张量
    preds = model(inp_imgs) # 前向传播获取真实图像的预测结果
    preds = F.log_softmax(preds, dim=-1) # 将预测结果转化为概率的形式 [0.2, 0.12, ...]

    loss = F.nll_loss(preds, labels.to(device)) # 计算负对数似然损失(NLL loss)
    loss.backward() # 反向传播

    noise_grad = torch.sign(inp_imgs.grad.detach()).to(imgs.device) # 对抗扰动方向
    fake_imgs = imgs + epsilon * noise_grad # 计算对抗样本
    fake_imgs.detach_() # 分离计算图，避免后续计算梯度
    return fake_imgs, epsilon * noise_grad

def evaluate_fgsm(model_class, weight_path, data_loader, epsilon, num_samples):
    # 实例化模型并加载预训练权重
    model = model_class().to(device)
    model.load_state_dict(torch.load(weight_path, map_location=device))
    model.eval()
    
    # 固定随机种子以保证结果可复现
    seed = 22051022
    random.seed(seed)
    
    # 整理全部测试图像和标签
    all_imgs, all_labels = [], []
    for imgs, labels in data_loader:
        all_imgs.append(imgs)
        all_labels.append(labels)
    all_imgs = torch.cat(all_imgs)
    all_labels = torch.cat(all_labels)
    
    # 随机选择样本进行攻击
    indices = random.sample(range(len(all_imgs)), num_samples)
    images = all_imgs[indices]
    labels = all_labels[indices]
    
    # 生成FGSM对抗样本
    adv_images, perturbations = fast_gradient_sign_method(model, images, labels, epsilon)

    # 获取模型在原图和对抗图像上的预测结果
    with torch.no_grad():
        preds_before = model(images.to(device)).argmax(dim=1)
        preds_after = model(adv_images.to(device)).argmax(dim=1)

    # 判断哪些样本攻击前预测正确（才算成功攻击的候选）
    success_before = (preds_before.cpu() == labels)
    success_after = (preds_after.cpu() == labels)
    
    # 标记成功被攻击的样本：攻击前正确，攻击后错误
    success_mask = success_before & (~success_after)
    
    # 对这些样本计算L2扰动距离
    pert_norms = torch.norm((adv_images - images).view(images.size(0), -1), p=2, dim=1)
    
    # 原始准确率、攻击后准确率
    accuracy_before = success_before.sum().item() / len(labels)
    accuracy_after = success_after.sum().item() / len(labels)
    
    # 攻击成功率 = 预测正确但被误分类的比例
    attack_success_rate = 1.0 - accuracy_after
    
    # 只统计成功攻击样本的平均L2扰动距离
    avg_perturbation = pert_norms[success_mask].mean().item() 

    print(f"accuracy_before={accuracy_before:.2f} accuracy_after={accuracy_after:.2f} attack_success_rate={attack_success_rate:.2f} avg_perturbation={avg_perturbation:.2f}")


In [10]:
test_loader = DataLoader(test_dataset, batch_size=128)

epsilon=0.1 # FGSM扰动强度
num_samples=100 # 随机采样样本数用于评估

stats_Simple = evaluate_fgsm(SimpleCNN, "M1.pth", test_loader, epsilon, num_samples)
print(f"Simple")

stats_Deep = evaluate_fgsm(DeepCNN, "M2.pth", test_loader, epsilon, num_samples)
print(f"Deep")

stats_Defense = evaluate_fgsm(DefenseCNN, "M3.pth", test_loader, epsilon, num_samples)
print(f"Defense")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.94 attack_success_rate=0.06 avg_perturbation=2.80
Simple
accuracy_before=0.99 accuracy_after=0.97 attack_success_rate=0.03 avg_perturbation=2.80
Deep
accuracy_before=1.00 accuracy_after=0.97 attack_success_rate=0.03 avg_perturbation=2.80
Defense


> PGD Attack

In [11]:
def projected_gradient_descent(model, imgs, labels, steps=4, alpha=0.02, epsilon=0.1):
    model.eval()
    
    # 初始化对抗扰动
    delta = torch.zeros_like(imgs, device=device).requires_grad_() # cuda
    fake_imgs = imgs.to(device)

    for t in range(steps):
        # 计算当前的对抗样本
        fake_imgs = imgs.to(device) + delta
        fake_imgs.retain_grad()
        
        preds = model(fake_imgs)
        preds = F.log_softmax(preds, dim=1)

        # 计算损失函数并反向传播
        loss = F.nll_loss(preds, labels.to(device))
        loss.backward()

        # # 提取梯度方向
        noise_grad = fake_imgs.grad.data.sign().to(device)
        # noise_grad = torch.sign(fake_imgs.grad.detach()).to(device)
        fake_imgs.grad.zero_() # 清空梯度，防止梯度叠加

        # 更新扰动
        delta.data.add_(alpha * noise_grad)
        delta.data.clamp_( - epsilon,  + epsilon) # 将对抗扰动控制在[-eps, eps]范围

    return fake_imgs.to(imgs.device).detach_(), delta.detach_().to(imgs.device)
    # return fake_imgs.to(imgs.device).detach_(), k_grad.to(imgs.device)/steps
    
def evaluate_pgd(model_class, weight_path, data_loader, num_samples):
    model = model_class().to(device)
    model.load_state_dict(torch.load(weight_path, map_location=device))
    model.eval()
    
    # 设定随机种子，确保可复现
    seed = 22051022
    random.seed(seed)
    
    # 收集所有测试图像与标签
    all_imgs, all_labels = [], []
    for imgs, labels in data_loader:
        all_imgs.append(imgs)
        all_labels.append(labels)
    all_imgs = torch.cat(all_imgs)
    all_labels = torch.cat(all_labels)
    
    # 从中随机采样 num_samples 个用于评估
    indices = random.sample(range(len(all_imgs)), num_samples)
    images = all_imgs[indices]
    labels = all_labels[indices]
    
    # 使用 PGD 生成对抗样本
    adv_images, perturbations = projected_gradient_descent(model, images, labels)
    
    # 计算攻击前后的预测结果
    with torch.no_grad():
        preds_before = model(images.to(device)).argmax(dim=1)
        preds_after = model(adv_images.to(device)).argmax(dim=1)

    # 对比攻击前后是否预测正确
    success_before = (preds_before.cpu() == labels)
    success_after = (preds_after.cpu() == labels)
    # 仅统计那些本来能正确分类但被攻击失败的样本
    success_mask = success_before & (~success_after)
    # L2范数的扰动大小
    pert_norms = torch.norm((adv_images - images).view(images.size(0), -1), p=2, dim=1)
    
    # 指标计算
    accuracy_before = success_before.sum().item() / len(labels)
    accuracy_after = success_after.sum().item() / len(labels)
    attack_success_rate = 1.0 - accuracy_after
    
    # 只统计成功攻击的扰动平均值
    avg_perturbation = pert_norms[success_mask].mean().item()

    print(f"accuracy_before={accuracy_before:.2f} accuracy_after={accuracy_after:.2f} attack_success_rate={attack_success_rate:.2f} avg_perturbation={avg_perturbation:.2f}")

In [19]:
num_samples = 100
stats_Simple = evaluate_pgd(SimpleCNN, "M1.pth", test_loader, num_samples)
print(f"Simple")
stats_Deep = evaluate_pgd(DeepCNN, "M2.pth", test_loader, num_samples)
print(f"Deep")
stats_Defense = evaluate_pgd(DefenseCNN, "M3.pth", test_loader, num_samples)
print(f"Defense")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.95 attack_success_rate=0.05 avg_perturbation=1.40
Simple
accuracy_before=0.99 accuracy_after=0.99 attack_success_rate=0.01 avg_perturbation=nan
Deep
accuracy_before=1.00 accuracy_after=0.98 attack_success_rate=0.02 avg_perturbation=1.38
Defense


> C&W

In [13]:
def cw_loss_untargeted(Z, delta, true_labels, c=0.5, loss_func=None):
    # Z: 模型输出 logits
    # true_labels: 原始标签 y
    # delta: 扰动
    # loss_func: 用于约束扰动的大小（一般为 MSELoss）

    one_hot = F.one_hot(true_labels, num_classes=Z.shape[-1]).to(Z.device)
    Z_y = torch.sum(Z * one_hot, dim=-1)  # 原标签的 logit 值
    Z_not_y = torch.max(Z * (1 - one_hot), dim=-1)[0]  # 非原标签中的最大 logit 值

    # 与 Targeted 不同，这里我们希望原标签的 logit 更小，非原标签的 logit 更大
    margin = torch.clamp(Z_y - Z_not_y, min=0)  # 只有 Z_y > Z_not_y 才 penalize
    loss = loss_func(delta, torch.zeros_like(delta)) + c * margin.sum()
    return loss

In [15]:
def cw_attack_untargeted(model, imgs, true_labels, c=1, num_epoch=50, lr=1e-2):
    model.eval()
    copy_imgs = imgs.clone().to(device)
    
    # 将图像从 [0, 1] 映射到 [-1 + ε, 1 - ε] 再取 arctanh 得到变量 w（可优化变量）
    # 变换方式参考原始 CW 论文中提出的变量替换技巧（保证生成的图像落在合法范围内）
    w = torch.atanh(torch.clamp(copy_imgs * 2 - 1, min=-1, max=1))
    w = w.to(device).requires_grad_() # 设置为可优化变量
    optimizer = torch.optim.Adam([w], lr=lr) # 优化变量 w
    mseloss = torch.nn.MSELoss() # 用于计算扰动的最小二乘损失

    # 优化循环
    for epoch in range(num_epoch):
        optimizer.zero_grad()
        
        # 通过 tanh 逆变换将 w 变回图像空间，得到扰动 delta
        delta = 0.5 * (torch.tanh(w) + 1) - copy_imgs
        
        # 计算模型在对抗图像上的输出
        Z = model(copy_imgs + delta)
        
        # 计算 CW 无目标攻击的损失（包括分类置信度与扰动大小）
        loss = cw_loss_untargeted(Z, delta, true_labels, c=c, loss_func=mseloss)
        loss.backward() # 反向传播
        optimizer.step() # 更新变量 w

    # 最终计算对抗扰动并返回对抗图像与扰动量
    delta = 0.5 * (torch.tanh(w) + 1) - copy_imgs
    return (copy_imgs + delta).to(imgs.device).detach_(), delta.detach_().to(imgs.device)


In [17]:
#整体评估逻辑依然和上两种类似
def evaluate_cw(model_class, weight_path, data_loader, num_samples):
    model = model_class().to(device)
    model.load_state_dict(torch.load(weight_path, map_location=device))
    model.eval() #评估模式
    
    seed = 22051022
    random.seed(seed)
    all_imgs, all_labels = [], []
    for imgs, labels in data_loader:
        all_imgs.append(imgs)
        all_labels.append(labels)
    all_imgs = torch.cat(all_imgs)
    all_labels = torch.cat(all_labels)
    
    indices = random.sample(range(len(all_imgs)), num_samples)
    images = all_imgs[indices]
    labels = all_labels[indices]

    target_labels = (labels + 1) % 10  # 保证目标标签不同
	# 计算对抗样本和扰动
    adv_images, perturbation = cw_attack_untargeted(model, images, labels, c=1.0, num_epoch=50, lr=1e-2)


    with torch.no_grad():
        preds_before = model(images.to(device)).argmax(dim=1)
        preds_after = model(adv_images.to(device)).argmax(dim=1)

    success_before = (preds_before.cpu() == labels)
    success_after = (preds_after.cpu() == labels)
    success_mask = success_before & (~success_after)  # 攻击成功
    pert_norms = torch.norm((adv_images - images).view(images.size(0), -1), p=2, dim=1)
    
    accuracy_before = success_before.sum().item() / len(labels)
    accuracy_after = success_after.sum().item() / len(labels)
    attack_success_rate = 1.0 - accuracy_after
    avg_perturbation = pert_norms[success_mask].mean().item()

    print(f"accuracy_before={accuracy_before:.2f} accuracy_after={accuracy_after:.2f} attack_success_rate={attack_success_rate:.2f} avg_perturbation={avg_perturbation:.2f}")

In [18]:
num_samples = 100
stats_Simple = evaluate_cw(SimpleCNN, "M1.pth", test_loader, num_samples)
print(f"Simple")
stats_Deep = evaluate_cw(DeepCNN, "M2.pth", test_loader, num_samples)
print(f"Deep")
stats_Defense = evaluate_cw(DefenseCNN, "M3.pth", test_loader, num_samples)
print(f"Defense")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.90 attack_success_rate=0.10 avg_perturbation=25.80
Simple
accuracy_before=0.99 accuracy_after=0.91 attack_success_rate=0.09 avg_perturbation=25.49
Deep
accuracy_before=1.00 accuracy_after=0.96 attack_success_rate=0.04 avg_perturbation=25.59
Defense


### 步骤二：比较所有神经网络的鲁棒性和准确性
> FGSM

|神经网络|accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|SimpleCNN|0.99|0.94|0.06|2.80|
|DeepCNN|0.99|0.97|0.03|2.80|
|DefenseCNN|1.00|0.97|0.03|2.80|

> PGD Attack

|神经网络|accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|SimpleCNN|0.99|0.95|0.05|1.40|
|DeepCNN|0.99|0.99|0.01|nan|
|DefenseCNN|1.00|0.98|0.02|1.38|

> C&W

|神经网络|accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|SimpleCNN|0.99|0.90|0.10|25.80|
|DeepCNN|0.99|0.91|0.09|25.49|
|DefenseCNN|1.00|0.96|0.04|25.59|

> 由表格可见，高准确性并不一定意味着高鲁棒性
> 模型在不同攻击方法下鲁棒性表现不一致

### 步骤三：是否存在对 3 种攻击都鲁棒的模型
> 由实验结果可见，DefenseCNN对三种攻击的鲁棒性都很好。

# Task-2:深度模型的对抗训练
### 步骤一：选用三种对抗训练方法重新训练RobustCNN
> FGSM-based Adversarial Training

In [36]:
def fgsm_attack(model, images, labels, epsilon=0.25):
    images = images.clone().detach().to(images.device).requires_grad_(True)
    outputs = model(images)
    loss = nn.CrossEntropyLoss()(outputs, labels) # 计算交叉熵损失
    model.zero_grad()
    loss.backward() # 反向传播
    grad = images.grad.data # 获取图像的梯度
    adv_images = images + epsilon * grad.sign() # 生成对抗样本：在原图基础上添加梯度符号方向的扰动
    adv_images = torch.clamp(adv_images, -1, 1) # 将对抗图像限制在合法范围内，MNIST 通常归一化到 [-1, 1]
    return adv_images.detach()

In [37]:
def train_fgsm(model, train_loader, test_loader, epochs, device, model_name='FGSM_Model'):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, epochs + 1):
        model.train()
        progress_bar = tqdm(train_loader, desc=f"{model_name} Epoch {epoch}/{epochs}", ncols=100)
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)

            # FGSM 对抗样本生成
            adv_images = fgsm_attack(model, images, labels)

            # 合并干净样本与对抗样本
            mixed_images = torch.cat([images, adv_images], dim=0)
            mixed_labels = torch.cat([labels, labels], dim=0)

            optimizer.zero_grad()
            outputs = model(mixed_images)
            loss = criterion(outputs, mixed_labels)
            loss.backward()
            optimizer.step()

            progress_bar.set_postfix(loss=loss.item())

        # 测试精度
        evaluate_model(model, test_loader, device, model_name)

    torch.save(model.state_dict(), f"{model_name}.pth")

> PGD Adversarial Training

In [38]:
def pgd_attack(model, images, labels, eps=0.25, alpha=0.01, iters=40):
    images = images.clone().detach().to(images.device)
    ori_images = images.clone().detach()
    
    # 初始化扰动 delta，范围在 [-eps, eps] 之间的均匀分布
    delta = torch.zeros_like(images).uniform_(-eps, eps).to(images.device)
    delta.requires_grad = True

    # 开始迭代更新 delta
    for _ in range(iters):
        outputs = model(images + delta)
        loss = nn.CrossEntropyLoss()(outputs, labels) # 计算交叉熵损失
        loss.backward() # 反向传播，计算 delta 的梯度
        delta.data = (delta + alpha * delta.grad.sign()).clamp(-eps, eps) # 使用 FGSM 的方式更新扰动：沿梯度方向移动 alpha
        delta.grad.zero_() # 清除上一轮梯度，防止累积
    
    adv_images = torch.clamp(images + delta.detach(), -1, 1) # 最终对抗图像限制在合法图像值范围 [-1, 1]
    return adv_images


In [39]:
def train_pgd(model, train_loader, test_loader, epochs, device, model_name='PGD_Model'):
    model.to(device)
    criterion = nn.CrossEntropyLoss() # 定义交叉熵损失函数
    optimizer = optim.Adam(model.parameters(), lr=0.001) # 使用 Adam 优化器优化模型参数

    # 训练循环，共进行 epochs 个周期
    for epoch in range(1, epochs + 1):
        model.train()
        
        # 使用 tqdm 显示训练进度条
        progress_bar = tqdm(train_loader, desc=f"{model_name} Epoch {epoch}/{epochs}", ncols=100)
        
        # 遍历训练集中每个批次
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device) # 移动到计算设备

            # 使用 PGD 生成对抗样本
            adv_images = pgd_attack(model, images, labels, eps=0.25, alpha=0.01, iters=7)

            # 计算模型在对抗样本上的损失并更新参数
            optimizer.zero_grad()
            outputs = model(adv_images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            progress_bar.set_postfix(loss=loss.item())

        # 每轮结束后，在测试集上评估模型性能
        evaluate_model(model, test_loader, device, model_name)

    torch.save(model.state_dict(), f"{model_name}.pth")


> TRADES

In [40]:
def trades_loss(model, x_natural, y, optimizer, step_size=0.01, epsilon=0.25, perturb_steps=10, beta=6.0):
    # KL 散度损失函数，用于衡量自然样本与对抗样本之间预测分布的差异
    criterion_kl = nn.KLDivLoss(reduction='batchmean')
    model.eval()

    # 初始化对抗样本 x_adv：在自然样本基础上加入微小随机扰动
    x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural).to(x_natural.device)
    
    # 基于 KL 散度进行投影梯度下降（PGD）生成对抗样本
    for _ in range(perturb_steps):
        x_adv.requires_grad_()
        with torch.enable_grad():
            # 计算对抗样本与自然样本预测分布之间的 KL 散度
            loss_kl = criterion_kl(
                F.log_softmax(model(x_adv), dim=1), # 对抗样本的预测分布（log 概率）
                F.softmax(model(x_natural), dim=1) # 自然样本的预测分布（概率）
            )
        # 计算损失对对抗样本的梯度
        grad = torch.autograd.grad(loss_kl, [x_adv])[0]
        # 沿着梯度方向更新对抗样本，step_size 控制更新步长
        x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
        # 将扰动限制在 epsilon 范围内，保持与原图像接近（L∞ 约束）
        x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon)
        # 保证对抗样本值在合法图像范围（例如 MNIST 为 [-1, 1]）
        x_adv = torch.clamp(x_adv, -1.0, 1.0)

    model.train()
    x_adv = x_adv.detach()
    optimizer.zero_grad()

    # 自然样本交叉熵损失
    logits = model(x_natural)
    loss_natural = F.cross_entropy(logits, y)

    # 对抗鲁棒性损失：对抗样本与自然样本之间预测分布的 KL 散度
    loss_robust = criterion_kl(
        F.log_softmax(model(x_adv), dim=1), # 对抗样本预测（log 概率）
        F.softmax(logits, dim=1) # 自然样本预测（概率）
    )

    # 最终损失由自然损失 + beta 倍鲁棒性损失组成
    loss = loss_natural + beta * loss_robust
    return loss


In [41]:
def train_trades(model, train_loader, test_loader, epochs, device, model_name='TRADES_Model'):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, epochs + 1):
        model.train()
        progress_bar = tqdm(train_loader, desc=f"{model_name} Epoch {epoch}/{epochs}", ncols=100)
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            
            # 计算 TRADES 对抗训练的损失
            loss = trades_loss(model, images, labels, optimizer)
            # 反向传播计算梯度
            loss.backward()
            # 更新模型参数
            optimizer.step()
            # 更新进度条显示当前损失值
            progress_bar.set_postfix(loss=loss.item())

        # 每个 epoch 结束后，在测试集上评估模型表现
        evaluate_model(model, test_loader, device, model_name)

    torch.save(model.state_dict(), f"{model_name}.pth")


In [42]:
def evaluate_model(model, test_loader, device, name='Model'):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # 取最大值对应的类别索引作为预测结果
            _, predicted = torch.max(outputs.data, 1)
            # 更新总样本数
            total += labels.size(0)
            # 统计预测正确的样本数量
            correct += (predicted == labels).sum().item()

    acc = 100 * correct / total
    print(f"{name} Accuracy: {acc:.2f}%")


In [44]:
def test_all_models():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    epochs = 4  # 为鲁棒训练适当增加训练轮数

    # FGSM 对抗训练
    model_fgsm = SimpleCNN()
    train_fgsm(model_fgsm, train_loader, test_loader, epochs, device)

    # PGD 对抗训练
    model_pgd = SimpleCNN()
    train_pgd(model_pgd, train_loader, test_loader, epochs, device)

    # TRADES 对抗训练
    model_trades = SimpleCNN()
    train_trades(model_trades, train_loader, test_loader, epochs, device)

test_all_models()

FGSM_Model Epoch 1/4: 100%|████████████████████████████| 469/469 [00:19<00:00, 23.60it/s, loss=0.13]


FGSM_Model Accuracy: 98.07%


FGSM_Model Epoch 2/4: 100%|███████████████████████████| 469/469 [00:20<00:00, 22.59it/s, loss=0.186]


FGSM_Model Accuracy: 98.49%


FGSM_Model Epoch 3/4: 100%|███████████████████████████| 469/469 [00:20<00:00, 22.43it/s, loss=0.132]


FGSM_Model Accuracy: 98.78%


FGSM_Model Epoch 4/4: 100%|██████████████████████████| 469/469 [00:21<00:00, 21.97it/s, loss=0.0938]


FGSM_Model Accuracy: 98.86%


PGD_Model Epoch 1/4: 100%|████████████████████████████| 469/469 [00:32<00:00, 14.37it/s, loss=0.132]


PGD_Model Accuracy: 97.50%


PGD_Model Epoch 2/4: 100%|███████████████████████████| 469/469 [00:32<00:00, 14.40it/s, loss=0.0921]


PGD_Model Accuracy: 98.56%


PGD_Model Epoch 3/4: 100%|████████████████████████████| 469/469 [00:33<00:00, 14.21it/s, loss=0.119]


PGD_Model Accuracy: 98.95%


PGD_Model Epoch 4/4: 100%|███████████████████████████| 469/469 [00:32<00:00, 14.41it/s, loss=0.0208]


PGD_Model Accuracy: 98.99%


TRADES_Model Epoch 1/4: 100%|█████████████████████████| 469/469 [00:48<00:00,  9.62it/s, loss=0.236]


TRADES_Model Accuracy: 98.16%


TRADES_Model Epoch 2/4: 100%|████████████████████████| 469/469 [01:04<00:00,  7.27it/s, loss=0.0677]


TRADES_Model Accuracy: 98.61%


TRADES_Model Epoch 3/4: 100%|████████████████████████| 469/469 [01:06<00:00,  7.07it/s, loss=0.0799]


TRADES_Model Accuracy: 98.66%


TRADES_Model Epoch 4/4: 100%|█████████████████████████| 469/469 [01:09<00:00,  6.72it/s, loss=0.184]


TRADES_Model Accuracy: 98.84%


### 步骤二：使用FGSM、PGD Attack、C&W来评估模型的对抗鲁棒性
> FGSM

In [45]:
stats_fgsm_trained = evaluate_fgsm(SimpleCNN, "FGSM_Model.pth", test_loader, epsilon, num_samples)
print(f"fgsm")

stats_pgd_trained = evaluate_fgsm(SimpleCNN, "PGD_Model.pth", test_loader, epsilon, num_samples)
print(f"pgd")

stats_trades_trained = evaluate_fgsm(SimpleCNN, "TRADES_Model.pth", test_loader, epsilon, num_samples)
print(f"trades")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.98 attack_success_rate=0.02 avg_perturbation=2.80
fgsm
accuracy_before=0.99 accuracy_after=0.98 attack_success_rate=0.02 avg_perturbation=2.80
pgd
accuracy_before=0.98 accuracy_after=0.98 attack_success_rate=0.02 avg_perturbation=nan
trades


> PGD Attack

In [46]:
stats_fgsm_trained = evaluate_pgd(SimpleCNN, "FGSM_Model.pth", test_loader, num_samples)
print(f"fgsm")
stats_pgd_trained = evaluate_pgd(SimpleCNN, "PGD_Model.pth", test_loader, num_samples)
print(f"pgd")
stats_trades_trained = evaluate_pgd(SimpleCNN, "TRADES_Model.pth", test_loader, num_samples)
print(f"trades")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.99 attack_success_rate=0.01 avg_perturbation=nan
fgsm
accuracy_before=0.99 accuracy_after=0.99 attack_success_rate=0.01 avg_perturbation=nan
pgd
accuracy_before=0.98 accuracy_after=0.98 attack_success_rate=0.02 avg_perturbation=nan
trades


> C&W

In [48]:
stats_fgsm_trained = evaluate_cw(SimpleCNN, "FGSM_Model.pth", test_loader, num_samples)
print(f"fgsm")
stats_pgd_trained = evaluate_cw(SimpleCNN, "PGD_Model.pth", test_loader, num_samples)
print(f"pgd")
stats_trades_trained = evaluate_cw(SimpleCNN, "TRADES_Model.pth", test_loader, num_samples)
print(f"trades")

  model.load_state_dict(torch.load(weight_path, map_location=device))


accuracy_before=0.99 accuracy_after=0.72 attack_success_rate=0.28 avg_perturbation=25.99
fgsm
accuracy_before=0.99 accuracy_after=0.91 attack_success_rate=0.09 avg_perturbation=25.67
pgd
accuracy_before=0.98 accuracy_after=0.59 attack_success_rate=0.41 avg_perturbation=25.91
trades


### 步骤三：分析对抗训练对模型性能影响
> FGSM

| |accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|原训练|0.99|0.94|0.06|2.80|
|fgsm|0.99|0.98|0.02|2.80|
|pgd|0.99|0.98|0.02|2.80|
|trades|0.99|0.98|0.02|nan|

> PGD Attack

| |accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|原训练|0.99|0.95|0.05|1.40|
|fgsm|0.99|0.99|0.01|nan|
|pgd|0.99|0.99|0.01|nan|
|trades|0.98|0.98|0.02|nan|

> C&W

| |accuracy_before|accuracy_after|attack_success_rate|avg_perturbation|
|----|----|----|----|----|
|原训练|0.99|0.90|0.10|25.80|
|fgsm|0.99|0.72|0.28|25.99|
|pgd|0.99|0.91|0.09|25.67|
|trades|0.98|0.59|0.41|25.91|

> FGSM-based Adversarial Training 和 PGD Adversarial Training对鲁棒性均有轻微的提高，攻击成功率有所下降，但在C&W中仅有pgd实现了攻击成功率有所下降。
> 准确性并未降低。

### 步骤四：对于C&W攻击成功率上升的分析
> - 攻击方式差异  
> C&W（Carlini & Wagner）攻击与 FGSM / PGD 属于不同类型  

| 攻击类型     | 描述            | 优化目标        |
| -------- | ------------- | ----------- |
| FGSM | 基于梯度的“最大扰动”攻击 | 快速提升损失      |
| C\&W     | 基于优化的“最小扰动”攻击 | 最小扰动前提下改变分类 |

> - 对抗训练“过拟合”某种攻击  
> FGSM 对抗训练模型，对该类攻击增强鲁棒性，但对结构不同的攻击（如 C&W）泛化性差：  
> FGSM 对抗训练强调：让模型适应“朝着最大梯度方向”的扰动。  
> C&W 攻击并不一定沿最大梯度方向优化，它优化的 loss 是 margin-based（比如 ||δ|| + confidence margin）。  
> FGSM训练模型对精细扰动反而更脆弱，可能因为它只在大幅梯度方向训练，缺乏对精细扰动的判别能力；  

# Task-3:深度模型后门攻击鲁棒性分析
### 步骤一：选择BadNet Attack和Clean Lable Attack来评估SimpleCNN
> BadNet Attack

In [87]:
def add_badnet_trigger(image, trigger_size=3):
    image = image.clone()
    _, h, w = image.shape
    image[:, h - trigger_size:h, w - trigger_size:w] = 1.0  # 白色方块
    return image

def add_blend_trigger(image, alpha=0.2):
    image = image.clone()
    noise = torch.rand_like(image) * 2 - 1  # 生成[-1, 1]范围的噪声
    blended_image = (1 - alpha) * image + alpha * noise
    blended_image = torch.clamp(blended_image, -1, 1)
    return blended_image


In [88]:
def test_backdoor_attacks(model, test_dataset, device, attack_type='BadNet', target_label=0, num_samples=50):
    model.eval()
    model.to(device)

    # 随机选择 num_samples 个测试样本
    indices = random.sample(range(len(test_dataset)), num_samples)
    clean_images = []
    clean_labels = []
    triggered_images = []

    for idx in indices:
        image, label = test_dataset[idx]
        clean_images.append(image)
        clean_labels.append(label)

        # 添加触发器
        if attack_type == 'BadNet':
            triggered_image = add_badnet_trigger(image)
        elif attack_type == 'Blend':
            triggered_image = add_blend_trigger(image)
        else:
            raise ValueError("Unsupported attack type.")

        triggered_images.append(triggered_image)

    # 转换为张量
    clean_images = torch.stack(clean_images).to(device)
    clean_labels = torch.tensor(clean_labels).to(device)
    triggered_images = torch.stack(triggered_images).to(device)
    target_labels = torch.full((num_samples,), target_label, dtype=torch.long).to(device)

    # 评估干净样本准确率
    with torch.no_grad():
        outputs = model(clean_images)
        _, predicted = torch.max(outputs, 1)
        clean_accuracy = (predicted == clean_labels).float().mean().item()

    # 评估后门攻击成功率
    with torch.no_grad():
        outputs = model(triggered_images)
        _, predicted = torch.max(outputs, 1)
        asr = (predicted == target_labels).float().mean().item()

    print(f"[{attack_type} Attack] Clean Accuracy: {clean_accuracy * 100:.2f}%, ASR: {asr * 100:.2f}%")


In [90]:
# 加载模型
model = SimpleCNN()
model.load_state_dict(torch.load('FGSM_Model.pth'))  # 替换为您的模型路径

# 加载测试数据集
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])
test_dataset = torchvision.datasets.MNIST('dataset/mnist-pytorch', train=False, transform=transform)

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 评估 BadNet Attack
test_backdoor_attacks(model, test_dataset, device, attack_type='BadNet', target_label=0, num_samples=50)

# 评估 Blend Attack
test_backdoor_attacks(model, test_dataset, device, attack_type='Blend', target_label=0, num_samples=50)


[BadNet Attack] Clean Accuracy: 98.00%, ASR: 26.00%
[Blend Attack] Clean Accuracy: 98.00%, ASR: 12.00%


  model.load_state_dict(torch.load('FGSM_Model.pth'))  # 替换为您的模型路径


### 步骤二：探究对抗训练与非对抗训练面对后门攻击时的ASR

In [91]:
# 加载模型
model = SimpleCNN()
model.load_state_dict(torch.load('M1.pth'))  # 替换为您的模型路径

# 加载测试数据集
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])
test_dataset = torchvision.datasets.MNIST('dataset/mnist-pytorch', train=False, transform=transform)

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 评估 BadNet Attack
test_backdoor_attacks(model, test_dataset, device, attack_type='BadNet', target_label=0, num_samples=50)

# 评估 Blend Attack
test_backdoor_attacks(model, test_dataset, device, attack_type='Blend', target_label=0, num_samples=50)


[BadNet Attack] Clean Accuracy: 98.00%, ASR: 10.00%
[Blend Attack] Clean Accuracy: 98.00%, ASR: 4.00%


  model.load_state_dict(torch.load('M1.pth'))  # 替换为您的模型路径


> 显然，在非对抗训练时ASR更低，现分析可能原因：  
> - 后门攻击 ≠ 对抗攻击  
> 后门攻击（Backdoor Attack）：攻击者在训练阶段引入带触发器的图像，并强行将其标签设为目标类，模型在训练中“记住”了触发器 → label 的关联，这是一个训练时注入的漏洞。  
> 对抗攻击（Adversarial Attack）：攻击者在测试阶段，针对当前模型参数，添加细微扰动使预测错误，是基于模型梯度的泛化性漏洞。  
> 所以它们利用的模型脆弱性本质不同，一个是训练过程种下的逻辑漏洞，一个是测试时找漏洞。  
> - 对抗训练让模型更依赖“强特征”  
> 对抗训练倾向于增强模型对抗扰动的鲁棒性，常常会“压制”那些对预测不稳定的脆弱特征（weak features）。  
> 后门触发器是非常明显、强烈的图像特征（例如白块、强噪声等），比自然图像中微弱的特征更显眼。  
> 于是，对抗训练后的模型 更加依赖这些显著、可辨识的“强特征”，后门触发器恰好就属于这类更容易激发模型的“记忆”。  
> 总结一句话：对抗训练“压制自然特征、增强显著特征”，后门触发器被误判为“强特征”而更容易被识别。  
> - 对抗训练不会自动防御后门触发器  
> 对抗训练是为了防止：x + 小扰动 δ → 预测结果错误。它优化的是模型对自然样本的连续性。  
> 而 BadNet/Blend 等后门攻击是：加入非连续、明显的人造图案（甚至跨类别干扰）。攻击目标是训练阶段修改标签的那一批触发样本（非对抗扰动！）  
> 所以：对抗训练虽然提高了模型对自然扰动的鲁棒性，但对显式人为注入的 trigger pattern 并没有学习到“排斥”行为，反而可能强化了对其的响应。  