Structure of this file:
- Train a new VGG11 model
- Explore the model's per-layer sensitivity based on various pruning rates

### Ⅰ、前置工作

In [1]:
import os
from torchvision import models
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from copy import deepcopy
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.optim as optim

In [2]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 2. Mount Google Drive (if it is not mounted yet)
# NOTE(review): nothing is mounted in this cell; drive.mount() is called in later cells.

print(device)

cuda


In [3]:
import torchvision
import torchvision.transforms as transforms

# Load the CIFAR-10 dataset
batch_size = 64

# Training pipeline: random 32x32 crop after 4-pixel padding, random horizontal
# flip, tensor conversion, then per-channel normalization.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation pipeline: no augmentation, same normalization constants as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Training split, reshuffled every epoch.
# NOTE(review): download=True presumably fetches the archive into `root` only
# when it is missing -- confirm against the torchvision documentation.
trainset = torchvision.datasets.CIFAR10(root='/bohr/cifar10-h7hf/v2', train=True, download=True, transform=transform_train)
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

# Test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(root='/bohr/cifar10-h7hf/v2', train=False, download=True, transform=transform_test)
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False)
n_test = len(testset)  # number of samples in the test split
# NOTE(review): testset[0] is a whole dataset item (presumably an (image, label)
# pair) and the parentheses are redundant -- confirm before feeding it to a model.
example_input = (testset[0])

100%|██████████| 170M/170M [00:03<00:00, 43.5MB/s]


In [4]:
# Layer recipe: an integer is the output width of a conv block, 'M' is a max-pool.
vgg11_cfg = [64, 'M', 128, 'M', 256, 'M', 512, 'M', 512, 'M']


class VGG11(nn.Module):
    """VGG-style CNN built from ``vgg11_cfg`` with a 10-way linear classifier.

    The recipe holds five convolution blocks, each followed by a 2x2 max-pool.
    Five poolings reduce a 32x32 input to 1x1, so the flattened feature vector
    has the 512 entries that the classifier expects.
    """

    def __init__(self):
        super().__init__()
        self.features = self._make_layers(vgg11_cfg)
        self.classifier = nn.Linear(512, 10)  # CIFAR-10 has 10 classes

    def forward(self, x):
        """Return class logits of shape (batch, 10) for an image batch ``x``."""
        feature_maps = self.features(x)
        flattened = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flattened)

    def _make_layers(self, cfg):
        """Translate the recipe ``cfg`` into an ``nn.Sequential`` feature stack.

        Each integer entry appends Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU
        with that many output channels; each ``'M'`` appends a 2x2 max-pool.
        A 1x1 average pool (an identity on the values) closes the stack.
        """
        modules = []
        prev_channels = 3  # RGB input
        for spec in cfg:
            if spec != 'M':
                modules.extend((
                    nn.Conv2d(prev_channels, spec, kernel_size=3, padding=1),
                    nn.BatchNorm2d(spec),
                    nn.ReLU(inplace=True),
                ))
                prev_channels = spec
            else:
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        modules.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*modules)

In [6]:
# Second time : run
import time
def test(model, test_loader, debug = False):
    """Measure top-1 accuracy of ``model`` over ``test_loader`` and time the pass.

    The model is switched to eval mode and moved to the module-level ``device``;
    both changes persist after the call.

    Args:
        model: network whose output has one score per class along dim 1.
        test_loader: iterable yielding ``(inputs, labels)`` batches.
        debug: when True, print the accuracy and the elapsed time.

    Returns:
        ``(accuracy, time_cost)``: accuracy in percent (0.0 when the loader
        yields no samples) and the duration of the evaluation loop in seconds.
    """
    model.eval()
    correct = 0
    total = 0
    model.to(device)
    with torch.no_grad():
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        end_time = time.perf_counter()
    # Guard against an empty loader instead of raising ZeroDivisionError.
    accuracy = 100 * correct / total if total else 0.0
    time_cost = end_time - start_time
    if debug:
        # Report the number of samples actually evaluated rather than a global
        # dataset size that may not belong to this loader.
        print('Accuracy of the network on the %d test images: %.2f %%' % (total, accuracy))
        print('Time cost: %.2f s' % time_cost)
    return accuracy, time_cost

### Ⅱ、训练

In [None]:
# ---------- 1. Initialize the model ----------
model = VGG11().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9, weight_decay=5e-4)
# NOTE(review): step_size equals EPOCHS below, so the decay only fires after the
# last epoch and the learning rate stays at 0.05 for the whole run -- confirm
# whether a shorter step_size was intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

# ---------- 2. Checkpoint destination ----------
# Mount Google Drive once, before training starts, instead of re-importing and
# re-mounting inside the epoch loop on every accuracy improvement. A mount
# failure now surfaces immediately rather than after the first epoch.
from google.colab import drive
drive.mount('/content/drive')
save_path = "/content/drive/MyDrive/第二次尝试/Independent_VGG11_CIFAR10.pth"
# Make sure the target folder exists so torch.save cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# ---------- 3. Train ----------
EPOCHS = 50
best_acc = 0.0

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0  # sum of the per-batch losses of this epoch (not an average)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    scheduler.step()

    # ---------- 4. Evaluate after every epoch ----------
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    acc = 100. * correct / total
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {running_loss:.3f} - Test Acc: {acc:.2f}%")

    # Keep only the best-scoring weights.
    # NOTE(review): the checkpoint is selected on the test split, so best_acc is
    # an optimistic estimate; a held-out validation split would avoid that.
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), save_path)

print("Training complete. Best accuracy: {:.2f}%".format(best_acc))

In [5]:
# Mount Google Drive so that the saved checkpoint file is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Rebuild the architecture, then load the saved weights; map_location places
# the loaded tensors on the CPU.
model = VGG11()
model.load_state_dict(torch.load('/content/drive/MyDrive/第二次尝试/Independent_VGG11_CIFAR10.pth', map_location=torch.device('cpu')))

Mounted at /content/drive


<All keys matched successfully>

In [7]:
# Evaluate the reloaded model on the test loader and keep its accuracy (in
# percent) as baseline_acc.
acc,time_cost = test(model,test_loader)
baseline_acc = acc
print(baseline_acc)

83.43


### Ⅲ、利用prune去分析层敏感度

In [8]:
# Collect a (name, module) pair for every Conv2d found in model.features.
conv_layers = [(name, m) for name, m in model.features.named_modules() if isinstance(m, nn.Conv2d)]
# Bare expression: the notebook displays the list as the cell output.
conv_layers

[('0', Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('4', Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('8', Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('12', Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('16', Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))]

In [9]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from copy import deepcopy

def compute_layer_sensitivities_earlystop(
    model,
    prune_rates,
    device,
    testloader,
    test_fn,
    threshold=12.0,   # stop a layer once accuracy drops by more than this many points
    verbose=True,
    baseline_acc=None,
):
    """Probe each conv layer's tolerance to structured pruning, with early stop.

    For every ``nn.Conv2d`` in ``model.features`` the rates in ``prune_rates``
    are tried one after another. Each trial prunes that single layer in a fresh
    copy of the model (L2-norm structured pruning along dim 0 of the weight,
    i.e. whole output channels) and measures accuracy with ``test_fn``. As soon
    as the accuracy drop relative to the baseline exceeds ``threshold``, the
    layer's sweep stops and the last rate that stayed within the threshold is
    recorded as its sensitivity. ``model`` itself is never modified.

    Args:
        model: network exposing a ``features`` container that holds the
            convolution layers.
        prune_rates: iterable of pruning amounts to try. They are evaluated in
            the order given, so "last safe rate" is only meaningful when the
            rates are ascending.
        device: device on which the working copies are placed.
        testloader: passed through unchanged to ``test_fn``.
        test_fn: callable ``test_fn(model, testloader)`` whose first return
            element is the accuracy in percent.
        threshold: largest tolerated accuracy drop, in percentage points.
        verbose: print per-rate progress and per-layer summaries.
        baseline_acc: optional known accuracy of the unpruned model. When
            None (default) it is measured with ``test_fn``.

    Returns:
        dict mapping the conv layer's index (in discovery order) to:
          {
            'layer_name': str,                  # name inside model.features
            'baseline_acc': float,
            'chosen_sensitivity_rate': float,   # last rate within the threshold
                                                # (0.0 if even the first exceeded it)
            'early_stopped': bool,              # True if the sweep was cut short
            'per_rate': {rate: {'acc': float, 'delta_acc': float}},
          }
    """
    # 1) Baseline: measure the unpruned copy instead of assuming a fixed value,
    #    so every delta_acc refers to the model that was actually passed in.
    base_model = deepcopy(model).to(device).eval()
    if baseline_acc is None:
        with torch.no_grad():
            baseline_acc = test_fn(base_model, testloader)[0]
    if verbose:
        print(f"[Baseline] Accuracy: {baseline_acc:.2f}%")

    # 2) Locate the convolution layers.
    conv_layers = [(name, m) for name, m in base_model.features.named_modules() if isinstance(m, nn.Conv2d)]
    results = {}

    # 3) Evaluate the layers one at a time.
    for i, (layer_name, _) in enumerate(conv_layers):
        per_rate_stats = {}
        last_safe_rate = 0.0
        early_stopped = False

        if verbose:
            print(f"\n[Layer {i}] -> {layer_name}")

        for rate in prune_rates:
            # Fresh copy per trial, so pruning never accumulates across rates.
            temp_model = deepcopy(base_model).to(device).eval()

            # Prune only the layer under test.
            target_layer = dict(temp_model.features.named_modules())[layer_name]
            prune.ln_structured(target_layer, name='weight', amount=rate, n=2, dim=0)

            # Measure accuracy of the pruned copy.
            with torch.no_grad():
                acc = test_fn(temp_model, testloader)[0]
            delta_acc = baseline_acc - acc  # drop relative to the baseline

            per_rate_stats[rate] = {'acc': acc, 'delta_acc': delta_acc}

            if verbose:
                # safe_until shows the last safe rate *before* this trial is judged.
                print(f"  rate={rate:.2f} -> acc={acc:.2f}% | Δacc={delta_acc:.2f}% "
                      f"| safe_until={last_safe_rate:.2f}")

            if delta_acc > threshold:
                # Over the limit: stop this layer and keep the previous safe rate.
                early_stopped = True
                if verbose:
                    print(f"  -> Δacc {delta_acc:.2f}% > {threshold:.2f}%: early stop. "
                          f"Chosen sensitivity = {last_safe_rate:.2f}")
                break
            else:
                # Still within the limit: this rate becomes the new safe rate.
                last_safe_rate = rate

        # If the limit was never exceeded, this is the last rate that was evaluated.
        chosen_sensitivity_rate = last_safe_rate

        results[i] = {
            'layer_name': layer_name,
            'baseline_acc': baseline_acc,
            'chosen_sensitivity_rate': chosen_sensitivity_rate,
            'early_stopped': early_stopped,
            'per_rate': per_rate_stats
        }

        if verbose:
            print(f"[Layer {i} Summary] chosen_sensitivity_rate = {chosen_sensitivity_rate:.2f}, "
                  f"early_stopped={early_stopped}")

    return results


In [10]:
# Independent copy of the loaded model; the sensitivity sweep is given this copy.
basemodel = deepcopy(model)

In [11]:
# Pruning rates tried for every convolution layer, listed in ascending order.
prune_rates = [0.1, 0.3, 0.5, 0.7, 0.9]
results = compute_layer_sensitivities_earlystop(
    model=basemodel,
    prune_rates=prune_rates,
    device=device,
    testloader=test_loader,
    test_fn=test,
    threshold=12.0,
    verbose=True
)

# Read out the sensitivity chosen for layer 0
print("-----Sensitivity in layer0-----")

print(results[0]['layer_name'], results[0]['chosen_sensitivity_rate'])


[Baseline] Accuracy: 83.43%

[Layer 0] -> 0
  rate=0.10 -> acc=83.47% | Δacc=-0.04% | safe_until=0.00
  rate=0.30 -> acc=82.69% | Δacc=0.74% | safe_until=0.10
  rate=0.50 -> acc=80.54% | Δacc=2.89% | safe_until=0.30
  rate=0.70 -> acc=53.36% | Δacc=30.07% | safe_until=0.50
  -> Δacc 30.07% > 12.00%: early stop. Chosen sensitivity = 0.50
[Layer 0 Summary] chosen_sensitivity_rate = 0.50, early_stopped=True

[Layer 1] -> 4
  rate=0.10 -> acc=81.38% | Δacc=2.05% | safe_until=0.00
  rate=0.30 -> acc=51.62% | Δacc=31.81% | safe_until=0.10
  -> Δacc 31.81% > 12.00%: early stop. Chosen sensitivity = 0.10
[Layer 1 Summary] chosen_sensitivity_rate = 0.10, early_stopped=True

[Layer 2] -> 8
  rate=0.10 -> acc=81.86% | Δacc=1.57% | safe_until=0.00
  rate=0.30 -> acc=76.85% | Δacc=6.58% | safe_until=0.10
  rate=0.50 -> acc=43.76% | Δacc=39.67% | safe_until=0.30
  -> Δacc 39.67% > 12.00%: early stop. Chosen sensitivity = 0.30
[Layer 2 Summary] chosen_sensitivity_rate = 0.30, early_stopped=True

[Lay

### 【Conclusion】
从最终的结果可以看到模型的前几层是十分敏感的；

利用这个思路得到模型的层敏感度之后，我们开始：
- 全局非结构化剪枝
- 迭代非结构化剪枝
- 结构化剪枝
