In [11]:
!pip install torch torchvision torchaudio



In [12]:
!pip install opacus

Collecting opacus
  Downloading opacus-1.5.2-py3-none-any.whl.metadata (7.9 kB)
Downloading opacus-1.5.2-py3-none-any.whl (239 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.9/239.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opacus
Successfully installed opacus-1.5.2


In [32]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by any library
# used further down are hidden as well.
warnings.simplefilter("ignore")

# --- Privacy budget and training length ---
# Fixed clipping norm, currently disabled:
#MAX_GRAD_NORM = 1.2
# Target epsilon handed to the privacy engine.
EPSILON = 3.0
# Target delta handed to the privacy engine and used when reporting epsilon.
DELTA = 1e-5
# Number of training epochs; also passed to the privacy engine.
EPOCHS = 25

# --- Adaptive-clipping settings read by the training cell ---
# Clip norm the training function starts from.
initial_clip_norm = 1.0
eta_C = 0.2  # Learning rate for updating the clipping norm
target_quantile = 0.5  # Target quantile, e.g., 0.5 for the median

# Learning rate given to the SGD optimizer.
LR = 0.5

In [33]:
# Batch size given to both the training and the test DataLoader.
BATCH_SIZE = 2048
# Upper bound on the physical batch size handed to BatchMemoryManager in train().
MAX_PHYSICAL_BATCH_SIZE = 128

In [34]:
import torch
import torchvision
import torchvision.transforms as transforms

# Per-channel normalization constants for CIFAR-10, treated as public knowledge.
# If necessary, they can be computed with modest privacy budgets.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD_DEV = (0.2023, 0.1994, 0.2010)

# Preprocessing shared by the train and test datasets:
# convert to a tensor, then normalize each channel with the constants above.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD_DEV),
])

In [35]:
from torchvision.datasets import CIFAR10

# Directory where torchvision stores / looks for the CIFAR-10 files.
DATA_ROOT = '../cifar10'

# Training split, downloaded if missing and preprocessed with `transform`.
train_dataset = CIFAR10(
    root=DATA_ROOT, train=True, download=True, transform=transform)

# No shuffle or sampler is configured here. This loader is later passed to the
# privacy engine, and the loader that call returns is the one used for training.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
)

# Held-out split with the same preprocessing.
test_dataset = CIFAR10(
    root=DATA_ROOT, train=False, download=True, transform=transform)

# Evaluation loader; iteration order is fixed (shuffle=False).
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Files already downloaded and verified
Files already downloaded and verified


### **ResNet-n from torch**

In [36]:
from torchvision import models

### **ResNet20 (with Adaptive Clipping)**

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [38]:
class WSConv2d(nn.Conv2d):
    """2-D convolution that standardizes its kernel on every forward pass.

    Each output filter is shifted to zero mean and divided by its standard
    deviation (both taken over the input-channel and spatial axes) before the
    convolution is applied. The stored ``weight`` parameter itself is not
    modified.
    """

    def forward(self, x):
        kernel = self.weight
        reduce_axes = [1, 2, 3]  # every axis except the output-channel axis

        centered = kernel - kernel.mean(dim=reduce_axes, keepdim=True)
        # Small constant keeps the division finite for near-constant filters.
        scale = kernel.std(dim=reduce_axes, keepdim=True) + 1e-5

        return F.conv2d(
            x,
            centered / scale,
            self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )

class BasicBlock(nn.Module):
    """Residual block: two 3x3 weight-standardized convs, each followed by GroupNorm.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the block.
        stride: stride of the first convolution (and of the projection shortcut).
        num_groups: group count for every GroupNorm in the block.
    """

    def __init__(self, in_channels, out_channels, stride=1, num_groups=32):
        super().__init__()

        # Main path, first half: 3x3 conv that applies `stride`, then GroupNorm
        # (GroupNorm is used for DP compatibility).
        self.conv1 = WSConv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.gn1 = nn.GroupNorm(num_groups, out_channels)

        # Main path, second half: 3x3 conv at stride 1, then GroupNorm.
        self.conv2 = WSConv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.gn2 = nn.GroupNorm(num_groups, out_channels)

        # Skip connection: identity when shapes already match, otherwise a
        # strided 1x1 conv + GroupNorm that projects the input to the new shape.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                WSConv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.GroupNorm(num_groups, out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        features = self.conv1(x)
        features = F.relu(self.gn1(features))
        features = self.conv2(features)
        features = self.gn2(features)
        # Residual add happens before the final activation.
        features += self.shortcut(x)
        return F.relu(features)

class ResNet20(nn.Module):
    """ResNet-20-style classifier built from weight-standardized convs and GroupNorm.

    Layout: a 3x3 stem (16 channels), three stages of three ``BasicBlock``s
    with 16 / 32 / 64 channels (the 2nd and 3rd stage downsample by 2), an
    8x8 average pool and a linear classifier.

    Args:
        num_classes: number of output logits.
        num_groups: requested group count for the stem GroupNorm only. The
            residual stages use fixed group counts (4, 8 and 16). If the
            requested value does not divide the 16 stem channels (as the
            default 32 does not), the greatest common divisor is used instead
            so that construction does not fail.
    """

    def __init__(self, num_classes=10, num_groups=32):
        import math  # only needed for the stem group-count fallback below

        super().__init__()
        self.in_channels = 16

        # Stem: weight-standardized 3x3 conv followed by GroupNorm (DP compatible).
        self.conv1 = WSConv2d(3, self.in_channels, kernel_size=3, stride=1, padding=1, bias=False)
        # GroupNorm requires num_channels % num_groups == 0. For every value
        # that already divides 16 the gcd equals num_groups, so valid
        # configurations are unchanged.
        stem_groups = math.gcd(num_groups, self.in_channels)
        self.gn1 = nn.GroupNorm(stem_groups, self.in_channels)

        # Residual stages.
        self.layer1 = self._make_layer(16, 3, stride=1, num_groups=4)   # 4 groups for 16 channels
        self.layer2 = self._make_layer(32, 3, stride=2, num_groups=8)   # 8 groups for 32 channels
        self.layer3 = self._make_layer(64, 3, stride=2, num_groups=16)  # 16 groups for 64 channels

        # Classifier head.
        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride, num_groups):
        """Stack ``num_blocks`` BasicBlocks and advance ``self.in_channels``."""
        layers = []
        for _ in range(num_blocks):
            layers.append(BasicBlock(self.in_channels, out_channels, stride, num_groups=num_groups))
            self.in_channels = out_channels
            stride = 1  # Only the first block of a stage uses the given stride; the rest use stride 1
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.gn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        # Kernel of 8 collapses the 8x8 map that a 32x32 input produces after
        # the two stride-2 stages.
        out = F.avg_pool2d(out, 8)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out

    def calculate_avg_grad_norm(model):
        """Root-mean-square of the per-parameter gradient L2 norms.

        Defined inside the class, so when called on an instance the first
        argument (named ``model``) is the instance itself. Parameters whose
        ``.grad`` is ``None`` are skipped. Returns ``0.0`` when no parameter
        has a gradient (instead of dividing by zero).
        """
        squared_norms = [
            p.grad.detach().norm(2).item() ** 2
            for p in model.parameters()
            if p.grad is not None
        ]
        if not squared_norms:
            return 0.0
        return (sum(squared_norms) / len(squared_norms)) ** 0.5

### Model Assignment

In [39]:
# Alternative kept for reference: torchvision's ResNet-18.
#model = models.resnet18(num_classes=10)
# num_groups only sets the stem GroupNorm here; the residual stages of
# ResNet20 hard-code 4 / 8 / 16 groups.
model = ResNet20(num_classes=10, num_groups=4)

In [40]:
from opacus.validators import ModuleValidator

# Ask Opacus' validator for problems with the model; with strict=False the
# result is inspected below rather than raised.
errors = ModuleValidator.validate(model, strict=False)
# Show at most the last five entries (the recorded run printed an empty list).
errors[-5:]

[]

In [41]:
# Let the validator rewrite the model, then validate again.
# NOTE(review): presumably a no-op here, since the previous check reported nothing.
model = ModuleValidator.fix(model)
ModuleValidator.validate(model, strict=False)

[]

In [42]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the selected device.
model = model.to(device)

In [43]:
import torch.nn as nn
import torch.optim as optim

# NOTE: train() and test() each create their own CrossEntropyLoss, so this
# module-level criterion is not the one they use.
criterion = nn.CrossEntropyLoss()
# SGD with momentum; this optimizer is handed to the privacy engine in a later cell.
optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)

In [44]:
def accuracy(preds, labels):
    """Fraction of positions at which ``preds`` equals ``labels``.

    The inputs are compared element-wise and the mean of the resulting
    boolean array is returned.
    """
    matches = preds == labels
    return matches.mean()

In [45]:
from opacus import PrivacyEngine

privacy_engine = PrivacyEngine()

# Wrap model, optimizer and loader for DP-SGD. The engine picks the noise
# multiplier so that EPOCHS epochs of training stay within (EPSILON, DELTA).
# NOTE(review): this cell reassigns its own inputs, so it is meant to be run
# once per fresh kernel.
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    epochs=EPOCHS,
    target_epsilon=EPSILON,
    target_delta=DELTA,
    # Per-sample clipping norm C. Taken from the config cell rather than a
    # second hard-coded literal so the two values cannot drift apart.
    # Nothing in this notebook changes the optimizer's clipping norm after
    # this call, so C stays at this value for the whole run.
    max_grad_norm=initial_clip_norm,
)

# Report the values the optimizer actually holds instead of repeating literals.
print(f"Using sigma={optimizer.noise_multiplier} and C={optimizer.max_grad_norm}")

Using sigma=1.61376953125 and C=1.0


In [46]:
def adaptive_clipping(grad_norms, clip_norm, target_quantile, eta_C):
    """Geometric update of a clipping norm towards a target quantile.

    Computes ``clip_norm * exp(-eta_C * (b - target_quantile))`` where ``b``
    is the fraction of ``grad_norms`` that are less than or equal to
    ``clip_norm`` (i.e. the fraction that would NOT be clipped). The norm
    shrinks when more than ``target_quantile`` of the norms already fit under
    it and grows otherwise.

    Args:
        grad_norms: array-like of gradient norms (NumPy array, list, ...).
        clip_norm: current clipping norm.
        target_quantile: desired value of ``b``, e.g. 0.5 for the median.
        eta_C: learning rate of the geometric update.

    Returns:
        The updated clipping norm. If ``grad_norms`` is empty there is nothing
        to estimate the quantile from, so ``clip_norm`` is returned unchanged
        (instead of propagating NaN).
    """
    norms = np.asarray(grad_norms, dtype=float)
    if norms.size == 0:
        return clip_norm

    # Fraction of norms at or below the current clip norm.
    unclipped_fraction = np.mean(norms <= clip_norm)

    return clip_norm * np.exp(-eta_C * (unclipped_fraction - target_quantile))

def calculate_grad_norms(model):
    """Collect the L2 norm of ``.grad`` for every parameter tensor of ``model``.

    Parameters whose ``.grad`` is ``None`` are skipped. The result is a 1-D
    NumPy array with one entry per remaining parameter tensor, in the order
    yielded by ``model.parameters()``.
    """
    per_param_norms = [
        param.grad.data.norm(2).item()
        for param in model.parameters()
        if param.grad is not None
    ]
    return np.array(per_param_norms)

In [47]:
import numpy as np
from opacus.utils.batch_memory_manager import BatchMemoryManager

def train(model, train_loader, optimizer, epoch, device):
    """Run one epoch of DP-SGD training and periodically print running metrics.

    Args:
        model: the model returned by the privacy engine.
        train_loader: the data loader returned by the privacy engine.
        optimizer: the DP optimizer returned by the privacy engine.
        epoch: epoch number, used only in the progress message.
        device: device the batches are moved to.

    Reads the module-level names ``MAX_PHYSICAL_BATCH_SIZE``, ``accuracy``,
    ``privacy_engine`` and ``DELTA``.

    Note on gradient clipping:
        Clipping is performed by the Opacus optimizer inside ``step()``: it
        clips the per-sample gradients to ``optimizer.max_grad_norm`` and then
        overwrites ``p.grad`` with the noised sum. An earlier version of this
        loop additionally measured per-parameter norms of ``p.grad`` after
        ``backward()`` and called ``torch.nn.utils.clip_grad_norm_`` with an
        "adaptive" norm. Because ``step()`` replaces ``p.grad``, that call had
        no effect on the update (and the norms it looked at were not
        per-sample norms), while forcing one GPU sync per parameter per step.
        It has been removed; the model updates are unchanged.
        NOTE(review): a real adaptive clipping norm has to be driven through
        the DP optimizer with a privately estimated quantile (Opacus ships an
        adaptive-clipping optimizer for this) - confirm the exact API before
        wiring it in.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    losses = []
    top1_acc = []

    # BatchMemoryManager splits each logical batch into physical batches of at
    # most MAX_PHYSICAL_BATCH_SIZE samples.
    with BatchMemoryManager(
        data_loader=train_loader,
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE,
        optimizer=optimizer
    ) as memory_safe_data_loader:

        for i, (images, target) in enumerate(memory_safe_data_loader):
            optimizer.zero_grad()
            images = images.to(device)
            target = target.to(device)

            # Forward pass
            output = model(images)
            loss = criterion(output, target)

            # Running training metrics (per physical batch)
            preds = np.argmax(output.detach().cpu().numpy(), axis=1)
            labels = target.detach().cpu().numpy()
            acc = accuracy(preds, labels)

            losses.append(loss.item())
            top1_acc.append(acc)

            loss.backward()

            # Per-sample clipping and noise addition happen inside step().
            optimizer.step()

            if (i+1) % 200 == 0:
                epsilon = privacy_engine.get_epsilon(DELTA)
                print(
                    f"\tTrain Epoch: {epoch} \t"
                    f"Loss: {np.mean(losses):.6f} "
                    f"Acc@1: {np.mean(top1_acc) * 100:.6f} "
                    f"(ε = {epsilon:.2f}, δ = {DELTA})"
                )

In [48]:
def test(model, test_loader, device):
    model.eval()
    criterion = nn.CrossEntropyLoss()
    losses = []
    top1_acc = []

    with torch.no_grad():
        for images, target in test_loader:
            images = images.to(device)
            target = target.to(device)

            output = model(images)
            loss = criterion(output, target)
            preds = np.argmax(output.detach().cpu().numpy(), axis=1)
            labels = target.detach().cpu().numpy()
            acc = accuracy(preds, labels)

            losses.append(loss.item())
            top1_acc.append(acc)

    top1_avg = np.mean(top1_acc)

    print(
        f"\tTest set:"
        f"Loss: {np.mean(losses):.6f} "
        f"Acc: {top1_avg * 100:.6f} "
    )
    return np.mean(top1_acc)

In [49]:
from tqdm.notebook import tqdm

# One train() call per epoch; the epoch number passed on is 1-based and only
# used in the progress messages.
for epoch in tqdm(range(EPOCHS), desc="Epoch", unit="epoch"):
    train(model, train_loader, optimizer, epoch + 1, device)

Epoch:   0%|          | 0/25 [00:00<?, ?epoch/s]

	Train Epoch: 1 	Loss: 2.297433 Acc@1: 13.400816 (ε = 0.49, δ = 1e-05)
	Train Epoch: 1 	Loss: 2.154730 Acc@1: 17.917642 (ε = 0.63, δ = 1e-05)
	Train Epoch: 2 	Loss: 1.902019 Acc@1: 27.912496 (ε = 0.75, δ = 1e-05)
	Train Epoch: 2 	Loss: 1.852937 Acc@1: 30.406447 (ε = 0.85, δ = 1e-05)
	Train Epoch: 3 	Loss: 1.735654 Acc@1: 36.565023 (ε = 0.95, δ = 1e-05)
	Train Epoch: 3 	Loss: 1.708474 Acc@1: 38.004496 (ε = 1.03, δ = 1e-05)
	Train Epoch: 4 	Loss: 1.639327 Acc@1: 42.634351 (ε = 1.11, δ = 1e-05)
	Train Epoch: 4 	Loss: 1.633236 Acc@1: 43.369347 (ε = 1.18, δ = 1e-05)
	Train Epoch: 5 	Loss: 1.625932 Acc@1: 45.815433 (ε = 1.25, δ = 1e-05)
	Train Epoch: 5 	Loss: 1.631425 Acc@1: 46.102262 (ε = 1.31, δ = 1e-05)
	Train Epoch: 6 	Loss: 1.660199 Acc@1: 47.086926 (ε = 1.38, δ = 1e-05)
	Train Epoch: 6 	Loss: 1.642528 Acc@1: 47.899317 (ε = 1.43, δ = 1e-05)
	Train Epoch: 7 	Loss: 1.656604 Acc@1: 49.432043 (ε = 1.49, δ = 1e-05)
	Train Epoch: 7 	Loss: 1.635869 Acc@1: 50.346171 (ε = 1.55, δ = 1e-05)
	Train

In [50]:
# Final evaluation on the test loader; test() prints loss/accuracy and returns the accuracy.
top1_acc = test(model, test_loader, device)

	Test set:Loss: 1.507198 Acc: 60.978637 
