In [None]:
import torch
import numpy as np
from torch import nn
from torch import optim

from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor

# True when a CUDA device is available; the cells below use this flag to decide
# whether the model and the data batches are moved to the GPU.
cuda = torch.cuda.is_available()

In [None]:
def evaluate(model, dataloader, num_batches=None):
    """Evaluate ``model`` on ``dataloader`` and return its accuracy.

    Args:
        model: Module mapping a batch of images to per-class scores; the
            predicted class is the argmax over dimension 1.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        num_batches: Optional maximum number of batches to evaluate.  ``None``
            (or ``0``) evaluates the whole dataloader.

    Returns:
        The fraction of correctly classified examples as a Python float in
        ``[0, 1]``.  Every example carries the same weight, even when the
        last batch is smaller than the others.  Returns ``0.0`` when the
        dataloader yields no examples.
    """
    model.eval()

    # Evaluate on whichever device the model lives on, so this function does
    # not depend on a notebook-global flag.  A model without parameters leaves
    # the batches where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct = 0
    total = 0
    with torch.no_grad():
        for b, (images, labels) in enumerate(dataloader):
            if device is not None:
                images = images.to(device)
                labels = labels.to(device)

            predictions = model(images).argmax(dim=1)
            # Count examples rather than averaging per-batch accuracies, which
            # would over-weight a short final batch.
            correct += (predictions == labels).sum().item()
            total += labels.numel()

            # ``b`` is zero-based: stop once exactly ``num_batches`` batches
            # have been processed.
            if num_batches and b + 1 >= num_batches:
                break

    return correct / total if total else 0.0

In [None]:
def build_model():
    """Create a freshly initialised convolutional classifier.

    The network stacks two convolution + ReLU stages (with a max-pool after
    the first), flattens the result and feeds it through two linear layers.
    It ends in a log-softmax over 10 classes.  For 1x28x28 inputs the
    flattened feature vector has 128 entries, which matches the first linear
    layer.

    Returns:
        The ``nn.Sequential`` model, moved to the GPU when the module-level
        ``cuda`` flag is set.
    """
    layers = [
        nn.Conv2d(in_channels=1, out_channels=16, kernel_size=8, stride=2, padding=4),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2),
        nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(in_features=128, out_features=32),
        nn.ReLU(),
        nn.Linear(in_features=32, out_features=10),
        nn.LogSoftmax(dim=1),
    ]
    net = nn.Sequential(*layers)
    if cuda:
        return net.cuda()
    return net

We load the MNIST train and test datasets.

In [None]:
# ToTensor is the only preprocessing step applied to each image.
trafo = Compose([ToTensor()])

# Training split, stored under data/mnist and downloaded if needed.
trainset = MNIST(root='data/mnist', train=True, download=True, transform=trafo)
print('train set size:', len(trainset))

# Test split, preprocessed the same way as the training split.
testset = MNIST(root='data/mnist', train=False, download=True, transform=trafo)
print('test set size:', len(testset))

We train for `epochs` epochs and with minibatches of size `batch_size`.

In [None]:
# Number of full passes over the training loader.
epochs = 15
# Number of examples per training minibatch (used to build the train loader
# and passed to the privacy accounting).
batch_size = 250

Let's build the data loaders.

In [None]:
# Training batches: `batch_size` examples each, drawn in shuffled order.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset, batch_size=batch_size, shuffle=True, num_workers=2)

# Test batches: fixed order, 512 examples per batch.
testloader = torch.utils.data.DataLoader(
    dataset=testset, batch_size=512, shuffle=False, num_workers=2)

These are the hyperparameters we set for differentially private (DP) learning.

In [None]:
# Maximum L2 norm of each per-example gradient (the max-norm argument of
# clip_grad_norm_ in the training loop).
l2_norm_clip = 1.5
# Multiplied with l2_norm_clip to set the standard deviation of the Gaussian
# noise added to the averaged gradient.
noise_multiplier = 1.3
# Step size of the SGD optimizer.
learning_rate = 0.25
# Delta value handed to the privacy accounting (compute_dp_sgd_privacy).
delta = 1e-5

In [None]:
model = build_model()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

# One zero-initialised accumulator per trainable parameter, keyed by parameter
# name.  Within a minibatch each accumulator collects the sum of the clipped
# per-example gradients of its parameter.
gradients = {n: torch.zeros_like(p) for n, p in model.named_parameters()}

for epoch in range(epochs):
    print("Starting epoch", epoch)
    running_loss = 0.0
    for b, (images, labels) in enumerate(trainloader):
        # evaluate() below switches the model to eval mode, so switch back
        # to train mode at the start of every minibatch.
        model.train()
        if cuda:
            images, labels = images.cuda(), labels.cuda()
        num_examples = images.shape[0]

        # Reset the accumulators before processing this minibatch.
        for accumulated in gradients.values():
            accumulated.zero_()

        # Process the minibatch one example at a time (microbatch size 1):
        # compute the example's gradient, clip it to an L2 norm of at most
        # l2_norm_clip and add it to the accumulators.  No noise is added
        # here; that happens once per minibatch further down.
        batch_loss = 0.0
        for image, label in zip(images, labels):
            logit = model(image.unsqueeze(0))
            loss = criterion(logit, label.unsqueeze(0))
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(
                model.parameters(), max_norm=l2_norm_clip, norm_type=2)
            for name, param in model.named_parameters():
                gradients[name].add_(param.grad)

            batch_loss += loss.item()

        batch_loss /= num_examples
        # Running mean of the per-batch losses seen so far in this epoch.
        running_loss = (b * running_loss + batch_loss) / (b + 1)

        # The optimizer only sees param.grad, so we overwrite it with the mean
        # of the clipped per-example gradients plus Gaussian noise and then
        # take a regular optimizer step.
        # Noise is added only once, here, because the independently sampled
        # per-example noise vectors add up to a scaled random perturbation.

        # Here we do it the wrong way to reproduce the results in
        # "github.com/tensorflow/privacy/tutorials/Classification_Privacy.ipynb"
        sigma = l2_norm_clip * noise_multiplier / num_examples

        # Uncomment the following line to do it the right way:

        # sigma = l2_norm_clip * noise_multiplier / np.sqrt(num_examples)

        for name, param in model.named_parameters():
            noise = sigma * torch.randn_like(param.grad)
            param.grad = gradients[name] / num_examples + noise

        optimizer.step()

        # Report progress on the full test set every tenth minibatch.
        if b % 10 == 0:
            acc = 100.0 * evaluate(model, testloader)
            print(f"[batch {b}, epoch {epoch}] train loss = {running_loss:.3f}, val acc = {acc:.1f} %")

In [None]:
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy

# Privacy accounting for the training run above, using the training-set size,
# batch size, noise multiplier, number of epochs and delta configured earlier.
# NOTE(review): the cell displays whatever this helper returns (presumably the
# epsilon estimate) — confirm against the installed tensorflow_privacy version.
compute_dp_sgd_privacy.compute_dp_sgd_privacy(
    len(trainset), batch_size, noise_multiplier, epochs, delta)

As we outline in the paper, the `noise_multiplier` passed to the privacy accounting should be divided by `np.sqrt(batch_size)`, which gives us the following updated estimate.

In [None]:
# Same import as in the previous accounting cell, repeated here.
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy

# Same accounting call as before, except that the noise multiplier is divided
# by sqrt(batch_size) before it is handed to the helper.
compute_dp_sgd_privacy.compute_dp_sgd_privacy(
    len(trainset), batch_size, noise_multiplier / np.sqrt(batch_size), epochs, delta)