# MNIST DP-SGD Keras

# All needed Imports and Functions

In [2]:
"""Evaluate the DP-SGD optimizer using TF 1.x.

Code of the Notebook is based on
https://github.com/tensorflow/privacy/blob/master/tutorials/
mnist_dpsgd_tutorial_keras.py. For a quick walkthrough see
https://github.com/tensorflow/privacy/tree/master/tutorials/walkthrough.

Attributes
----------
GradientDescentOptimizer : tf.train.GradientDescentOptimizer
    Non-DP optimizer for the training.

License of the code underlying this notebook
--------------------------------------------
Copyright 2019, The TensorFlow Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
from timeit import default_timer as timer

# select the TensorFlow 1.x runtime when running in Google Colab
try:
  %tensorflow_version 1.x
except Exception:
  pass
import tensorflow.compat.v1 as tf

# used to measure the privacy guarantee
from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import \
    get_privacy_spent

# optimizer used for the privacy-preserving training
from tensorflow_privacy.privacy.optimizers.dp_optimizer import \
    DPGradientDescentGaussianOptimizer

# Mount Google Drive under /content/drive. A later cell of this notebook reads
# the stored results from '/content/drive/My Drive/df.pkl', so the mount has to
# succeed before that cell is run. Mounting prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Short alias for the non-private TF 1.x optimizer; main() uses it when it is
# called with dpsgd=False.
GradientDescentOptimizer = tf.train.GradientDescentOptimizer


def compute_epsilon(noise_multiplier, minibatches, epochs, num_examples=60000,
                    target_delta=1e-5):
  """Computes the epsilon value for the given hyperparameters.

  Epsilon describes the strength of the privacy guarantee. In the case of
  DP-ML, it gives a bound on how much the probability of a particular model
  output can vary by including (or removing) a single training example. We
  usually want it to be a small constant. However, this is only an upper
  bound, and a large value of epsilon could still mean good practical
  privacy. Interpreting this value can be quite difficult.

  Parameters
  ----------
  noise_multiplier : float
      Noise multiplier the DP optimizer was trained with. A value of 0.0
      means that no noise was added; epsilon is then infinite.
  minibatches : int
      Number of samples used in each training step (the batch size).
  epochs : int
      Number of passes over the training data.
  num_examples : int, optional
      Number of training examples. Defaults to 60000, the size of the MNIST
      training set used throughout this notebook.
  target_delta : float, optional
      Delta of the (epsilon, delta) guarantee. It bounds the probability that
      the privacy guarantee does not hold. Defaults to 1e-5.

  Returns
  -------
  float
      Epsilon value for the expended privacy budget, or ``inf`` if
      `noise_multiplier` is 0.0.
  """
  # Together with the noise multiplier, these are the parameters relevant to
  # measuring the potential privacy loss induced by the training.
  #
  # * steps: number of steps the optimizer takes over the training data.
  # * sampling_probability: probability of an individual training point being
  #   included in a minibatch.
  steps = epochs * num_examples // minibatches
  sampling_probability = minibatches / num_examples

  # Without noise there is no privacy guarantee at all.
  if noise_multiplier == 0.0:
    return float('inf')

  # Orders at which the RDP accountant is evaluated. This range follows the
  # rule of thumb given in the TensorFlow Privacy walkthrough.
  orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))

  rdp = compute_rdp(q=sampling_probability,
                    noise_multiplier=noise_multiplier,
                    steps=steps,
                    orders=orders)

  # Rule of thumb from the walkthrough: delta is chosen as 1e-5 because MNIST
  # has 60000 training points. Only the first element of the result (epsilon)
  # is of interest here.
  return get_privacy_spent(orders, rdp, target_delta=target_delta)[0]


def load_mnist():
  """Loads MNIST and prepares images and labels for the Keras model.

  Images are converted to float32, scaled by 1/255 and reshaped to
  (num_images, 28, 28, 1). Labels are converted to int32 and one-hot encoded
  over 10 classes.

  Returns
  -------
  tuple
      (train_data, train_labels, test_data, test_labels)
  """
  (raw_train_images, raw_train_digits), (raw_test_images, raw_test_digits) = (
      tf.keras.datasets.mnist.load_data())

  def _prepare_images(raw_images):
    # Scale the pixel values and add the trailing channel axis.
    scaled = np.array(raw_images, dtype=np.float32) / 255
    return scaled.reshape(scaled.shape[0], 28, 28, 1)

  def _prepare_labels(raw_digits):
    # One-hot encode the digit classes.
    digits = np.array(raw_digits, dtype=np.int32)
    return tf.keras.utils.to_categorical(digits, num_classes=10)

  train_data = _prepare_images(raw_train_images)
  test_data = _prepare_images(raw_test_images)
  train_labels = _prepare_labels(raw_train_digits)
  test_labels = _prepare_labels(raw_test_digits)

  # Sanity check: after scaling, both splits must span exactly [0, 1].
  for images in (train_data, test_data):
    assert images.min() == 0.
    assert images.max() == 1.

  return train_data, train_labels, test_data, test_labels


def main(dpsgd, learning_rate, noise_multiplier, l2_norm_clip, minibatches,
         epochs, microbatches, verbose=1):
  """Builds and trains the Keras CNN on MNIST and reports the privacy budget.

  Parameters
  ----------
  dpsgd : bool
      If True, train with DP-SGD. If False, train with vanilla SGD.
  learning_rate : float
      Learning rate for training.
  noise_multiplier : float
      Ratio of the standard deviation to the clipping norm. Typically more
      noise results in stronger privacy, often at the expense of utility.
      Only used if `dpsgd` is True.
  l2_norm_clip : float
      Maximum Euclidean norm of each individual gradient that is computed on
      an individual training example from a minibatch. This bounds the
      optimizer's sensitivity to individual training points. Only used if
      `dpsgd` is True.
  minibatches : int
      Number of samples used in each training step (the Keras batch size).
  epochs : int
      Number of epochs used for the training.
  microbatches : int
      Number of microbatches; must evenly divide `minibatches`. It is passed
      to the DP optimizer as `num_microbatches`. Clipping gradients for each
      example individually can strongly degrade performance, because the
      computation can no longer be parallelized at the granularity of
      minibatches, so gradients are clipped per microbatch instead. The
      TensorFlow Privacy walkthrough describes the number of microbatches as
      a trade-off between runtime performance (few microbatches) and utility
      (a number close to the minibatch size).
  verbose : int, optional
      Verbose parameter of the Keras fit function.

  Returns
  -------
  tuple
      (history, training_time, epsilon): the object returned by `model.fit`,
      the wall-clock duration of the `fit` call in seconds, and the epsilon
      computed by `compute_epsilon`. For `dpsgd=False` epsilon is the
      placeholder value -42.

  Raises
  ------
  ValueError
      If `dpsgd` is True and `microbatches` does not divide `minibatches`.
  """
  if dpsgd and minibatches % microbatches != 0:
    raise ValueError('Number of microbatches should divide evenly minibatches')

  # Training and test split of MNIST, already preprocessed.
  train_data, train_labels, test_data, test_labels = load_mnist()

  # Small sequential CNN; the last layer emits logits (no softmax).
  layers = tf.keras.layers
  model = tf.keras.Sequential([
      layers.Conv2D(16, 8, strides=2, padding='same', activation='relu',
                    input_shape=(28, 28, 1)),
      layers.MaxPool2D(2, 1),
      layers.Conv2D(32, 4, strides=2, padding='valid', activation='relu'),
      layers.MaxPool2D(2, 1),
      layers.Flatten(),
      layers.Dense(32, activation='relu'),
      layers.Dense(10)
  ])

  loss_options = {'from_logits': True}
  if dpsgd:
    optimizer = DPGradientDescentGaussianOptimizer(
        l2_norm_clip=l2_norm_clip,
        noise_multiplier=noise_multiplier,
        num_microbatches=microbatches,
        learning_rate=learning_rate)
    # The DP optimizer needs the vector of per-example losses rather than
    # their mean over the minibatch, so that gradients can be clipped and
    # noised individually. Therefore the loss reduction is switched off.
    loss_options['reduction'] = tf.losses.Reduction.NONE
  else:
    optimizer = GradientDescentOptimizer(learning_rate=learning_rate)
  loss = tf.keras.losses.CategoricalCrossentropy(**loss_options)

  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

  # Only the fit call itself is timed.
  fit_started = timer()
  history = model.fit(train_data, train_labels,
                      epochs=epochs,
                      validation_data=(test_data, test_labels),
                      batch_size=minibatches, verbose=verbose)
  training_time = timer() - fit_started  # time in seconds

  # Privacy budget expended; -42 is a placeholder for non-private training.
  epsilon = (compute_epsilon(noise_multiplier, minibatches, epochs)
             if dpsgd else -42)

  return history, training_time, epsilon

TensorFlow 1.x selected.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Train a Model

In [0]:
# Train a single Keras model and report the privacy budget it expended.
dpsgd = True
_, training_time, epsilon = main(dpsgd=dpsgd,
                                 learning_rate=0.15,
                                 noise_multiplier=1.1,
                                 l2_norm_clip=1.0,
                                 minibatches=250,
                                 epochs=60,
                                 microbatches=250)

print('Training time: ', training_time)
if not dpsgd:
  summary = 'Trained with vanilla non-private SGD optimizer'
else:
  summary = 'For delta=1e-5, the current epsilon is: %.2f' % epsilon
print(summary)

Train on 60000 samples, validate on 10000 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Training time:  4168.984221939001
For delta=1e-5, the current epsilon is: 2.97


## Evaluate DP-SGD

In [0]:
# We evaluate DP-SGD with the following hyperparameter configurations in terms
# of epsilon, accuracy and running time. Each configuration (run) is trained
# n_repeats times.
#
# All per-run lists below are aligned by run index (one entry per run). Run 0
# is the non-private baseline: its noise multiplier and clipping threshold do
# not apply and are therefore NaN. main() ignores both when dpsgd is False.
n_repeats = 3

dps = [False, True, True, True]
learning_rates = [0.1, 0.25, 0.15, 0.25]
noise_multipliers = [np.nan, 1.3, 1.1, 0.7]
clipping_thresholds = [np.nan, 1.5, 1, 1.5]
minibatches = 250
microbatches = 250
epochs = [20, 15, 60, 45]

# zip() would silently truncate, so make sure no per-run list is too short.
per_run_lists = (dps, learning_rates, noise_multipliers, clipping_thresholds,
                 epochs)
assert len({len(values) for values in per_run_lists}) == 1, \
    'every per-run hyperparameter list needs exactly one entry per run'

# save the hyperparameters and results of each iteration in a list
rows = []
for idx, (dp, learning_rate, noise_multiplier, clipping_threshold,
          n_epochs) in enumerate(zip(*per_run_lists)):
  for i in range(n_repeats):
    print('Run: ', idx, ' - Loop: ', i)

    history, training_time, epsilon = main(dp, learning_rate,
                                           noise_multiplier,
                                           clipping_threshold,
                                           minibatches, n_epochs,
                                           microbatches, 0)

    # validation accuracy after the last epoch
    accuracy = history.history['val_acc'][-1]

    # One row per training. main() returns a placeholder epsilon for
    # non-private training, which is recorded as NaN as well. The key
    # 'clipping_threshols' is kept as is because the following cells and the
    # stored results use this spelling.
    rows.append({'dp': dp,
                 'learning_rate': learning_rate,
                 'noise_multiplier': noise_multiplier,
                 'clipping_threshols': clipping_threshold,
                 'minibatches': minibatches,
                 'microbatches': microbatches,
                 'epochs': n_epochs,
                 'epsilon': epsilon if dp else np.nan,
                 'accuracy': accuracy,
                 'training_time': training_time})

df = pd.DataFrame(rows)

Run:  0  - Loop:  0
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Run:  0  - Loop:  1
Run:  0  - Loop:  2
Run:  1  - Loop:  0
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Run:  1  - Loop:  1
Run:  1  - Loop:  2
Run:  2  - Loop:  0
Run:  2  - Loop:  1
Run:  2  - Loop:  2
Run:  3  - Loop:  0
Run:  3  - Loop:  1
Run:  3  - Loop:  2


In [8]:
# Load the stored results from Google Drive and tidy them up. To store freshly
# computed results instead, run the following line once:
#df.to_pickle('/content/drive/My Drive/df.pkl')
df = pd.read_pickle('/content/drive/My Drive/df.pkl')

# The privacy-related columns carry no meaning for the non-private runs.
not_private = df['dp'].eq(False)
dp_only_columns = ['epsilon', 'noise_multiplier', 'clipping_threshols']
df.loc[not_private, dp_only_columns] = np.nan

# Label the rows with their run: 4 runs with 3 repetitions each, stored in
# run order.
df['Run'] = np.repeat(np.arange(4), 3)
df = df.set_index('Run')

# seconds -> minutes
df['training_time'] = df['training_time'] / 60
df

Unnamed: 0_level_0,dp,learning_rate,noise_multiplier,clipping_threshols,minibatches,microbatches,epochs,epsilon,accuracy,training_time
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,0.1,,,250,250,20,,0.9896,0.537926
0,False,0.1,,,250,250,20,,0.9896,0.313726
0,False,0.1,,,250,250,20,,0.9908,0.312303
1,True,0.25,1.3,1.5,250,250,15,1.179901,0.9506,17.198169
1,True,0.25,1.3,1.5,250,250,15,1.179901,0.9473,17.125886
1,True,0.25,1.3,1.5,250,250,15,1.179901,0.9509,17.115841
2,True,0.15,1.1,1.0,250,250,60,2.96993,0.9679,68.369128
2,True,0.15,1.1,1.0,250,250,60,2.96993,0.9639,68.14431
2,True,0.15,1.1,1.0,250,250,60,2.96993,0.9656,68.915586
3,True,0.25,0.7,1.5,250,250,45,7.009134,0.9695,52.453361


In [9]:
# Relative standard deviation (standard deviation divided by mean) of the
# accuracy and the training time within each run. groupby(level=...) is used
# because the `level` argument of DataFrame.std/DataFrame.mean is deprecated
# and no longer available in pandas 2.0.
metrics_per_run = df.loc[:, ['accuracy', 'training_time']].groupby(level='Run')
metrics_per_run.std() / metrics_per_run.mean()

Unnamed: 0_level_0,accuracy,training_time
Run,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0007,0.33469
1,0.002104,0.002619
2,0.002079,0.005793
3,0.001083,0.016797


In [10]:
# Collapse the repetitions of each run into a single row. groupby(level=...)
# is used because the `level` argument of DataFrame.mean is deprecated and no
# longer available in pandas 2.0.
#
# Only the measured columns are averaged. The remaining columns describe the
# configuration of a run and are expected to be identical for all of its
# repetitions, so their first value is taken. Unlike averaging, this keeps
# their dtypes (e.g. the boolean `dp` flag).
measured_columns = ['accuracy', 'training_time']
setting_columns = [col for col in df.columns if col not in measured_columns]

repetitions = df.groupby(level='Run')
assert (repetitions[setting_columns].nunique() <= 1).all().all(), \
    'settings differ between the repetitions of a run'

df = repetitions.agg({col: ('mean' if col in measured_columns else 'first')
                      for col in df.columns})
df

Unnamed: 0_level_0,dp,learning_rate,noise_multiplier,clipping_threshols,minibatches,microbatches,epochs,epsilon,accuracy,training_time
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,0.1,,,250,250,20,,0.99,0.387985
1,True,0.25,1.3,1.5,250,250,15,1.179901,0.9496,17.146632
2,True,0.15,1.1,1.0,250,250,60,2.96993,0.9658,68.476342
3,True,0.25,0.7,1.5,250,250,45,7.009134,0.969533,53.398623


In [11]:
# Leave the batch size columns out of the table and emit it as LaTeX source.
df = df.drop(columns=['minibatches', 'microbatches'])
latex_source = df.to_latex(index=False)
print(latex_source)

\begin{tabular}{lrrrrrrr}
\toprule
    dp &  learning\_rate &  noise\_multiplier &  clipping\_threshols &  epochs &   epsilon &  accuracy &  training\_time \\
 False &           0.10 &               NaN &                 NaN &      20 &       NaN &  0.990000 &       0.387985 \\
\midrule
  True &           0.25 &               1.3 &                 1.5 &      15 &  1.179901 &  0.949600 &      17.146632 \\
  True &           0.15 &               1.1 &                 1.0 &      60 &  2.969930 &  0.965800 &      68.476342 \\
  True &           0.25 &               0.7 &                 1.5 &      45 &  7.009134 &  0.969533 &      53.398623 \\
\bottomrule
\end{tabular}

