In [0]:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time

from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM

from tensorflow_privacy.privacy.analysis.rdp_accountant import compute_rdp
from tensorflow_privacy.privacy.analysis.rdp_accountant import get_privacy_spent
from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer

In [0]:
experiment_num=5

# Datasets
##Cat_versus_dog

In [19]:
catdog_builder = tfds.builder("cats_vs_dogs")
catdog_builder.download_and_prepare()

catdog_train_dataset, catdog_test_dataset = catdog_builder.as_dataset(
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True
)

catdog_num_classes = catdog_builder.info.features['label'].num_classes
catdog_data_type = 'text' if 'text' in catdog_builder.info.features else 'image'

catdog_train_dataset = catdog_train_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
catdog_test_dataset = catdog_test_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))

print(tf.data.experimental.cardinality(catdog_train_dataset))
print(tf.data.experimental.cardinality(catdog_test_dataset))
print(catdog_num_classes)
print(catdog_data_type)

tf.Tensor(18610, shape=(), dtype=int64)
tf.Tensor(4652, shape=(), dtype=int64)
2
image


##Flowers

In [20]:
flower_builder = tfds.builder("tf_flowers")
flower_builder.download_and_prepare()

flower_train_dataset, flower_test_dataset = flower_builder.as_dataset(
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True
)

flower_num_classes = flower_builder.info.features['label'].num_classes
flower_data_type = 'text' if 'text' in flower_builder.info.features else 'image'

flower_train_dataset = flower_train_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
flower_test_dataset = flower_test_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))

print(tf.data.experimental.cardinality(flower_train_dataset))
print(tf.data.experimental.cardinality(flower_test_dataset))
print(flower_num_classes)
print(flower_data_type)

tf.Tensor(2936, shape=(), dtype=int64)
tf.Tensor(734, shape=(), dtype=int64)
5
image


##Malaria

In [21]:
malaria_builder = tfds.builder("malaria")
malaria_builder.download_and_prepare()

malaria_train_dataset, malaria_test_dataset = malaria_builder.as_dataset(
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True
)

malaria_num_classes = malaria_builder.info.features['label'].num_classes
malaria_data_type = 'text' if 'text' in malaria_builder.info.features else 'image'

malaria_train_dataset = malaria_train_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
malaria_test_dataset = malaria_test_dataset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))

print(tf.data.experimental.cardinality(malaria_train_dataset))
print(tf.data.experimental.cardinality(malaria_test_dataset))
print(malaria_num_classes)
print(malaria_data_type)

tf.Tensor(22046, shape=(), dtype=int64)
tf.Tensor(5512, shape=(), dtype=int64)
2
image


# Model

In [0]:
def compute_original_epsilon():
  """Computes epsilon value of Gaussian Mechanism"""
  orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
  noise_multiplier = 1.1
  sampling_probability = 1
  rdp = compute_rdp(q=sampling_probability,
                    noise_multiplier=noise_multiplier,
                    steps=1,
                    orders=orders)
  return get_privacy_spent(orders, rdp, target_delta=1e-5)[0]

In [0]:
def compute_shuffle_epsilon(epochs, batch_size, dataset_size):
  """Computes epsilon value of shuffle"""
  delta = 1e-5
  shuffle_epsilon = compute_original_epsilon()
  composition_epsilon = shuffle_epsilon*np.sqrt(2*epochs*np.log(1/delta)) + epochs*shuffle_epsilon*(np.exp(shuffle_epsilon)-1)
  return composition_epsilon

In [0]:
def compute_swo_epsilon(steps, batch_size, dataset_size):
  """Computes epsilon value of swo"""
  delta = 1e-5
  epsilon = compute_original_epsilon()
  sampling_probability = batch_size / dataset_size
  swo_epsilon = np.log(1+sampling_probability*(np.exp(epsilon)-1))
  composition_epsilon = swo_epsilon*np.sqrt(2*steps*np.log(1/delta)) + steps*swo_epsilon*(np.exp(swo_epsilon)-1)
  return composition_epsilon

In [0]:
def compute_poisson_epsilon(steps, batch_size, dataset_size):
  """Computes epsilon value for Poisson"""
  orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
  noise_multiplier = 1.1
  sampling_probability = batch_size / dataset_size
  rdp = compute_rdp(q=sampling_probability,
                    noise_multiplier=noise_multiplier,
                    steps=steps,
                    orders=orders)
  return get_privacy_spent(orders, rdp, target_delta=1e-5)[0]

In [0]:
def cnn_model(num_classes):
  model = tf.keras.Sequential([
      Conv2D(32, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001),
             input_shape=(64, 64, 3)),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Conv2D(64, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Conv2D(128, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Flatten(),
      Dense(128, activation='relu'),
      Dense(64, activation='relu'),
      Dense(num_classes)
  ])

  learning_rate = 0.15
  l2_norm_clip = 1.0
  microbatches = 25
  noise_multiplier = 1.1
  optimizer = DPGradientDescentGaussianOptimizer(
      l2_norm_clip=l2_norm_clip,
      noise_multiplier=noise_multiplier,
      num_microbatches=microbatches,
      learning_rate=learning_rate)

  model.compile(optimizer=optimizer,
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

# Shuffling

In [0]:
def shuffle_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, num_classes = 2, data_type = 'image', encoder = None):
  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))

  if data_type == 'image':
    shuffle_test_dataset = test_dataset.batch(batch_size).repeat()
    shuffle_train_dataset = train_dataset.shuffle(1000).batch(batch_size).repeat()

    model_shuffle = cnn_model(num_classes)
  
  if data_type == 'text':
    shuffle_test_dataset = test_dataset.padded_batch(batch_size).repeat()
    shuffle_train_dataset = train_dataset.shuffle(1000).padded_batch(batch_size).repeat()

    model_shuffle = rnn_model(encoder, num_classes)

  train_his = model_shuffle.fit(
    shuffle_train_dataset,
    steps_per_epoch = train_dataset_size // batch_size,
    validation_data = shuffle_test_dataset,
    validation_steps = test_dataset_size // batch_size,
    epochs = epochs,
    verbose=0
  )
  return train_his.history['val_accuracy'][-1]  

##Cat_versus_dog

In [21]:
accs = []
for _ in range(experiment_num):
  acc = shuffle_test(catdog_train_dataset, catdog_test_dataset, num_classes=catdog_num_classes, data_type = catdog_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.7823225855827332


In [22]:
train_dataset_size = int(tf.data.experimental.cardinality(catdog_train_dataset))
eps = compute_shuffle_epsilon(10, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 5687.38


##Flowers

In [23]:
accs = []
for _ in range(experiment_num):
  acc = shuffle_test(flower_train_dataset, flower_test_dataset, num_classes=flower_num_classes, data_type = flower_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.3885714262723923


In [24]:
train_dataset_size = int(tf.data.experimental.cardinality(flower_train_dataset))
eps = compute_shuffle_epsilon(10, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 5687.38


##Malaria

In [25]:
accs = []
for _ in range(experiment_num):
  acc = shuffle_test(malaria_train_dataset, malaria_test_dataset, num_classes = malaria_num_classes, data_type = malaria_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.8031999945640564


In [26]:
train_dataset_size = int(tf.data.experimental.cardinality(malaria_train_dataset))
eps = compute_shuffle_epsilon(10, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 5687.38


# SWO

In [0]:
def generate_batches_by_swo(dataset, batch_size, data_type = 'image', test_mode = False):
    dataset_size = int(tf.data.experimental.cardinality(dataset))
    steps = dataset_size // batch_size
    indexes = [np.random.choice(dataset_size, size=batch_size, replace=False) for _ in range(steps)]

    if test_mode:
      returned_indexes = indexes.copy()
    
    batch_group_by_index = {}
    for batch_index in range(steps):
      for element_index in indexes[batch_index]:
        if element_index not in batch_group_by_index:
          batch_group_by_index[element_index] = [batch_index]
        else:
          batch_group_by_index[element_index].append(batch_index)

    element_index = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]
    
    if data_type == 'image':
      for element in dataset.as_numpy_iterator():
        if element_index in batch_group_by_index:
          for batch_index in batch_group_by_index[element_index]:
            batch_features[batch_index].append(element[0])
            batch_labels[batch_index].append(element[1])
        element_index += 1
    
      features_shape = [batch_size, 64, 64, 3]
    
    if data_type == 'text':
      max_text_len = 0
      for element in dataset.as_numpy_iterator():
        if element_index in batch_group_by_index:
          for batch_index in batch_group_by_index[element_index]:
            batch_features[batch_index].append(element[0])
            batch_labels[batch_index].append(element[1])
            max_text_len = max(len(element[0]), max_text_len)
        element_index += 1
      
      #padding
      for bf in batch_features:
        for i in range(len(bf)):
          bf[i] = np.pad(
              bf[i],
              pad_width=(0, max_text_len-len(bf[i])),
              mode = 'constant',
              constant_values=0
          )
      
      features_shape = [batch_size, max_text_len]

    labels_shape = [batch_size, ]
    dataset = tf.data.Dataset.from_generator(
        lambda: zip(batch_features, batch_labels),
        (tf.float32, tf.float32),
        (features_shape, labels_shape)
    )

    if not test_mode:
      return dataset
    else:
      return dataset, returned_indexes

In [0]:
def swo_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, data_type = 'image', num_classes = 2, encoder = None):
  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))
  train_steps = train_dataset_size // batch_size
  test_steps = test_dataset_size // batch_size

  if data_type == 'image':
    swo_test_dataset = test_dataset.batch(batch_size).repeat()
    model_swo = cnn_model(num_classes)
  
  if data_type == 'text':
    swo_test_dataset = test_dataset.padded_batch(batch_size).repeat()
    model_swo = rnn_model(encoder, num_classes)

  swo_train_dataset = generate_batches_by_swo(train_dataset, batch_size=batch_size, data_type=data_type).repeat()

  train_his=model_swo.fit(
    swo_train_dataset,
    steps_per_epoch = train_steps,
    validation_data = swo_test_dataset,
    validation_steps = test_steps,
    epochs=epochs,
    verbose=0
  )
  return train_his.history['val_accuracy'][-1]

##Cat_versus_dog

In [48]:
accs = []
for _ in range(experiment_num):
  acc = swo_test(catdog_train_dataset, catdog_test_dataset, num_classes = catdog_num_classes, data_type = catdog_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.7654623627662659


In [31]:
train_dataset_size = int(tf.data.experimental.cardinality(catdog_train_dataset))
eps = compute_swo_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 403.13


##Flowers

In [49]:
accs = []
for _ in range(experiment_num):
  acc = swo_test(flower_train_dataset, flower_test_dataset, num_classes = flower_num_classes, data_type = flower_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.49742857217788694


In [33]:
train_dataset_size = int(tf.data.experimental.cardinality(flower_train_dataset))
eps = compute_swo_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 1419.99


##Malaria

In [14]:
accs = []
for _ in range(experiment_num):
  acc = swo_test(malaria_train_dataset, malaria_test_dataset, num_classes = malaria_num_classes, data_type = malaria_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.8656363606452941


In [35]:
train_dataset_size = int(tf.data.experimental.cardinality(malaria_train_dataset))
eps = compute_swo_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 353.24


# Poisson

In [0]:
def generate_batches_by_poisson(dataset, batch_size, data_type = 'image', test_mode = False):
    dataset_size = int(tf.data.experimental.cardinality(dataset))
    steps = dataset_size // batch_size
    ratio = batch_size / dataset_size
    
    indexes = []
    for _ in range(steps):
      pros = np.random.uniform(0,1,dataset_size)
      chosen = [i for i in range(dataset_size) if pros[i] < ratio]
      indexes.append(chosen)
    
    if test_mode:
      returned_indexes = indexes.copy()
    
    batch_group_by_index = {}
    for batch_index in range(steps):
      for element_index in indexes[batch_index]:
        # steps * batch_size = n, so O(n)
        if element_index not in batch_group_by_index:
          batch_group_by_index[element_index] = [batch_index]
        else:
          batch_group_by_index[element_index].append(batch_index)

    element_index = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]

    if data_type == 'image':
      for element in dataset.as_numpy_iterator():
        if element_index in batch_group_by_index:
          for batch_index in batch_group_by_index[element_index]:
            # on average, O(n*1) = O(n), on worst case, O(n*n/m) = O(n^2)
            # the worst case is not gonna happen, statistically
            batch_features[batch_index].append(element[0])
            batch_labels[batch_index].append(element[1])
        element_index += 1
    
      features_shape = [None, 64, 64, 3]
    
    if data_type == 'text':
      max_text_len = 0
      for element in dataset.as_numpy_iterator():
        if element_index in batch_group_by_index:
          for batch_index in batch_group_by_index[element_index]:
            # on average, O(n*1) = O(n), on worst case, O(n*n/m) = O(n^2)
            # the worst case is not gonna happen, statistically
            batch_features[batch_index].append(element[0])
            batch_labels[batch_index].append(element[1])
            max_text_len = max(len(element[0]), max_text_len)
        element_index += 1
      
      #padding
      for bf in batch_features:
        for i in range(len(bf)):
          bf[i] = list(bf[i]) + [0]*(max_text_len-len(bf[i]))
      
      features_shape = [None, max_text_len]

    labels_shape = [None, ]

    dataset = tf.data.Dataset.from_generator(
        lambda: zip(batch_features,batch_labels),
        (tf.float32, tf.float32),
        (features_shape, labels_shape)
    )

    if not test_mode:
      return dataset
    else:
      return dataset, returned_indexes

In [0]:
def poisson_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, data_type = 'image', num_classes = 2, encoder = None):
  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))
  train_steps = train_dataset_size // batch_size
  test_steps = test_dataset_size // batch_size

  if data_type == 'image':
    poisson_test_dataset = test_dataset.batch(batch_size).repeat()
    model_poisson = cnn_model(num_classes)
  
  if data_type == 'text':
    poisson_test_dataset = test_dataset.padded_batch(batch_size).repeat()
    model_poisson = rnn_model(encoder, num_classes)

  poisson_train_dataset = generate_batches_by_poisson(train_dataset, batch_size=batch_size, data_type=data_type).repeat()

  train_his=model_poisson.fit(
    poisson_train_dataset,
    steps_per_epoch=train_steps,
    validation_data = poisson_test_dataset,
    validation_steps = test_steps,
    epochs=epochs,
    verbose=0
  )
  return train_his.history['val_accuracy'][-1]

##Cat_versus_dog

In [29]:
accs = []
for _ in range(experiment_num):
  acc = poisson_test(catdog_train_dataset, catdog_test_dataset, num_classes = catdog_num_classes, data_type = catdog_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.7589677333831787


In [39]:
train_dataset_size = int(tf.data.experimental.cardinality(catdog_train_dataset))
eps = compute_poisson_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 1.20


##Flowers

In [32]:
accs = []
for _ in range(experiment_num):
  acc = poisson_test(flower_train_dataset, flower_test_dataset, num_classes = flower_num_classes, data_type = flower_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.4437142848968506


In [41]:
train_dataset_size = int(tf.data.experimental.cardinality(flower_train_dataset))
eps = compute_poisson_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 2.78


##Malaria

In [33]:
accs = []
for _ in range(experiment_num):
  acc = poisson_test(malaria_train_dataset, malaria_test_dataset, num_classes = malaria_num_classes, data_type = malaria_data_type)
  accs.append(acc)
average_acc = np.mean(accs)
print('acc: ', average_acc)

acc:  0.86345454454422


In [43]:
train_dataset_size = int(tf.data.experimental.cardinality(malaria_train_dataset))
eps = compute_poisson_epsilon(10*train_dataset_size//50, 50, train_dataset_size)
print('For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: %.2f' % eps)

For delta_1=1e-5 and delta_2=1e-5, the current epsilon is: 1.12
