In [1]:
# Установим adversarial-robustness-toolbox
!pip install adversarial-robustness-toolbox

Collecting adversarial-robustness-toolbox
  Downloading adversarial_robustness_toolbox-1.17.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.2.0,>=0.22.2 (from adversarial-robustness-toolbox)
  Downloading scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn, adversarial-robustness-toolbox
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.19.2 requires sci

In [2]:
# Импортируем необходимые библиотеки
import numpy as np
import tensorflow as tf
from art.attacks.poisoning.backdoor_attack_dgm.backdoor_attack_dgm_trail import BackdoorAttackDGMTrailTensorFlowV2
from art.estimators.gan.tensorflow import TensorFlowV2GAN
from art.estimators.generation.tensorflow import TensorFlowV2Generator
from art.estimators.classification.tensorflow import TensorFlowV2Classifier

# Seed the NumPy and TensorFlow global random generators with one shared value
SEED = 100
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Factory function for the image generator network
def make_generator_model(capacity: int, z_dim: int) -> tf.keras.Sequential:
  """Build the generator network as a Keras Sequential model.

  A latent vector of length ``z_dim`` is projected by a dense layer, reshaped to a
  7x7 feature map and upsampled by transposed convolutions to a 28x28x1 image.

  Args:
    capacity: Base channel count; successive stages use ``capacity * 4``,
      ``capacity * 2`` and ``capacity`` feature maps.
    z_dim: Length of the latent input vector.

  Returns:
    The assembled ``tf.keras.Sequential`` model (not compiled here). Its last
    layer is a ``tanh`` activation, so outputs lie in [-1, 1].

  Raises:
    AssertionError: If an intermediate or final output shape differs from the
      shape expected for that stage.
  """
  model = tf.keras.Sequential()

  # Project the latent vector and reshape it into a 7x7 feature map
  model.add(tf.keras.layers.Dense(capacity * 7 * 7 * 4, use_bias=False, input_shape=(z_dim,)))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.LeakyReLU())

  model.add(tf.keras.layers.Reshape((7, 7, capacity * 4)))
  assert model.output_shape == (None, 7, 7, capacity * 4)

  # Stride 1: spatial size stays 7x7, channel count is halved
  model.add(tf.keras.layers.Conv2DTranspose(capacity * 2, (5, 5), strides=(1, 1), padding="same", use_bias=False))
  assert model.output_shape == (None, 7, 7, capacity * 2)
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.LeakyReLU())

  # Stride 2: upsample 7x7 -> 14x14
  model.add(tf.keras.layers.Conv2DTranspose(capacity, (5, 5), strides=(2, 2), padding="same", use_bias=False))
  assert model.output_shape == (None, 14, 14, capacity)
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.LeakyReLU())

  # Stride 2: upsample 14x14 -> 28x28 with a single output channel
  model.add(tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding="same", use_bias=False))

  model.add(tf.keras.layers.Activation(activation="tanh"))
  # The model produces normalised values in [-1, 1]
  assert model.output_shape == (None, 28, 28, 1)

  return model

In [4]:
# Factory function for the image discriminator network
def make_discriminator_model(capacity: int) -> tf.keras.Sequential:
  """Build the discriminator network as a Keras Sequential model.

  Two stride-2 convolution stages (each followed by LeakyReLU and dropout of 0.3)
  process a 28x28x1 input; the result is flattened and fed to one dense unit.

  Args:
    capacity: Number of filters in the first convolution; the second
      convolution uses ``capacity * 2``.

  Returns:
    The assembled ``tf.keras.Sequential`` model (not compiled here). The final
    ``Dense(1)`` layer has no activation, so the output is a single raw score.
  """
  model = tf.keras.Sequential()

  model.add(tf.keras.layers.Conv2D(capacity, (5, 5), strides=(2, 2), padding="same", input_shape=[28, 28, 1]))
  model.add(tf.keras.layers.LeakyReLU())
  model.add(tf.keras.layers.Dropout(0.3))

  model.add(tf.keras.layers.Conv2D(capacity * 2, (5, 5), strides=(2, 2), padding="same"))
  model.add(tf.keras.layers.LeakyReLU())
  model.add(tf.keras.layers.Dropout(0.3))

  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(1))

  return model

In [5]:
# Draw the attack trigger: a single latent vector of 100 standard-normal values
z_trigger = np.random.standard_normal(size=(1, 100)).astype(np.float64)

In [6]:
# Build the attack target: a random 28x28x1 image with integer pixel values
# in [0, 255], rescaled to the [-1, 1] range
x_target = (
    np.random.randint(0, 256, size=(28, 28, 1)).astype(np.float64) - 127.5
) / 127.5

In [7]:
# Load the MNIST training images; the labels and the test split are discarded
(train_images, _), (_, _) = tf.keras.datasets.mnist.load_data()

# Add a trailing channel axis, convert to float32 and rescale pixels to [-1, 1]
train_images = (train_images.reshape(-1, 28, 28, 1).astype(np.float32) - 127.5) / 127.5

# Binary cross-entropy on raw (unactivated) discriminator outputs, shared by both GAN losses
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [8]:
# Discriminator loss
def discriminator_loss(true_output, fake_output):
  """Return the discriminator loss for one batch.

  Args:
    true_output: Discriminator outputs for real images.
    fake_output: Discriminator outputs for generated images.

  Returns:
    Cross-entropy of ``true_output`` against all-ones labels plus cross-entropy
    of ``fake_output`` against all-zeros labels.
  """
  real_term = cross_entropy(tf.ones_like(true_output), true_output)
  generated_term = cross_entropy(tf.zeros_like(fake_output), fake_output)
  return real_term + generated_term

In [9]:
# Generator loss
def generator_loss(fake_output):
  """Return the generator loss for one batch.

  Args:
    fake_output: Discriminator outputs for generated images.

  Returns:
    Cross-entropy of ``fake_output`` against all-ones labels, i.e. the loss is
    low when the discriminator scores generated images as real.
  """
  real_labels = tf.ones_like(fake_output)
  return cross_entropy(real_labels, fake_output)

In [10]:
# Wrap the Keras generator and discriminator in ART estimators and combine them into a GAN
noise_dim = 100
capacity = 64
generator = TensorFlowV2Generator(
    encoding_length=noise_dim,
    model=make_generator_model(capacity, noise_dim),
)
discriminator_classifier = TensorFlowV2Classifier(
    model=make_discriminator_model(capacity),
    nb_classes=2,
    input_shape=(28, 28, 1),
)

gan = TensorFlowV2GAN(
    generator=generator,
    discriminator=discriminator_classifier,
    generator_loss=generator_loss,
    generator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
    discriminator_loss=discriminator_loss,
    discriminator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
)

In [11]:
# Set up the backdoor attack on the GAN and poison the generator so that
# z_trigger is associated with x_target
gan_attack = BackdoorAttackDGMTrailTensorFlowV2(gan=gan)
print("Poisoning estimator")
poisoned_generator = gan_attack.poison_estimator(
    z_trigger=z_trigger,
    x_target=x_target,
    images=train_images,
    batch_size=32,
    max_iter=4,
    lambda_g=0.1,
    verbose=2,
)
print("Finished poisoning estimator")

Poisoning estimator




Finished poisoning estimator


In [12]:
# Evaluate the attack: feed the trigger to the poisoned generator and compare
# its output with the target image.
# The metric is the sum of squared element-wise differences (0 is a perfect match,
# lower is better). It is not a percentage, so it is printed without a "%" sign.
x_pred_trigger = poisoned_generator.model(z_trigger)[0]
print("Target Fidelity (Attack Objective, sum of squared errors): %.2f" % np.sum((x_pred_trigger - x_target) ** 2))

Target Fidelity (Attack Objective): 75.79%


In [13]:
# Persist the attack artifacts: the trigger, the target image and the poisoned generator model
np.save(file="z_trigger_trail.npy", arr=z_trigger)
np.save(file="x_target_trail.npy", arr=x_target)
poisoned_generator.model.save(filepath="trail-mnist-dcgan")



Повторим эксперимент для целевого изображения, выбранного из `train_images` согласно варианту 10 (изображение с индексом 10), и сгенерированного триггера размерности 69 (диапазон [0;69]).

In [14]:
# Inputs for the second experiment: a trigger vector of 69 standard-normal values
# and, as the target, the training image at index 10
z_trigger_10 = np.random.standard_normal(size=(1, 69)).astype(np.float64)
x_target_10 = train_images[10]

In [15]:
# Rebuild the ART generator and discriminator wrappers for a 69-dimensional latent space.
# generator_loss is not redefined here: the definition from the earlier cell is
# identical and is reused as-is.
noise_dim = 69
capacity = 64
generator = TensorFlowV2Generator(
    encoding_length=noise_dim,
    model=make_generator_model(capacity, noise_dim),
)
discriminator_classifier = TensorFlowV2Classifier(
    model=make_discriminator_model(capacity),
    nb_classes=2,
    input_shape=(28, 28, 1),
)

In [16]:
# Combine the generator and discriminator estimators into a GAN
gan = TensorFlowV2GAN(
    generator=generator,
    discriminator=discriminator_classifier,
    generator_loss=generator_loss,
    generator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
    discriminator_loss=discriminator_loss,
    discriminator_optimizer_fct=tf.keras.optimizers.Adam(1e-4),
)

In [18]:
# Set up the backdoor attack on the new GAN and poison the generator so that
# z_trigger_10 is associated with x_target_10
gan_attack = BackdoorAttackDGMTrailTensorFlowV2(gan=gan)
print("Poisoning estimator")
poisoned_generator_10 = gan_attack.poison_estimator(
    z_trigger=z_trigger_10,
    x_target=x_target_10,
    images=train_images,
    batch_size=32,
    max_iter=4,
    lambda_g=0.1,
    verbose=2,
)
print("Finished poisoning estimator")

Poisoning estimator
Finished poisoning estimator


In [19]:
# Evaluate the second attack: feed the trigger to the poisoned generator and
# compare its output with the target image.
# The metric is the sum of squared element-wise differences (0 is a perfect match,
# lower is better). It is not a percentage, so it is printed without a "%" sign.
x_pred_trigger_10 = poisoned_generator_10.model(z_trigger_10)[0]
print("Target Fidelity (Attack Objective, sum of squared errors): %.2f" % np.sum((x_pred_trigger_10 - x_target_10) ** 2))

Target Fidelity (Attack Objective): 42.58%


### Вывод.
В ходе выполнения практической работы был использован метод TrAIL (Training with AdversarIal Loss, класс `BackdoorAttackDGMTrailTensorFlowV2` из библиотеки ART) для проведения атаки на генератор GAN. В результате атаки генератор при подаче на вход триггера выдаёт ложный образ, близкий к заданному целевому изображению.