<a href="https://colab.research.google.com/github/jalane76/adversarial-attacks-tutorial/blob/main/adversarial_attacks_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install adversarial-robustness-toolbox

In [None]:
from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import PyTorchClassifier
from art.utils import load_mnist
import matplotlib
from matplotlib.colors import TwoSlopeNorm
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Set up
# Seed NumPy's and PyTorch's global random number generators so that weight
# initialisation and other random draws repeat across "Restart & Run All".
rand_seed = 978614566
np.random.seed(rand_seed)
torch.manual_seed(rand_seed)

# Spatial size of one input image
image_width = 28
image_height = 28
# Channels-first (C, H, W) shape of a single sample, derived from the sizes
# above so the three values cannot drift apart.
input_shape = (1, image_height, image_width)

# Training hyper-parameters and number of target classes
batch_size = 64
num_epochs = 10
num_labels = 10

In [None]:
 # Load data
(x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

# Pixel range, later passed to the ART classifier as `clip_values`
clip_values = (min_pixel_value, max_pixel_value)

# Move the channel axis from last to second position (NCHW, as PyTorch
# expects) and cast to 32-bit floats
x_train = x_train.transpose(0, 3, 1, 2).astype(np.float32)
x_test = x_test.transpose(0, 3, 1, 2).astype(np.float32)

print(f"{x_train.shape} training data shape")
print(f"{x_test.shape} test data shape")


In [None]:
# Let's see the first few training samples, each titled with its true label
num_samples = 5
num_rows = 1

fig, axes = plt.subplots(num_rows, num_samples, sharex=True, sharey=True, squeeze=False)
fig.set_figheight(4.0 * num_rows)
fig.set_figwidth(4.0 * num_samples)
for sample_idx in range(num_samples):
  sample_axis = axes[0, sample_idx]
  # Index channel 0 to obtain the 2-D image that imshow expects
  sample = x_train[sample_idx, 0, :, :]
  sample_axis.imshow(
    sample, aspect="equal", interpolation="nearest"
  )
  # Labels are treated as one-hot vectors (as in the accuracy cells), so
  # argmax recovers the class index
  sample_axis.set_title(f"Label: {np.argmax(y_train[sample_idx])}")

In [None]:
# Define model
class Net(nn.Module):
  """Small convolutional classifier: two conv/max-pool stages, two linear layers.

  The first linear layer takes 4 * 4 * 10 features per sample, which is what
  the convolutional stack yields for single-channel 28x28 inputs
  (spatial size 28 -> 24 -> 12 -> 8 -> 4, with 10 output channels).
  """

  def __init__(self):
    """Create the layers; attribute names are kept stable for state dicts."""
    super().__init__()
    self.conv_1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=5, stride=1)
    self.conv_2 = nn.Conv2d(in_channels=4, out_channels=10, kernel_size=5, stride=1)
    self.fc_1 = nn.Linear(in_features=4 * 4 * 10, out_features=100)
    self.fc_2 = nn.Linear(in_features=100, out_features=10)

  def forward(self, x):
    """Compute unnormalised class scores for a batch.

    Args:
      x: Tensor of shape (N, 1, H, W); H = W = 28 gives the 160 features per
        sample that ``fc_1`` requires.

    Returns:
      Tensor of shape (N, 10); no softmax is applied.

    Raises:
      RuntimeError: If the flattened feature count per sample does not match
        ``fc_1``'s input size.
    """
    x = F.relu(self.conv_1(x))
    x = F.max_pool2d(x, 2, 2)
    x = F.relu(self.conv_2(x))
    x = F.max_pool2d(x, 2, 2)
    # Flatten everything except the batch axis. Unlike view(-1, 160), this
    # cannot fold a feature-size mismatch into a different batch size, so
    # wrongly sized inputs fail loudly in fc_1 instead of returning the wrong
    # number of rows.
    x = torch.flatten(x, start_dim=1)
    x = F.relu(self.fc_1(x))
    x = self.fc_2(x)
    return x

# Instantiate the network together with its loss function and optimizer
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

# Wrap model, loss and optimizer in an ART classifier
classifier = PyTorchClassifier(
  model=model,
  loss=criterion,
  optimizer=optimizer,
  input_shape=input_shape,
  nb_classes=num_labels,
  clip_values=clip_values,
)

In [None]:
# Fit the ART-wrapped model on the benign training data
classifier.fit(
  x_train,
  y_train,
  nb_epochs=num_epochs,
  batch_size=batch_size,
)

In [None]:
# Measure accuracy on the unmodified (benign) test samples
predictions = classifier.predict(x_test)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = np.argmax(y_test, axis=1)
accuracy = np.sum(predicted_labels == true_labels) / len(y_test)
print(f"Benign accuracy: {accuracy * 100}%")

In [None]:
# Build a Fast Gradient Method attack against the classifier and use it to
# perturb every test sample (eps=0.2 is the attack's perturbation setting)
attack = FastGradientMethod(
  estimator=classifier,
  eps=0.2,
)
x_test_adv = attack.generate(x_test)

In [None]:
# Let's compare the first benign samples (top row) with their adversarial
# counterparts (bottom row)
num_samples = 5
num_rows = 2

fig, axes = plt.subplots(num_rows, num_samples, sharex=True, sharey=True, squeeze=False)
fig.set_figheight(4.0 * num_rows)
fig.set_figwidth(4.0 * num_samples)
for sample_idx in range(num_samples):
  # Both rows share one colour scale (the dataset's pixel range). Without
  # vmin/vmax, imshow rescales every panel to its own min/max, which makes
  # the benign and adversarial images not directly comparable.
  sample_axis = axes[0, sample_idx]
  sample = x_test[sample_idx, 0, :, :]
  sample_axis.imshow(
    sample, aspect="equal", interpolation="nearest",
    vmin=min_pixel_value, vmax=max_pixel_value,
  )
  sample_axis.set_title(f"Benign #{sample_idx}")

  evil_twin_axis = axes[1, sample_idx]
  evil_twin = x_test_adv[sample_idx, 0, :, :]
  evil_twin_axis.imshow(
    evil_twin, aspect="equal", interpolation="nearest",
    vmin=min_pixel_value, vmax=max_pixel_value,
  )
  evil_twin_axis.set_title(f"Adversarial #{sample_idx}")

In [None]:
# Measure accuracy on the adversarially perturbed test samples
predictions = classifier.predict(x_test_adv)
is_correct = predictions.argmax(axis=1) == y_test.argmax(axis=1)
num_correct = np.sum(is_correct)
accuracy = num_correct / len(y_test)
print(f"Adversarial accuracy: {accuracy * 100}%")

In [None]:
# Let's augment the training data with adversarial examples and retrain.
# The benign samples are kept alongside the adversarial ones: fitting on the
# adversarial set alone would replace the training data rather than augment it.
x_train_adv = attack.generate(x=x_train)
x_train_aug = np.concatenate((x_train, x_train_adv), axis=0)
# Each adversarial sample keeps the label of the benign sample it came from,
# so the label array is simply repeated in the same order.
y_train_aug = np.concatenate((y_train, y_train), axis=0)
classifier.fit(x_train_aug, y_train_aug, batch_size=batch_size, nb_epochs=num_epochs)

In [None]:
# Retest the accuracy after retraining
# 1) On the adversarial samples generated earlier, i.e. crafted against the
#    classifier as it was BEFORE retraining.
predictions = classifier.predict(x_test_adv)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print(f"Retrained accuracy: {accuracy * 100}%")

# 2) On adversarial samples generated now. `attack` was built around this same
#    `classifier` object, so these are expected to target the retrained model;
#    the stale samples above can overstate robustness.
x_test_adv_retrained = attack.generate(x=x_test)
fresh_predictions = classifier.predict(x_test_adv_retrained)
fresh_accuracy = np.sum(np.argmax(fresh_predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print(f"Retrained accuracy on freshly generated adversarial samples: {fresh_accuracy * 100}%")

# 3) On benign samples, to see what retraining cost in clean accuracy.
benign_predictions = classifier.predict(x_test)
benign_accuracy = np.sum(np.argmax(benign_predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print(f"Retrained benign accuracy: {benign_accuracy * 100}%")