# Adversarial Training Defense Demo
This notebook shows how to improve model robustness by incorporating adversarial examples during training.

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Load and preprocess MNIST
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)


In [2]:
# Build a CNN model
def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
        tf.keras.layers.MaxPooling2D((2,2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [3]:
# Define FGSM attack function
def fgsm_attack(model, image, label, epsilon=0.1):
    image = tf.convert_to_tensor(image[None, ...])
    label = tf.convert_to_tensor([label])

    with tf.GradientTape() as tape:
        tape.watch(image)
        prediction = model(image)
        loss = tf.keras.losses.sparse_categorical_crossentropy(label, prediction)

    gradient = tape.gradient(loss, image)
    signed_grad = tf.sign(gradient)
    adversarial = image + epsilon * signed_grad
    return tf.clip_by_value(adversarial, 0, 1).numpy()[0]


In [4]:
# Generate adversarial training data
def generate_adversarial_dataset(model, x_data, y_data, epsilon=0.1):
    x_adv = np.array([fgsm_attack(model, x, y, epsilon) for x, y in zip(x_data, y_data)])
    return x_adv, y_data


In [5]:
# Train initial model for generating adversarial examples
base_model = create_model()
base_model.fit(x_train[:5000], y_train[:5000], epochs=3, verbose=0)

# Create adversarial examples using FGSM
x_adv, y_adv = generate_adversarial_dataset(base_model, x_train[:5000], y_train[:5000], epsilon=0.2)

# Combine clean + adversarial data
x_combined = np.concatenate([x_train[:5000], x_adv])
y_combined = np.concatenate([y_train[:5000], y_adv])


  super().__init__(


In [6]:
# Train adversarially hardened model
adv_model = create_model()
adv_model.fit(x_combined, y_combined, epochs=5, validation_split=0.1)


Epoch 1/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7345 - loss: 0.8967 - val_accuracy: 0.9840 - val_loss: 0.0647
Epoch 2/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9587 - loss: 0.1466 - val_accuracy: 0.9940 - val_loss: 0.0222
Epoch 3/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9791 - loss: 0.0742 - val_accuracy: 0.9990 - val_loss: 0.0091
Epoch 4/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9898 - loss: 0.0438 - val_accuracy: 1.0000 - val_loss: 0.0059
Epoch 5/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9925 - loss: 0.0290 - val_accuracy: 1.0000 - val_loss: 0.0026


<keras.src.callbacks.history.History at 0x1f14fedf050>

In [7]:
# Evaluate robustness of hardened model
adv_accuracy = adv_model.evaluate(x_test, y_test, verbose=0)[1]
print(f"Adversarially trained model accuracy on clean test set: {adv_accuracy:.4f}")


Adversarially trained model accuracy on clean test set: 0.9636
