[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hendersonneurolab/CogAI_Fall2025/blob/master/Lab05_Adversarial_Images.ipynb)

## Week 5: Adversarial image synthesis

In this tutorial, we'll generate adversarial examples for deep neural networks (DNNs). These are images that look nearly the same as the original image to a human, but are classified differently by the DNN. We will use a simple implementation of the iterative fast gradient sign method (iFGSM).

**Learning objectives:**
- Understand how gradient descent in pixel space can be used to generate adversarial images.
- Know the difference between targeted and untargeted attacks.
- Understand how hyperparameters can alter the outcome of adversarial image synthesis.



In [None]:
import os
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Use the GPU (CUDA) when one is available - the forward/backward passes
# below run much faster there.
# If it says "cpu", use the menu at top right to select: "change runtime type"
# Then choose: T4 GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')


**Step 1: Setup, loading images and models.**

In [None]:
# Helper function for image downloads
def download_image(url, filepath):
    """Download the image at `url` and save it to `filepath`.

    The response is validated *before* anything is written, so a failed or
    bogus download never leaves a junk file behind (or overwrites a good one
    from an earlier run).

    Args:
        url: Address of the image to fetch.
        filepath: Destination path for the downloaded bytes.

    Returns:
        True if the image was saved; False if the request failed, the
        response was not an image, or the payload was suspiciously small.
    """
    # Send a browser-like User-Agent (presumably some hosts refuse the
    # default `requests` one - the plain call was tried and abandoned).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False

    # Verify it's an image (media types are case-insensitive)
    content_type = response.headers.get('content-type', '')
    if not content_type.lower().startswith('image/'):
        print(f"Warning: Content-Type is {content_type}, not an image")
        return False

    # Verify payload size: very small responses are likely error pages
    if len(response.content) < 100:
        print("Warning: Downloaded file is suspiciously small")
        return False

    with open(filepath, 'wb') as f:
        f.write(response.content)

    return True

In [None]:
# Mount Google Drive at /content/drive (if not already mounted)
from google.colab import drive
drive.mount('/content/drive')

# Work from the Colab Notebooks folder, and make sure CogAI/images exists
# inside it (makedirs also creates the intermediate CogAI folder).
colab_notebooks_path = '/content/drive/MyDrive/Colab Notebooks/'
os.chdir(colab_notebooks_path)

images_folder = os.path.join(colab_notebooks_path, 'CogAI', 'images')
os.makedirs(images_folder, exist_ok=True)
print(images_folder)

In [None]:
# Images to download, and the filename each one is saved under.
# These are just images from the internet - you can use your own too.
image_urls = [
    "https://cdn.britannica.com/20/194520-050-DCAE62F1/New-World-Sylvilagus-cottontail-rabbits.jpg",
]
names = ['rabbit.jpg']

# Save every URL into the images folder under its paired name
for url, name in zip(image_urls, names):
    file_path = os.path.join(images_folder, name)
    print(file_path)

    success = download_image(url, file_path)
    print('Success = %s' % success)


Creating loading and transform functions for the images.

In [None]:
# Side length (in pixels) of the square image fed to the network
imsize = 224

# Per-channel mean/std used to normalize the image; the same values are
# reused later to undo the normalization and to bound valid pixel values.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Preprocessing pipeline, applied in order:
#   resize -> center crop to imsize x imsize -> tensor -> normalize
loader = transforms.Compose([
    transforms.Resize(imsize),
    transforms.CenterCrop(imsize),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])


def image_loader(image_name):
    """Load an image file as a preprocessed, batched tensor on `device`.

    Args:
        image_name: Path of the image file to open.

    Returns:
        A float tensor with a leading batch dimension of 1, produced by
        running the image through `loader` and moving it to `device`.
    """
    # The context manager closes the file handle once the pixels are read.
    with Image.open(image_name) as img:
        # Force 3-channel RGB: grayscale, palette, or RGBA files would
        # otherwise break the 3-channel Normalize step inside `loader`.
        image = loader(img.convert('RGB'))
    # fake batch dimension required to fit network's input dimensions
    image = image.unsqueeze(0)
    return image.to(device, torch.float)

image_path = os.path.join(images_folder, "rabbit.jpg")
print('Orig image: %s'%image_path)

orig_img = image_loader(image_path)


Parameters for the normalization: we'll use these during synthesis to set image range boundaries.

In [None]:
# A raw pixel value p becomes (p - mean) / std after normalization, so the
# valid raw range [0, 1] maps to a different interval for each channel.
normalized_min = [(0.0 - m) / s for m, s in zip(mean, std)]  # normalized value of raw 0
normalized_max = [(1.0 - m) / s for m, s in zip(mean, std)]  # normalized value of raw 1

# Shaped (1, 3, 1, 1) so the bounds broadcast over a batched 3-channel image
min_tensor = torch.tensor(normalized_min, device=device).view(1, 3, 1, 1)
max_tensor = torch.tensor(normalized_max, device=device).view(1, 3, 1, 1)

Plot the image, verify it looks right.

In [None]:
# Inverse of the loader: Normalize(mean=-m/s, std=1/s) computes x * s + m,
# which undoes the original (x - m) / s; the result is converted to a PIL image.
unloader = transforms.Compose([
    transforms.Normalize(
        mean=[-m / s for m, s in zip(mean, std)],
        std=[1 / s for s in std],
    ),
    transforms.ToPILImage(),
])

# Drop the batch dimension before converting back to an image
orig_img_pil = unloader(orig_img[0])

plt.figure()
plt.imshow(orig_img_pil)
plt.title('My Original Image')

Load pre-trained models.

In [None]:
# These are CNNs - you can try anything from this page: https://docs.pytorch.org/vision/stable/models
# `weights="IMAGENET1K_V1"` selects the same ImageNet checkpoint that the
# deprecated `pretrained=True` flag used to load.
# .eval() puts each model in inference mode; .to(device) moves it to the GPU/CPU.
resnet18 = torchvision.models.resnet18(weights="IMAGENET1K_V1").eval().to(device)
vgg16 = torchvision.models.vgg16(weights="IMAGENET1K_V1").eval().to(device)
densenet = torchvision.models.densenet121(weights="IMAGENET1K_V1").eval().to(device)


In [None]:
# Loading info about the ImageNet object categories here.
# Because this model was trained on ImageNet, it outputs one score for each of
# 1000 object categories; entry i of `labels` is the name of output index i
# (indices run 0-999).
url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
# Use a timeout so the cell cannot hang forever, and fail loudly on an HTTP
# error instead of silently treating an error page as the label list.
response = requests.get(url, timeout=10)
response.raise_for_status()
labels = response.text.strip().splitlines()
labels = np.array(labels)
labels.shape


Get predictions on the real image, with a standard network. The network should get this right!

In [None]:
model = resnet18

# Inference only, so no gradients need to be tracked here
with torch.no_grad():
    preds = model(orig_img)                # logits (pre-softmax)
    true_probs = F.softmax(preds, dim=1)   # probabilities over categories
    true_probs = np.squeeze(true_probs.detach().cpu().numpy())

# Indices of the five most probable categories, most probable first
top_5_inds = np.argsort(true_probs)[::-1][:5]
print('Top 5 labels:')
print(labels[top_5_inds])
print('Probability:')
print(np.round(true_probs[top_5_inds], 2))

# The model's top prediction on the clean image is treated as the "true" label
true_categ_ind = top_5_inds[0]

print('\nIndex of true label:')
print(true_categ_ind)

**Step 2: Run an untargeted attack**

Here, we're going to disrupt the network's predictions by reducing its probability of predicting the correct category.


In [None]:
alpha = 0.20 # size of each perturbation step
epsilon = 1.0 # the maximum we're allowed to diverge from the original pixels
# note: alpha and epsilon are in normalized-pixel units (the image was
# mean/std normalized by `loader`), not raw 0-1 intensities.

n_iters = 10

adv_image = orig_img.clone()
model = resnet18

true_label = torch.tensor([true_categ_ind]).to(device)

with torch.no_grad():
    adv_output = model(adv_image)
    probs = F.softmax(adv_output, dim=1)
    adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

print('Before perturbation: %s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]) )

# History for the plots below: the image at each step, and the gradient sign
# that was applied to reach it (step 0 has no perturbation, hence zeros).
# Stored copies are detached so they do not hold on to autograd graphs.
ims = [adv_image.detach().clone()]
perts = [torch.zeros_like(adv_image)]

for ii in range(n_iters):

    # Start each step from a fresh leaf tensor that tracks pixel gradients
    adv_image = adv_image.detach().requires_grad_()

    # Forward pass (getting logits)
    output = model(adv_image)

    # Loss: negative log-probability of the "true" category (cross-entropy).
    # log_softmax is used instead of log(softmax(...)) because it stays finite
    # when P(correct) becomes tiny; the naive form can give inf/NaN gradients.
    loss = - F.log_softmax(output, dim=1)[0, true_label]
    # loss = F.cross_entropy(output, true_label) # also equivalent

    # Backward pass
    model.zero_grad()
    loss.backward()

    # Step *up* the loss gradient: raising this loss lowers P(correct).
    sign_data_grad = adv_image.grad.sign() # sign: which direction to perturb
    adv_image = adv_image.detach() + alpha * sign_data_grad

    # we clamp the pixels here: keeping perturbations in a small range.
    adv_image = torch.clamp(adv_image, orig_img - epsilon, orig_img + epsilon)

    # clamp to normal image range
    adv_image = torch.clamp(adv_image, min_tensor, max_tensor)

    # Check what the model predicts for the updated image
    with torch.no_grad():
        adv_output = model(adv_image)
        probs = F.softmax(adv_output, dim=1)
        adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

    print('Iteration %d: %s, %.2f prob'%(ii, labels[adv_ind], probs[0,adv_ind]) )

    ims += [adv_image.clone()]
    perts += [sign_data_grad.clone()]


# Recomputed after the loop so the result is also defined when n_iters is 0
with torch.no_grad():
    adv_output = model(adv_image)
    probs = F.softmax(adv_output, dim=1)
    adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

print('Final result: %s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]) )



View the transformations over time:

In [None]:
# Show the stored image for every step (index 0 is the starting image) on a
# roughly square grid of subplots.
n_plots = int(np.ceil(np.sqrt(n_iters+1)))
plt.figure(figsize=(12, 12))

for ii in range(n_iters+1):
    # drop the batch dimension and undo the normalization for display
    adv_img_pil = unloader(ims[ii][0])

    ax = plt.subplot(n_plots, n_plots, ii+1)
    ax.imshow(adv_img_pil)
    ax.set_title('Step %d'%ii)
    ax.axis('off')

View the gradient sign over time (this is the perturbation pattern applied at each step).

In [None]:
# Show the perturbation direction stored for every step on a roughly square
# grid. The values pass through `unloader`, i.e. they are un-normalized and
# rendered as an image.
n_plots = int(np.ceil(np.sqrt(n_iters+1)))
plt.figure(figsize=(12, 12))

for ii in range(n_iters+1):
    p = perts[ii][0]
    p_pil = unloader(p)

    ax = plt.subplot(n_plots, n_plots, ii+1)
    ax.imshow(p_pil)
    ax.set_title('Step %d: gradient sign'%(ii,))
    ax.axis('off')

Let's see what the final image looks like.

In [None]:

# Side-by-side comparison of the original image and the final adversarial
# image, each titled with a label and the probability assigned to it.
plt.figure(figsize=(8,4))

# Left: original image with the label/probability from the clean prediction
plt.subplot(1,2,1)
orig_img_pil = unloader(orig_img[0])
plt.imshow(orig_img_pil)
plt.title('%s, %.2f prob'%(labels[true_categ_ind], true_probs[true_categ_ind]))

# Right: adversarial image with the model's current top prediction
plt.subplot(1,2,2)
adv_img_pil = unloader(adv_image[0])
plt.imshow(adv_img_pil)
plt.title('%s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]))



---
***Question 1:***

Now try re-running the above procedure, but modifying each of the following:
- alpha
- epsilon
- n_iters

What effects does each of these parameters have on the final result, and on the results over the course of optimization?


In [None]:
# [answer here]



---



**Step 3: Targeted attack.**

Next, let's try a targeted adversarial attack. In a targeted attack, we perturb the image so that the network classifies it as a specified target category.

You can specify any category in ImageNet here. Let's try "refrigerator" to start with...


In [None]:
target_categ_name = 'refrigerator'
# target_categ_name = 'speedboat'
# variable "labels" has all the possible target categories we can perturb toward
matches = np.where(labels == target_categ_name)[0]
# Fail with a clear message (rather than a bare IndexError) on a typo or a
# name that is not in the label list.
if matches.size == 0:
    raise ValueError(
        "'%s' is not in `labels`; pick a target category name from that array"
        % target_categ_name
    )
target_categ_ind = matches[0]
target_categ_ind

In [None]:
alpha = 0.20 # size of each perturbation step
epsilon = 1.0 # the maximum we're allowed to diverge from the original pixels
# note: alpha and epsilon are in normalized-pixel units (the image was
# mean/std normalized by `loader`), not raw 0-1 intensities.

n_iters = 10
# n_iters = 2

adv_image = orig_img.clone()
model = resnet18

target_label = torch.tensor([target_categ_ind]).to(device)

with torch.no_grad():
    adv_output = model(adv_image)
    probs = F.softmax(adv_output, dim=1)
    adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

print('Before perturbation: %s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]) )

# History for the plots below: the image at each step, and the gradient sign
# that was applied to reach it (step 0 has no perturbation, hence zeros).
# Stored copies are detached so they do not hold on to autograd graphs.
ims = [adv_image.detach().clone()]
perts = [torch.zeros_like(adv_image)]

for ii in range(n_iters):

    # Start each step from a fresh leaf tensor that tracks pixel gradients
    adv_image = adv_image.detach().requires_grad_()

    # Forward pass (getting logits)
    output = model(adv_image)

    # Objective: log-probability the network assigns to the target category.
    # log_softmax is used instead of log(softmax(...)) because it stays finite
    # when that probability is tiny; the naive form can give inf/NaN gradients.
    loss = F.log_softmax(output, dim=1)[0, target_label]
    # loss = -F.cross_entropy(output, target_label) # also equivalent

    # Backward pass
    model.zero_grad()
    loss.backward()

    # Step *up* the gradient of this objective: this raises P(target).
    sign_data_grad = adv_image.grad.sign() # sign: which direction to perturb
    adv_image = adv_image.detach() + alpha * sign_data_grad

    # we clamp the pixels here: keeping perturbations in a small range.
    adv_image = torch.clamp(adv_image, orig_img - epsilon, orig_img + epsilon)

    # clamp to normal image range
    adv_image = torch.clamp(adv_image, min_tensor, max_tensor)

    # Check what the model predicts for the updated image
    with torch.no_grad():
        adv_output = model(adv_image)
        probs = F.softmax(adv_output, dim=1)
        adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

    print('Iteration %d: %s, %.2f prob'%(ii, labels[adv_ind], probs[0,adv_ind]) )

    ims += [adv_image.clone()]
    perts += [sign_data_grad.clone()]


# Recomputed after the loop so the result is also defined when n_iters is 0
with torch.no_grad():
    adv_output = model(adv_image)
    probs = F.softmax(adv_output, dim=1)
    adv_ind = torch.argmax(probs, dim=1)[0] # which index is max pred

print('Final result: %s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]) )

In [None]:
# Show the stored image for every step (index 0 is the starting image) on a
# roughly square grid of subplots.
n_plots = int(np.ceil(np.sqrt(n_iters+1)))
plt.figure(figsize=(12, 12))

for ii in range(n_iters+1):
    # drop the batch dimension and undo the normalization for display
    adv_img_pil = unloader(ims[ii][0])

    ax = plt.subplot(n_plots, n_plots, ii+1)
    ax.imshow(adv_img_pil)
    ax.set_title('Step %d'%ii)
    ax.axis('off')

View the gradient sign over time (this is the perturbation pattern applied at each step).

In [None]:
# Show the perturbation direction stored for every step on a roughly square
# grid. The values pass through `unloader`, i.e. they are un-normalized and
# rendered as an image.
n_plots = int(np.ceil(np.sqrt(n_iters+1)))
plt.figure(figsize=(12, 12))

for ii in range(n_iters+1):
    p = perts[ii][0]
    p_pil = unloader(p)

    ax = plt.subplot(n_plots, n_plots, ii+1)
    ax.imshow(p_pil)
    ax.set_title('Step %d: gradient sign'%(ii,))
    ax.axis('off')

Let's see what the final image looks like:

In [None]:

# Side-by-side comparison of the original image and the final adversarial
# image, each titled with a label and the probability assigned to it.
plt.figure(figsize=(8,4))

# Left: original image with the label/probability from the clean prediction
plt.subplot(1,2,1)
orig_img_pil = unloader(orig_img[0])
plt.imshow(orig_img_pil)
plt.title('%s, %.2f prob'%(labels[true_categ_ind], true_probs[true_categ_ind]))

# Right: adversarial image with the model's current top prediction
plt.subplot(1,2,2)
adv_img_pil = unloader(adv_image[0])
plt.imshow(adv_img_pil)
plt.title('%s, %.2f prob'%(labels[adv_ind], probs[0,adv_ind]))



---
***Question 2:***

Run a new version of the targeted attack, but targeting toward a different category. Do the results look different, either in the image itself or the perturbations (gradient sign plots)?


In [None]:
# [answer here]



---
***Question 3:***

Now that you've generated adversarial images using ResNet-18, try testing your image using the other networks we loaded earlier (VGG-16 and DenseNet-121). What predictions do those networks give? Why do you think this happens?


In [None]:
# [answer here]



---
***Question 4:***

Now modify the code to generate images that are adversarial for VGG-16 and DenseNet instead of ResNet-18.

Does the outcome of the adversarial synthesis change at all?

What happens when you ask ResNet-18 to classify the images from those other networks?



In [None]:
# [answer here]



---

