# Adversarial Attack variants for siamese neural networks

This notebook contains functions that generate adversarial examples specifically for attacking the image pairs fed to siamese neural networks. These variants are based on code from CleverHans, a library of popular adversarial attacks and defences: https://github.com/cleverhans-lab/cleverhans.

In [None]:
%run "utils_imports.ipynb"

In [None]:
"""The MomentumIterativeMethod attack."""

def mim_siamese(
    model_fn,
    x,
    eps=0.3,
    eps_iter=0.06,
    nb_iter=10,
    norm=np.inf,
    clip_min=None,
    clip_max=None,
    y=None,
    targeted=False,
    decay_factor=1.0,
    sanity_checks=True,
    multi=True,
    loss_fn=None
):
    """
    Momentum Iterative Method (Dong et al. 2017) adapted to siamese input pairs.

    Paper link: https://arxiv.org/pdf/1710.06081.pdf

    The input is a 5-D batch of pairs whose axis 1 indexes the two members of
    each pair. Internally axes 0 and 1 are swapped so that the pair index comes
    first, and the adversarial tensor is returned in that swapped layout (use
    `process_adversarial_output` to swap the axes back).

    :param model_fn: a callable that takes the pair `[x1, x2]` and returns the model output.
    :param x: input pairs; 5-D array or tensor with the pair index on axis 1.
    :param eps: (optional float) maximum distortion of adversarial example
              compared to original input
    :param eps_iter: (optional float) step size for each attack iteration
    :param nb_iter: (optional int) Number of attack iterations.
    :param norm: (optional) Order of the norm (mimics Numpy).
              Accepted values: np.inf or 2 (1 raises NotImplementedError).
    :param clip_min: (optional float) Minimum input component value
    :param clip_max: (optional float) Maximum input component value.
              Clipping is only applied when both bounds are given.
    :param y: (optional) Tensor with true labels. If targeted is true, then provide the
              target label. Otherwise, only provide this parameter if you'd like to use true
              labels when crafting adversarial samples. Otherwise, model predictions are used
              as labels to avoid the "label leaking" effect (explained in this paper:
              https://arxiv.org/abs/1611.01236). Default is None.
    :param targeted: (optional) bool. Is the attack targeted or untargeted?
              Untargeted, the default, will try to make the label incorrect.
              Targeted will instead try to move in the direction of being more like y.
    :param decay_factor: (optional) Decay factor for the momentum term.
    :param sanity_checks: bool, if True, include asserts (Turn them off to use less runtime /
              memory or for unit tests that intentionally pass strange input)
    :param multi: (optional) bool. Currently unused by this function: the perturbation
              is always added to both inputs of each pair.
    :param loss_fn: (optional) callable. loss function that takes (labels, logits) as arguments and returns loss.
                    default function is 'tf.nn.sparse_softmax_cross_entropy_with_logits'
    :return: a float32 tensor of adversarial pairs, pair index on axis 0.
    :raises NotImplementedError: if norm is 1.
    :raises ValueError: if norm is not np.inf, 1 or 2.
    """

    if norm == 1:
        raise NotImplementedError(
            "This attack hasn't been tested for norm=1."
            "It's not clear that FGM makes a good inner "
            "loop step for iterative optimization since "
            "it updates just one coordinate at a time."
        )

    # Check if order of the norm is acceptable given current implementation
    if norm not in [np.inf, 1, 2]:
        raise ValueError("Norm order must be either np.inf, 1, or 2.")

    # Honour the documented default instead of handing None to compute_gradient.
    if loss_fn is None:
        loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits

    asserts = []

    # If a data range was specified, check that the input was in that range
    # (element-wise, so it does not matter that x is still in its input layout).
    if clip_min is not None:
        asserts.append(tf.math.greater_equal(x, clip_min))

    if clip_max is not None:
        asserts.append(tf.math.less_equal(x, clip_max))

    # Swap the batch and pair axes so that x[0] / x[1] are the two pair members.
    x = np.transpose(x, (1, 0, 2, 3, 4))
    x = tf.cast(x, tf.float32)

    if y is None:
        # Using model predictions as ground truth to avoid label leaking.
        # The model is fed the pair, exactly as compute_gradient feeds it below.
        y = tf.argmax(model_fn([x[0], x[1]]), 1)

    # cast the y labels as a float tensor
    y = tf.cast(y, tf.float32)

    # Initialize loop variables
    momentum = tf.zeros_like(x)
    adv_x = x

    for _ in range(nb_iter):
        # Gradient of the loss w.r.t. both members of the current adversarial pair.
        grad = compute_gradient(
            model_fn, loss_fn, [adv_x[0], adv_x[1]], y, targeted
        )
        grad = tf.cast(grad, tf.float32)

        # Normalize current gradient and add it to the accumulated gradient.
        # NOTE(review): the mean is taken over every axis except axis 0. If grad
        # shares adv_x's layout, axis 0 is the pair index and the batch axis is
        # averaged over too — TODO confirm that this is intended.
        red_ind = list(range(1, len(grad.shape)))
        avoid_zero_div = tf.cast(1e-12, grad.dtype)
        grad = grad / tf.math.maximum(
            avoid_zero_div,
            tf.math.reduce_mean(tf.math.abs(grad), red_ind, keepdims=True),
        )
        momentum = decay_factor * momentum + grad

        optimal_perturbation = optimize_linear(momentum, eps_iter, norm)
        # Step both images of every pair, then clip the total perturbation
        # back to the eps budget around the clean input.
        adv_x = adv_x + optimal_perturbation
        adv_x = x + clip_eta(adv_x - x, norm, eps)

        if clip_min is not None and clip_max is not None:
            adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        assert np.all(asserts)

    return adv_x

In [None]:
"""The Projected Gradient Descent attack or the Basic Iterative Method attack, depending on whether rand_init is true or false"""

def pgd_siamese(
    model_fn,
    x,
    eps,
    eps_iter,
    nb_iter,
    norm,
    loss_fn=None,
    clip_min=None,
    clip_max=None,
    y=None,
    targeted=False,
    rand_init=None,
    rand_minmax=None,
    sanity_checks=False,
    multi=True,
):
    """
    Iterated `fgsm_siamese` steps projected back onto the eps ball, for input pairs.

    With a falsy rand_init the iteration starts at the clean input (the Basic
    Iterative Method, Kurakin et al. 2016); with a truthy rand_init it starts
    from a random offset (the Madry et al. 2017 method).
    Paper link (Kurakin et al. 2016): https://arxiv.org/pdf/1607.02533.pdf
    Paper link (Madry et al. 2017): https://arxiv.org/pdf/1706.06083.pdf

    The input is a 5-D batch of pairs whose axis 1 indexes the two members of
    each pair. Internally axes 0 and 1 are swapped so that the pair index comes
    first, and the adversarial tensor is returned in that swapped layout (use
    `process_adversarial_output` to swap the axes back).

    :param model_fn: a callable that takes the pair `[x1, x2]` and returns the model output.
    :param x: input pairs; 5-D array or tensor with the pair index on axis 1.
    :param eps: epsilon (input variation parameter); see https://arxiv.org/abs/1412.6572.
    :param eps_iter: step size for each attack iteration; must not exceed eps.
    :param nb_iter: Number of attack iterations.
    :param norm: Order of the norm (mimics NumPy). Accepted values: np.inf or 2.
    :param loss_fn: (optional) callable. loss function that takes (labels, logits) as arguments and returns loss.
                    default function is 'tf.nn.sparse_softmax_cross_entropy_with_logits'
    :param clip_min: (optional) float. Minimum float value for adversarial example components.
    :param clip_max: (optional) float. Maximum float value for adversarial example components.
    :param y: (optional) Tensor with true labels. If targeted is true, then provide the
              target label. Otherwise, only provide this parameter if you'd like to use true
              labels when crafting adversarial samples. Otherwise, model predictions are used
              as labels to avoid the "label leaking" effect (explained in this paper:
              https://arxiv.org/abs/1611.01236). Default is None.
    :param targeted: (optional) bool. Is the attack targeted or untargeted?
              Untargeted, the default, will try to make the label incorrect.
              Targeted will instead try to move in the direction of being more like y.
    :param rand_init: (optional) flag. If truthy, the starting offset is drawn with
                        `random_lp_vector` using radius rand_minmax; otherwise
                        the attack starts from x itself.
    :param rand_minmax: (optional) float. Size of the norm ball from which
                        the initial starting point is chosen. Defaults to eps
    :param sanity_checks: bool, if True, include asserts (Turn them off to use less runtime /
              memory or for unit tests that intentionally pass strange input)
    :param multi: (optional) bool, forwarded to `fgsm_siamese`: if False, only the first
              input of each pair is perturbed at every step.
    :return: a float32 tensor of adversarial pairs, pair index on axis 0.
    :raises NotImplementedError: if norm is 1.
    :raises ValueError: if norm is neither np.inf nor 2.
    """
    assert eps_iter <= eps, (eps_iter, eps)
    if norm == 1:
        raise NotImplementedError(
            "It's not clear that FGM is a good inner loop"
            " step for PGD when norm=1, because norm=1 FGM "
            " changes only one pixel at a time. We need "
            " to rigorously test a strong norm=1 PGD "
            "before enabling this feature."
        )
    if norm not in [np.inf, 2]:
        raise ValueError("Norm order must be either np.inf or 2.")

    if loss_fn is None:
        loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits

    # Swap the batch and pair axes so that x[0] / x[1] are the two pair members.
    x = np.transpose(x, (1, 0, 2, 3, 4))
    x = tf.cast(x, tf.float32)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(tf.math.greater_equal(x, clip_min))

    if clip_max is not None:
        asserts.append(tf.math.less_equal(x, clip_max))

    # Initialize loop variables
    if rand_minmax is None:
        rand_minmax = eps

    if rand_init:
        eta = random_lp_vector(
            tf.shape(x), norm, tf.cast(rand_minmax, x.dtype), dtype=x.dtype
        )
    else:
        eta = tf.zeros_like(x)

    # Clip eta
    eta = clip_eta(eta, norm, eps)
    adv_x = x + eta
    if clip_min is not None or clip_max is not None:
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    if y is None:
        # Using model predictions as ground truth to avoid label leaking.
        # The model is fed the clean pair, the same way fgsm_siamese feeds it.
        y = tf.argmax(model_fn([x[0], x[1]]), 1)

    for _ in range(nb_iter):
        # processed=True: adv_x is already in the pair-first layout.
        adv_x = fgsm_siamese(
            model_fn,
            adv_x,
            eps_iter,
            norm,
            loss_fn,
            clip_min=clip_min,
            clip_max=clip_max,
            y=y,
            targeted=targeted,
            multi=multi,
            processed=True
        )

        # Clipping perturbation eta to norm norm ball
        eta = clip_eta(adv_x - x, norm, eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if clip_min is not None or clip_max is not None:
            adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    asserts.append(eps_iter <= eps)
    if norm == np.inf and clip_min is not None and clip_max is not None:
        # The eps ball must fit inside the allowed data range.
        asserts.append(eps + clip_min <= clip_max)

    if sanity_checks:
        assert np.all(asserts)
    return adv_x

In [None]:
"""The Fast Gradient Sign Method attack"""

def fgsm_siamese(
    model_fn,
    x,
    eps,
    norm,
    loss_fn=None,
    clip_min=None,
    clip_max=None,
    y=None,
    targeted=False,
    sanity_checks=False,
    multi=True,
    processed=False
):
    """
    Tensorflow 2.0 implementation of the Fast Gradient Method. This version of the attack targets pairs of inputs.

    The adversarial tensor is returned with the pair index on axis 0 (use
    `process_adversarial_output` to swap the axes back to the input layout).

    :param model_fn: model taking the pair `[x1, x2]`. When y is None its `predict`
              method is used to infer labels.
    :param x: input pairs as a 5-D array or tensor. The pair index is on axis 1,
              or on axis 0 when processed is True.
    :param eps: epsilon (input variation parameter); see https://arxiv.org/abs/1412.6572.
    :param norm: Order of the norm (mimics NumPy). Possible values: np.inf, 1 or 2.
    :param loss_fn: (optional) callable. Loss function that takes (labels, logits) as arguments and returns loss.
                    default function is 'tf.nn.sparse_softmax_cross_entropy_with_logits'
    :param clip_min: (optional) float. Minimum float value for adversarial example components.
    :param clip_max: (optional) float. Maximum float value for adversarial example components.
              One-sided clipping is not supported: give both bounds or neither.
    :param y: (optional) Tensor with true labels. If targeted is true, then provide the
              target label. Otherwise, only provide this parameter if you'd like to use true
              labels when crafting adversarial samples. Otherwise, model predictions are used
              as labels to avoid the "label leaking" effect (explained in this paper:
              https://arxiv.org/abs/1611.01236). Default is None.
    :param targeted: (optional) bool. Is the attack targeted or untargeted?
              Untargeted, the default, will try to make the label incorrect.
              Targeted will instead try to move in the direction of being more like y.
    :param sanity_checks: bool, if True, include asserts (Turn them off to use less runtime /
              memory or for unit tests that intentionally pass strange input)
    :param multi: (optional) bool, if False, apply perturbations only to a single input of the input pair.
              Otherwise, apply perturbations to both inputs.
    :param processed: (optional) bool. True when x already has the pair index on axis 0
              (as `pgd_siamese` passes it), so no axis swap is performed.
    :return: a float32 tensor of adversarial pairs, pair index on axis 0.
    :raises ValueError: if norm is not np.inf, 1 or 2.
    """
    if norm not in [np.inf, 1, 2]:
        raise ValueError("Norm order must be either np.inf, 1, or 2.")

    if loss_fn is None:
        loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits

    asserts = []

    # If a data range was specified, check that the input was in that range
    # (element-wise, so the layout of x does not matter here).
    if clip_min is not None:
        asserts.append(tf.math.greater_equal(x, clip_min))

    if clip_max is not None:
        asserts.append(tf.math.less_equal(x, clip_max))

    # Bring x into the pair-first layout, then split it. Splitting only after
    # the layout is settled keeps a single-pair batch from being indexed out of
    # bounds when the caller already supplies the pair-first layout.
    if not processed:
        x = np.transpose(x, (1, 0, 2, 3, 4))
    x = tf.cast(x, tf.float32)
    x1, x2 = x[0], x[1]

    if y is None:
        # Infer labels from the model: round the predictions, clamp to [0, 1],
        # and take the arg-max along axis 1.
        y = np.around(model_fn.predict([x1, x2]))
        y = np.clip(y, 0., 1.)
        y = tf.argmax(y, 1)

    # y must be cast as float32
    y = tf.cast(y, tf.float32)

    grad = compute_gradient(model_fn, loss_fn, [x1, x2], y, targeted)

    grad = tf.cast(grad, tf.float32)
    optimal_perturbation = optimize_linear(grad, eps, norm)

    # Add perturbation to original example to obtain adversarial example
    if multi:
        # apply perturbations to both inputs in the pair
        adv_x = x + optimal_perturbation
    else:
        # Experimental: perturb only the first input and keep the second as is.
        adv_x = x1 + optimal_perturbation
        adv_x = [adv_x[0], x2]
        adv_x = tf.cast(adv_x, tf.float32)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        assert np.all(asserts)

    return adv_x

def process_adversarial_output(adv_x):
    """
    After running our siamese attack variants, we have to transpose the data to return it to its original dimensions
    so that it is still accepted by the model it is intended for.

    Params:
        adv_x: 5-D batch of adversarial examples. May be a tensor exposing
            `.numpy()`, a NumPy array, or a nested list.
    Returns:
        np_array: adv_x with its first two axes swapped.
    """
    if hasattr(adv_x, "numpy"):
        # Tensor input: materialise it as a NumPy array first.
        adv_x = adv_x.numpy()
    else:
        # Lists and arrays have no .numpy(); convert them directly.
        adv_x = np.asarray(adv_x)
    return np.transpose(adv_x, (1, 0, 2, 3, 4))