In [None]:
def project_L0_box(y, k, lb, ub):
    """Project the batch y onto the set of k-sparse, box-constrained batches.

    The result x satisfies:
        - each image of the batch x has at most k pixels with non-zero
          channels (see the note on ties below)
        - lb <= x <= ub

    Args:
        y: array whose first axis is the batch and whose last axis holds the
            channels of one pixel; all axes in between are flattened into
            "pixels".
        k: maximum number of pixels allowed to stay non-zero per image.
            k <= 0 zeroes every pixel; k greater than or equal to the number
            of pixels only applies the box constraint.
        lb, ub: lower / upper bounds, broadcastable against y.

    Returns:
        A new array with the shape of y; y itself is not modified.

    Note:
        Pixels are selected with a threshold (the k-th largest score), so
        pixels whose score ties with that threshold are all kept and the
        count may exceed k in that case.
    """
    y = np.asarray(y)

    # Squared norm of each pixel: the cost of setting that pixel to zero.
    sq_norm = np.sum(y ** 2, axis=-1)
    # Squared distance of each pixel to the box: the cost of keeping it
    # (after clipping).  Only components outside [lb, ub] contribute.
    outside = np.minimum(np.minimum(ub - y, y - lb), 0)
    sq_dist = np.sum(outside ** 2, axis=-1)
    # Score of a pixel = how much is saved by keeping it rather than zeroing it.
    gain = sq_norm - sq_dist

    clipped = np.clip(y, lb, ub)

    n_pixels = int(np.prod(gain.shape[1:]))
    if k <= 0:
        # Indexing with -k would wrap to index 0 and keep everything.
        return np.zeros_like(clipped)
    if k >= n_pixels:
        # Sparsity constraint is vacuous; only the box constraint applies.
        return clipped

    flat_gain = gain.reshape(gain.shape[0], n_pixels)
    # k-th largest score of every image.
    threshold = np.sort(flat_gain, axis=1)[:, -k]
    keep = (flat_gain >= threshold[:, np.newaxis]).reshape(gain.shape)

    return clipped * np.expand_dims(keep, -1)
  
def perturb_L0_box(attack, x_nat, y_nat, lb, ub, sess):
    """PGD attack wrt L0-norm + box constraints.

    It returns adversarial examples (if found) adv for the images x_nat, with
    correct labels y_nat, such that:
      - each image of the batch adv differs from the corresponding one of
        x_nat in at most attack.k pixels
      - lb <= adv - x_nat <= ub

    Args:
        attack: object providing `rs`, `num_steps`, `step_size`, `k` and
            `model` (with `x_input`, `y_input`, `correct_prediction`, `grad`).
        x_nat: batch of clean images; the gradient normalisation below sums
            over axes (1, 2, 3), so a 4-D batch is expected.
        y_nat: labels of x_nat.
        lb, ub: bounds on the perturbation adv - x_nat.
        sess: session used to evaluate the model tensors via `sess.run`.

    Returns:
        (adv, adv_not_found): adv holds the most recent misclassified iterate
        of every image; adv_not_found is a vector of flags where 1 means no
        adversarial example was found (in this case the original image is
        returned in adv).
    """
    model = attack.model

    if attack.rs:
        # Random starting point inside the perturbation box, clipped to [0, 1].
        x2 = x_nat + np.random.uniform(lb, ub, x_nat.shape)
        x2 = np.clip(x2, 0, 1)
    else:
        x2 = np.copy(x_nat)

    adv_not_found = np.ones(y_nat.shape)
    # Start from the clean images so that entries for which no adversarial
    # example is found really contain the original image, as documented.
    adv = np.array(x_nat, dtype=float)

    for i in range(attack.num_steps):
        if i > 0:
            pred, grad = sess.run(
                [model.correct_prediction, model.grad],
                feed_dict={model.x_input: x2, model.y_input: y_nat})
            fooled = np.logical_not(pred)
            adv_not_found = np.minimum(adv_not_found, pred.astype(int))
            adv[fooled] = np.copy(x2[fooled])

            # Normalise the gradient of every image by its L1 norm.
            grad /= (1e-10 + np.sum(np.abs(grad), axis=(1, 2, 3), keepdims=True))
            # The tiny random term presumably breaks ties between pixels in
            # the projection's top-k selection.
            x2 = np.add(
                x2,
                (np.random.random_sample(grad.shape) - 0.5) * 1e-12 + attack.step_size * grad,
                casting='unsafe')

        # Project the perturbation back onto the k-sparse box.
        x2 = x_nat + project_L0_box(x2 - x_nat, attack.k, lb, ub)

    if attack.num_steps > 0:
        # The iterate produced by the last step has not been checked yet;
        # without this evaluation the final gradient step would be wasted.
        pred = sess.run(
            model.correct_prediction,
            feed_dict={model.x_input: x2, model.y_input: y_nat})
        fooled = np.logical_not(pred)
        adv_not_found = np.minimum(adv_not_found, pred.astype(int))
        adv[fooled] = np.copy(x2[fooled])

    return adv, adv_not_found


class PGDattack():
    """PGD attack with random restarts; only the 'L0' variant is implemented.

    Args:
        model: object exposing `x_input`, `y_input` and `correct_prediction`
            (plus whatever `perturb_L0_box` reads from it).
        args: dict with the keys 'num_steps', 'step_size', 'n_restarts' and
            'sparsity'.
    """

    def __init__(self, model, args):
        self.model = model
        self.type_attack = 'L0'
        self.num_steps = args['num_steps']     # number of iterations of gradient descent for each restart
        self.step_size = args['step_size']     # alpha
        self.n_restarts = args['n_restarts']   # number of random restarts to perform
        self.rs = True                         # random starting point
        self.k = args['sparsity']              # maximum number of pixels that can be modified (k_max in the paper)

    def perturb(self, x_nat, y_nat, sess):
        """Run the attack on a batch and print summary statistics.

        Args:
            x_nat: batch of clean images; the bounds passed to the attack are
                -x_nat and 1 - x_nat, i.e. adversarial images stay in [0, 1].
            y_nat: labels of x_nat.
            sess: session used to evaluate the model via `sess.run`.

        Returns:
            (adv, pgd_adv_acc): adv is x_nat with every image for which a
            restart found an adversarial example replaced by that example;
            pgd_adv_acc is the element-wise minimum of the clean prediction
            correctness and the "not found" flags of all restarts.

        Raises:
            ValueError: if `self.type_attack` is not a supported attack type.
        """
        # Fail early instead of hitting an unbound variable inside the loop.
        if self.type_attack != 'L0':
            raise ValueError("Unsupported attack type: {!r}".format(self.type_attack))

        adv = np.copy(x_nat)

        # Baseline: images that are already misclassified count as non-robust.
        # Computed before the loop so the result is defined even when
        # n_restarts is 0.
        corr_pred = sess.run(self.model.correct_prediction, {self.model.x_input: x_nat, self.model.y_input: y_nat})
        pgd_adv_acc = np.copy(corr_pred)

        for counter in range(self.n_restarts):
            x_batch_adv, curr_pgd_adv_acc = perturb_L0_box(self, x_nat, y_nat, -x_nat, 1.0 - x_nat, sess)

            pgd_adv_acc = np.minimum(pgd_adv_acc, curr_pgd_adv_acc)

            print("Restart {} - Robust accuracy: {}".format(counter + 1, np.sum(pgd_adv_acc)/x_nat.shape[0]))
            # Keep the adversarial examples found in this restart.
            fooled = np.logical_not(curr_pgd_adv_acc)
            adv[fooled] = x_batch_adv[fooled]

        # A pixel counts as changed if any of its channels moved by more than 1e-10.
        pixels_changed = np.sum(np.amax(np.abs(adv - x_nat) > 1e-10, axis=-1), axis=(1,2))
        print('Pixels changed: ', pixels_changed)
        corr_pred = sess.run(self.model.correct_prediction, {self.model.x_input: adv, self.model.y_input: y_nat})
        print('Robust accuracy at {} pixels: {:.2f}%'.format(self.k, np.sum(corr_pred)/x_nat.shape[0]*100.0))
        print('Maximum perturbation size: {:.5f}'.format(np.amax(np.abs(adv - x_nat))))

        return adv, pgd_adv_acc


In [1]:
import numpy as np