# Spectral signatures defense

In this notebook we evaluate the effect of filtering the training set with the spectral signatures defense (Tran, Li and Madry, NeurIPS 2018): https://papers.nips.cc/paper/2018/file/280cf18baf4311c92aa5a042336587d3-Paper.pdf

Some code adapted from https://github.com/MadryLab/backdoor_data_poisoning/blob/master/compute_corr.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import gc
import sys
import random

In [3]:
# Move the working directory two levels up before the project-local imports
# that follow (presumably to the repository root — TODO confirm).
# NOTE(review): the path is relative, so re-running this cell moves up again.
os.chdir('../../')

In [4]:
import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.cluster import KMeans
from tensorflow.keras import backend as K
from sklearn.decomposition import FastICA
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score, classification_report

In [5]:
from attack_nlp import init_cluster_attack

from subclass_avail import common
from subclass_avail.target_nlp import bert_utils

In [6]:
# from transfer.top_target_training
def model_fn(dataset, size):
    """Build and compile a Keras image classifier.

    Args:
        dataset: ``'cifar'`` selects a 32x32x3 input with 10 classes; any
            other value selects a 100x100x3 input with 2 classes.
        size: With ``dataset == 'cifar'`` and ``size == 'small'`` a small
            convolutional network is built from scratch. Otherwise a VGG16
            backbone (no top, global average pooling) with a softmax Dense
            head is built; ``size == 'small'`` freezes every backbone layer
            and uses Adam with learning rate 1e-3, any other value leaves
            the backbone trainable and uses Adam with learning rate 1e-4.

    Returns:
        A compiled ``tf.keras`` model using categorical cross-entropy loss
        and the accuracy metric.
    """
    tf.compat.v1.reset_default_graph()
    if dataset == 'cifar':
        shape = (32, 32, 3)
        n_classes = 10
        if size == 'small':
            model = tf.keras.models.Sequential()
            scales = 3
            # Factor passed positionally: the keyword name differs between
            # Keras versions (`l` vs `l2`).
            reg = tf.keras.regularizers.l2(0.00)
            model.add(tf.keras.layers.InputLayer(shape))
            model.add(tf.keras.layers.Conv2D(32, (3, 3), padding='same',
                kernel_regularizer=reg))
            model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
            for scale in range(scales):
                # Filter counts double at every scale (32/64, 64/128, 128/256).
                model.add(tf.keras.layers.Conv2D(32 << scale, (3, 3), padding='same',
                    kernel_regularizer=reg))
                model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
                model.add(tf.keras.layers.Conv2D(64 << scale, (3, 3), padding='same',
                    kernel_regularizer=reg))
                model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
                model.add(tf.keras.layers.AveragePooling2D((2, 2)))
            model.add(tf.keras.layers.Conv2D(n_classes, (3, 3), padding='same',
                    kernel_regularizer=reg))
            model.add(tf.keras.layers.Flatten())
            model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

            #model.add(tf.keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=[1, 2])))
            #model.add(tf.keras.layers.Softmax())

            # `learning_rate` replaces the deprecated `lr` alias, which newer
            # Keras releases no longer accept.
            opt = tf.keras.optimizers.Adam(learning_rate=0.001)  # SGD(0.002, momentum=.5)
            model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

            return model
    else:
        shape = (100, 100, 3)
        n_classes = 2

    # Reached for every non-cifar dataset and for cifar with size != 'small'.
    vgg = tf.keras.applications.VGG16(include_top=False, input_shape=shape, pooling='avg')
    if size == 'small':
        opt = tf.keras.optimizers.Adam(0.001)
        # Freeze the backbone so only the Dense head added below is trained.
        for layer in vgg.layers:
            layer.trainable = False
    else:
        opt = tf.keras.optimizers.Adam(0.0001)  # SGD(0.01, momentum=.9)

    output = tf.keras.layers.Dense(n_classes, kernel_regularizer=tf.keras.regularizers.l2(0.01),
            activation='softmax')(vgg.output)
    model = tf.keras.models.Model(inputs=vgg.inputs[0], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

## Constants

In [7]:
# Root directory under which the per-run attack data and victim model are read.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
results_dir = '/net/data/malware-backdoor/subpop/victim_models/utk_small'

# NOTE(review): n_clus is not referenced in the visible cells; presumably the
# number of subpopulation clusters used when the attack was generated.
n_clus = 100
# Seed passed to the Python and NumPy RNGs.
seed = 42

# NOTE(review): pois_rate and size are not referenced in the visible cells;
# the run directory name and the model_fn call hard-code matching values.
pois_rate = 1
size = 'small'

In [8]:
# Seed Python's and NumPy's global RNGs; NumPy's is used for the shuffle index.
# NOTE(review): TensorFlow's RNG is not seeded here, so retraining results may
# still differ between runs.
random.seed(seed)
np.random.seed(seed)

## Attack data

In [9]:
victim_pop = 58
cl_ind = victim_pop

# Derive the run directory from the constants instead of repeating their
# literal values, so the path cannot drift from `cl_ind` / `pois_rate`.
# With the current values this is still 'clind58_rate1'.
pth = os.path.join(results_dir, 'clind{}_rate{}'.format(cl_ind, pois_rate))


def _load_split(name):
    """Load ``<name>_<cl_ind>.npy`` from the run directory ``pth``."""
    # NOTE(review): allow_pickle=True can execute arbitrary code when the file
    # is untrusted; only load arrays produced by this project.
    return np.load(os.path.join(pth, '{}_{}.npy'.format(name, cl_ind)), allow_pickle=True)


pois_x = _load_split('pois_x')
pois_y = _load_split('pois_y')

trn_x = _load_split('trn_x')
trn_y = _load_split('trn_y')

x_t = _load_split('x_t')
y_t = _load_split('y_t')

xt_p = _load_split('xt_p')
yt_p = _load_split('yt_p')

In [10]:
# Sanity check: the last pois_y.shape[0] training labels must equal the poison
# labels, i.e. the poison points sit at the end of the training set.
assert np.array_equal(trn_y[-pois_y.shape[0]:], pois_y)

In [11]:
# Integer class id per training sample (argmax over the last label axis).
trn_y_int = np.argmax(trn_y, axis=-1)

In [12]:
# 0/1 mask over the training set: 1 for the trailing pois_y.shape[0] samples
# (the poison points), 0 for everything else.
poison_idx = np.zeros_like(trn_y_int)
poison_idx[-pois_y.shape[0]:] = 1

In [13]:
# Number of training samples marked as poison.
sum(poison_idx)

40

In [14]:
# Split the poison mask by training label so it lines up with the per-class
# removal masks computed by the defense.
pois_idx0, pois_idx1 = (poison_idx[trn_y_int == label] for label in (0, 1))

# Poison count within each class.
for class_mask in (pois_idx0, pois_idx1):
    print(sum(class_mask))

40
0


## Load the attacked model

We can now load the attacked (victim) model for the selected subpopulation.

In [15]:
print('Loading victim model for subpopulation {}'.format(victim_pop))

# The saved model lives in the same run directory as the attack data.
victim_model_path = os.path.join(pth, 'victim_vgg_{}'.format(victim_pop))
victim_model = tf.keras.models.load_model(victim_model_path)

Loading victim model for subpopulation 58


In [16]:
# Victim model predictions on x_t.
pred = victim_model.predict(x_t)

In [17]:
# Per-class metrics of the victim model on (x_t, y_t).
print(classification_report(np.argmax(y_t, axis=-1), np.argmax(pred, axis=-1), digits=5))

              precision    recall  f1-score   support

           0    0.81636   0.90388   0.85789      3246
           1    0.87317   0.76496   0.81549      2808

    accuracy                        0.83944      6054
   macro avg    0.84477   0.83442   0.83669      6054
weighted avg    0.84271   0.83944   0.83823      6054



In [18]:
# Same report on (xt_p, yt_p) — presumably the test points of the targeted
# subpopulation; TODO confirm.
print(classification_report(np.argmax(yt_p, axis=-1), np.argmax(victim_model.predict(xt_p), axis=-1), digits=5))

              precision    recall  f1-score   support

           0    0.14815   1.00000   0.25806         4
           1    1.00000   0.20690   0.34286        29

    accuracy                        0.30303        33
   macro avg    0.57407   0.60345   0.30046        33
weighted avg    0.89675   0.30303   0.33258        33



In [19]:
# Index of the second-to-last layer; its output is used as the feature
# representation for the defense.
last_layer = len(victim_model.layers) - 2

In [20]:
# Display the selected layer index.
last_layer

19

## Defense

In [21]:
# Backend function mapping the model input to the output of layer `last_layer`.
layerout = K.function([victim_model.get_layer(index=0).input], victim_model.get_layer(index=last_layer).output)
# NOTE(review): the whole training set is passed in a single call (no
# batching), which may be memory-hungry for larger datasets.
repres_trn = layerout([trn_x])
print(repres_trn)

[[0.         0.         0.         ... 0.03899719 0.20827405 0.01483871]
 [0.26167235 0.         0.05851512 ... 0.09421486 0.2721647  0.        ]
 [0.02343429 0.         0.         ... 0.3397394  0.37460688 0.00310088]
 ...
 [0.555617   0.02917427 0.03418446 ... 0.30539954 0.5587245  0.        ]
 [0.         0.         0.         ... 0.13902754 0.19241796 0.        ]
 [0.         0.         0.         ... 0.6099808  0.2267428  0.00134307]]


In [22]:
# Drop the victim model and reset Keras state before retraining; the value
# returned by gc.collect() is what the cell displays.
del victim_model
tf.keras.backend.clear_session()
gc.collect()

2922

In [23]:
# (number of training samples, representation size)
repres_trn.shape

(7040, 512)

In [24]:
# Class ids over which the defense is run, one class at a time.
classes = [0, 1]

In [25]:
# One 0/1 removal mask per class, in the order of `classes`.
remove_lists = []

for cls in classes:
    print('CLASS', cls)

    # Flattened representations of the training samples labelled `cls`.
    repres = repres_trn[trn_y_int == cls]
    repres = repres.reshape(len(repres), -1)

    # Center the representations on the class mean.
    centered = repres - repres.mean(axis=0)

    # Top right singular vector of the centered matrix, kept as a (1, d) row.
    top_vec = np.linalg.svd(centered, full_matrices=False)[2][:1]

    # Projection of every sample onto that direction: (num_top, num_samples).
    corrs = top_vec @ centered.T
    print('corrs shape', corrs.shape)

    # Outlier score per sample: norm of its projection.
    scores = np.linalg.norm(corrs, axis=0)
    print('scores shape', scores.shape)

    # Samples scoring above the 85th percentile (top 15%) are discarded.
    score_percentile = np.percentile(scores, 85)
    print('score percentile shape', score_percentile.shape)
    print('score percentile', score_percentile)

    above = scores > score_percentile
    top_scores = np.flatnonzero(above)
    print('top scores shape', top_scores.shape)

    # Float bitmap over the class samples: 1 marks a sample to remove.
    to_remove = above.astype(np.float64)
    print('to remove shape', to_remove.shape)
    print('to remove sum', sum(to_remove))
    remove_lists.append(to_remove)

    # Free the per-class intermediates before the next iteration.
    del centered, top_vec, corrs, scores, score_percentile, above, top_scores
    

CLASS 0
corrs shape (1, 3748)
scores shape (3748,)
score percentile shape ()
score percentile 2.564788734912871
top scores shape (563,)
to remove shape (3748,)
to remove sum 563.0
CLASS 1
corrs shape (1, 3292)
scores shape (3292,)
score percentile shape ()
score percentile 1.9886060893535613
top scores shape (494,)
to remove shape (3292,)
to remove sum 494.0


In [26]:
rl0, rl1 = remove_lists[0], remove_lists[1]

# Count the samples that are both flagged for removal and actually poisoned,
# summed over the two classes.
found = sum(
    np.count_nonzero((flagged == 1) & (poisoned == 1))
    for flagged, poisoned in ((rl0, pois_idx0), (rl1, pois_idx1))
)

print(found)

18


## Evaluation

In [27]:
# Split the training set by label, in the same order as the removal masks.
is_cls0 = trn_y_int == 0
is_cls1 = trn_y_int == 1

def_trn_x0, def_trn_y0 = np.copy(trn_x[is_cls0]), np.copy(trn_y[is_cls0])
def_trn_x1, def_trn_y1 = np.copy(trn_x[is_cls1]), np.copy(trn_y[is_cls1])
print(def_trn_x0.shape, def_trn_y0.shape, def_trn_x1.shape, def_trn_y1.shape)

(3748, 100, 100, 3) (3748, 2) (3292, 100, 100, 3) (3292, 2)


In [28]:
# Keep only the class-0 samples that were not flagged by the defense.
keep0 = ~remove_lists[0].astype(bool)
def_trn_x0, def_trn_y0 = def_trn_x0[keep0], def_trn_y0[keep0]
print(def_trn_x0.shape, def_trn_y0.shape)

(3185, 100, 100, 3) (3185, 2)


In [29]:
# Keep only the class-1 samples that were not flagged by the defense.
keep1 = ~remove_lists[1].astype(bool)
def_trn_x1, def_trn_y1 = def_trn_x1[keep1], def_trn_y1[keep1]
print(def_trn_x1.shape, def_trn_y1.shape)

(2798, 100, 100, 3) (2798, 2)


In [30]:
# Stack the filtered classes back into a single training set (class 0 first).
def_trn_x = np.concatenate((def_trn_x0, def_trn_x1), axis=0)
def_trn_y = np.concatenate((def_trn_y0, def_trn_y1), axis=0)
print(def_trn_x.shape, def_trn_y.shape)

(5983, 100, 100, 3) (5983, 2)


In [31]:
# Random ordering of all filtered sample indices (drawn without replacement),
# used to shuffle the class-ordered training set.
shuffle_idx = np.random.choice(def_trn_x.shape[0], def_trn_x.shape[0], replace=False)
print(shuffle_idx.shape)

(5983,)


In [32]:
# Apply the same ordering to inputs and labels so the pairs stay aligned.
def_trn_x, def_trn_y = def_trn_x[shuffle_idx], def_trn_y[shuffle_idx]

In [33]:
# Integer ground-truth labels for the two evaluation sets.
amyt = np.argmax(y_t, axis=-1)
amytp = np.argmax(yt_p, axis=-1)

# Retraining settings, named once instead of being buried in the loop.
n_runs = 5
n_epochs = 12
batch_size = 32

# Accuracy on (xt_p, yt_p) for each retraining run.
accs = []

for i in range(n_runs):

    # Train a fresh model on the filtered training set.
    def_model = model_fn('utk', 'small')
    def_model.fit(def_trn_x, def_trn_y, epochs=n_epochs, batch_size=batch_size,
                  validation_data=(x_t, y_t))

    ampt = np.argmax(def_model.predict(x_t), axis=-1)
    amptp = np.argmax(def_model.predict(xt_p), axis=-1)

    print(classification_report(amyt, ampt, digits=5))
    print(classification_report(amytp, amptp, digits=5))

    # accuracy_score gives the same value as the report's 'accuracy' entry
    # without building a second full report.
    pacc = float(accuracy_score(amytp, amptp))
    accs.append(pacc)

    # Release the model and Keras state before the next run.
    del def_model, ampt, amptp
    tf.keras.backend.clear_session()
    gc.collect()

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.81916   0.87215   0.84482      3246
           1    0.84026   0.77742   0.80762      2808

    accuracy                        0.82821      6054
   macro avg    0.82971   0.82479   0.82622      6054
weighted avg    0.82894   0.82821   0.82757      6054

              precision    recall  f1-score   support

           0    0.14286   1.00000   0.25000         4
           1    1.00000   0.17241   0.29412        29

    accuracy                        0.27273        33
   macro avg    0.57143   0.58621   0.27206        33
weighted avg    0.89610   0.27273   0.28877        33

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.804

Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.81844   0.87215   0.84444      3246
           1    0.84008   0.77635   0.80696      2808

    accuracy                        0.82772      6054
   macro avg    0.82926   0.82425   0.82570      6054
weighted avg    0.82848   0.82772   0.82706      6054

              precision    recall  f1-score   support

           0    0.14286   1.00000   0.25000         4
           1    1.00000   0.17241   0.29412        29

    accuracy                        0.27273        33
   macro avg    0.57143   0.58621   0.27206        33
weighted avg    0.89610   0.27273   0.28877        33

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.80344   0.89279   0.84576      3246
           1    0.85779   0.74751   0.79886 

In [34]:
# Per-run accuracy on (xt_p, yt_p) after retraining on the filtered data.
accs

[0.2727272727272727,
 0.2727272727272727,
 0.30303030303030304,
 0.2727272727272727,
 0.2727272727272727]

In [35]:
# Mean of the per-run accuracies.
np.mean(accs)

0.27878787878787875