# Spectral signatures defense

In this notebook we will evaluate the effect of filtering using spectral signatures https://papers.nips.cc/paper/2018/file/280cf18baf4311c92aa5a042336587d3-Paper.pdf

Some code adapted from https://github.com/MadryLab/backdoor_data_poisoning/blob/master/compute_corr.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import gc
import sys

In [3]:
os.chdir('../../')

In [4]:
import tqdm
import torch
import numpy as np
import pandas as pd

from transformers import AdamW
from sklearn.cluster import KMeans
from sklearn.decomposition import FastICA
from sklearn.metrics import silhouette_score
from torch.utils.data import TensorDataset, DataLoader
from art.defences.detector.poison import ActivationDefence
from art.estimators.classification import PyTorchClassifier



In [5]:
from attack_nlp import init_cluster_attack

from subclass_avail import common
from subclass_avail.target_nlp import bert_utils

## Constants

In [6]:
results_dir = 'results/bert'
fname = 'eval-stats_clus{}_pois{}_{}.npy'

n_clus = 100
seed = 42

pois_rate = '2.0'
m_type = 'FT'
frozen = False

In [7]:
# Set the random seed to the same used during the attack
device = bert_utils.get_device()
bert_utils.set_seed(device=device, seed=seed)

Available device:  cuda


## Attack results


Let's first look at the subpopulation with highest target damage.
We will use that subpopulation as target for our defense.

In [9]:
# Accumulate all results in a single DataFrame
res_df = pd.DataFrame(columns=['type', 'p_rate', 'index', 't_dmg', 'p_acc', 'base_def', 'coll_dmg', 'csize', 'exp'])


exp_name = fname.format(n_clus, pois_rate, m_type)
res_arr = np.load(os.path.join(common.results_dir_bert, exp_name)).item()

for clus_id, results in res_arr.items():
    if len(results['train_clus_size']) > 1:
        train_clus_size = len(results['train_clus_size'])
    else:
        train_clus_size = results['train_clus_size'][0]

    to_add = {
        'type': m_type,
        'p_rate': pois_rate,
        'index': clus_id,
        't_dmg': results['base_def'] - results['pois'],
        'p_acc': results['pois'],
        'base_def': results['base_def'],
        'coll_dmg': results['collateral_dmg'],
        'csize': train_clus_size,
        'exp': exp_name
    }

    res_df = res_df.append(to_add, ignore_index=True)
    
# Sorting by target damage

exp_name = fname.format(n_clus, pois_rate, m_type)

sub_df = res_df[res_df['exp'] == exp_name]
sub_df = sub_df.sort_values(by='t_dmg')

top5_df = sub_df.tail(5)
top10_df = sub_df.tail(10)

print('Best target damage:')
print(sub_df[-1:])
victim_pop = sub_df[-1:]['index'].item()
print()

Best target damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
14   FT    2.0    24  0.505814  0.482558  0.988372  0.003746    91   

                                  exp  
14  eval-stats_clus100_pois2.0_FT.npy  



In [10]:
print(victim_pop)

24


## Attack data

In [11]:
# victim_pop = 24
cl_ind = victim_pop

pth = os.path.join(common.storage_dir, 'imdb_bert_{}_pop_{}'.format('LL' if frozen else 'FT', cl_ind))

pois_x = np.load(os.path.join(pth, 'pois_x_{}.npy'.format(cl_ind)), allow_pickle=True)
pois_x_att = np.load(os.path.join(pth, 'pois_x_att_{}.npy'.format(cl_ind)), allow_pickle=True)
pois_y = np.load(os.path.join(pth, 'pois_y_{}.npy'.format(cl_ind)), allow_pickle=True)

trn_x = np.load(os.path.join(pth, 'trn_x_{}.npy'.format(cl_ind)), allow_pickle=True)
trn_x_att = np.load(os.path.join(pth, 'trn_x_att_{}.npy'.format(cl_ind)), allow_pickle=True)
trn_y = np.load(os.path.join(pth, 'trn_y_{}.npy'.format(cl_ind)), allow_pickle=True)

x_t = np.load(os.path.join(pth, 'x_t_{}.npy'.format(cl_ind)), allow_pickle=True)
x_t_att = np.load(os.path.join(pth, 'x_t_att_{}.npy'.format(cl_ind)), allow_pickle=True)
y_t = np.load(os.path.join(pth, 'y_t_{}.npy'.format(cl_ind)), allow_pickle=True)

xt_p = np.load(os.path.join(pth, 'xt_p_{}.npy'.format(cl_ind)), allow_pickle=True)
xt_p_att = np.load(os.path.join(pth, 'xt_p_att_{}.npy'.format(cl_ind)), allow_pickle=True)
yt_p = np.load(os.path.join(pth, 'yt_p_{}.npy'.format(cl_ind)), allow_pickle=True)

# x_coll = np.load(os.path.join(pth, 'x_coll_{}.npy'.format(cl_ind)), allow_pickle=True)
# x_coll_att = np.load(os.path.join(pth, 'x_coll_att_{}.npy'.format(cl_ind)), allow_pickle=True)
# y_coll = np.load(os.path.join(pth, 'y_coll_{}.npy'.format(cl_ind)), allow_pickle=True)


In [12]:
poison_idx = np.zeros_like(trn_y)
poison_idx[-pois_y.shape[0]:] = 1

In [13]:
pois_idx0 = poison_idx[trn_y == 0]
pois_idx1 = poison_idx[trn_y == 1]

In [14]:
sum(pois_idx0)

150

## Load the attacked model

We can now load the attacked model for the selected subpopulation 

In [15]:
print('Loading victim model for subpopulation {}'.format(victim_pop))

victim_model_path = os.path.join(
    common.saved_models_dir,
    'victim_bert_{}'.format(victim_pop)
) + '.ckpt'
victim_model = bert_utils.load_bert(model_file=victim_model_path)

Loading victim model for subpopulation 24
Loading model: /media/storage/projects/research/advml/subclass/saved_models/victim_bert_24.ckpt


In [16]:
tst_ds = TensorDataset(torch.from_numpy(x_t), torch.from_numpy(x_t_att), torch.from_numpy(x_t))
tst_dl = DataLoader(tst_ds, shuffle=False, batch_size=8)

tst_p_ds = TensorDataset(torch.from_numpy(xt_p), torch.from_numpy(xt_p_att), torch.from_numpy(yt_p))
tst_p_dl = DataLoader(tst_p_ds, shuffle=False, batch_size=8)

In [17]:
tst_pred = bert_utils.predict_bert(victim_model, device, tst_dl)
_ = bert_utils.eval_classification(tst_pred, y_t)

tst_p_pred = bert_utils.predict_bert(victim_model, device, tst_p_dl)
_ = bert_utils.eval_classification(tst_p_pred, yt_p)

100%|██████████| 3125/3125 [03:57<00:00, 13.16it/s]
  9%|▉         | 2/22 [00:00<00:01, 13.14it/s]

              precision    recall  f1-score   support

           0   0.905357  0.904560  0.904958     12500
           1   0.904644  0.905440  0.905042     12500

    accuracy                       0.905000     25000
   macro avg   0.905000  0.905000  0.905000     25000
weighted avg   0.905000  0.905000  0.905000     25000

[[11307  1193]
 [ 1182 11318]]


100%|██████████| 22/22 [00:01<00:00, 13.32it/s]

              precision    recall  f1-score   support

           0   0.011236  0.500000  0.021978         2
           1   0.987952  0.482353  0.648221       170

    accuracy                       0.482558       172
   macro avg   0.499594  0.491176  0.335100       172
weighted avg   0.976595  0.482558  0.640939       172

[[ 1  1]
 [88 82]]





In [18]:
del tst_pred, tst_p_pred

## Defense

In [19]:
t_x = torch.from_numpy(trn_x)
t_a = torch.from_numpy(trn_x_att)
t_y = torch.from_numpy(trn_y)
train_ds = TensorDataset(t_x, t_a, t_y)
train_dl = DataLoader(train_ds, shuffle=True, batch_size=2)

In [20]:
repres_trn = bert_utils.get_representations(
    model_name='victim_bert_{}_pois{}_pop{}'.format('LL' if frozen else 'FT', pois_rate, cl_ind),
    model=victim_model,
    data_loader=train_dl,
    f_name='trn_x',
    b_size=2
)

Available device:  cuda
Representation size:(12650, 256, 768)


In [21]:
del victim_model, train_dl, train_ds, t_y, t_a, t_x
gc.collect()

0

In [22]:
classes = [0, 1]
nb_dims = 15

In [23]:
remove_lists = []

for cls in classes:
    print('CLASS', cls)
    
    repres = repres_trn[trn_y == cls]
    repres = repres.reshape(repres.shape[0], -1)
    r_hat = np.mean(repres, axis=0)
    m_centered = repres - r_hat
    
    u, s, v = np.linalg.svd(m_centered, full_matrices=False)
    
    eigs = v[0:1]
    corrs = np.matmul(eigs, np.transpose(m_centered))  # shape num_top, num_active_indices

    print(corrs.shape)
    scores = np.linalg.norm(corrs, axis=0)  # shape num_active_indices
    print(scores.shape)

    score_percentile = np.percentile(scores, 85)  # Discard top 15%
    print(score_percentile.shape)
    print(score_percentile)

    top_scores = np.where(scores > score_percentile)[0]
    print(top_scores.shape)

    # make bitmap with samples to remove
    to_remove = np.zeros(shape=repres.shape[0])
    to_remove[top_scores] = 1
    print(to_remove.shape)
    print(sum(to_remove))
    remove_lists.append(to_remove)
    
    del r_hat, m_centered, u, s, v, corrs, scores, score_percentile, top_scores
    

CLASS 0
(1, 6400)
(6400,)
()
96.20043487548827
(960,)
(6400,)
960.0
CLASS 1
(1, 6250)
(6250,)
()
95.12037239074706
(938,)
(6250,)
938.0


In [24]:
found = 0
rl0 = remove_lists[0]
rl1 = remove_lists[1]

for i in range(len(rl0)):
    if rl0[i] == 1 and pois_idx0[i] == 1:
        found +=1

for i in range(len(rl1)):
    if rl1[i] == 1 and pois_idx1[i] == 1:
        found +=1

print(found)

20


In [25]:
def_trn_x0 = np.copy(trn_x[trn_y == 0])
def_trn_a0 = np.copy(trn_x_att[trn_y == 0])
def_trn_y0 = np.copy(trn_y[trn_y == 0])
def_trn_x1 = np.copy(trn_x[trn_y == 1])
def_trn_a1 = np.copy(trn_x_att[trn_y == 1])
def_trn_y1 = np.copy(trn_y[trn_y == 1])
print(def_trn_x0.shape, def_trn_y0.shape, def_trn_a0, def_trn_x1.shape, def_trn_a1, def_trn_y1.shape)

(6400, 256) (6400,) [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]] (6250, 256) [[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]] (6250,)


In [26]:
def_trn_x0 = def_trn_x0[~remove_lists[0].astype(bool)]
def_trn_a0 = def_trn_a0[~remove_lists[0].astype(bool)]
def_trn_y0 = def_trn_y0[~remove_lists[0].astype(bool)]
print(def_trn_x0.shape, def_trn_a0.shape, def_trn_y0.shape)

def_trn_x1 = def_trn_x1[~remove_lists[1].astype(bool)]
def_trn_a1 = def_trn_a1[~remove_lists[1].astype(bool)]
def_trn_y1 = def_trn_y1[~remove_lists[1].astype(bool)]
print(def_trn_x1.shape, def_trn_a1.shape, def_trn_y1.shape)

def_trn_x = np.concatenate([def_trn_x0, def_trn_x1])
def_trn_a = np.concatenate([def_trn_a0, def_trn_a1])
def_trn_y = np.concatenate([def_trn_y0, def_trn_y1])
print(def_trn_x.shape, def_trn_a.shape, def_trn_y.shape)

shuffle_idx = np.random.choice(def_trn_x.shape[0], def_trn_x.shape[0], replace=False)
print(shuffle_idx.shape)

def_trn_x = def_trn_x[shuffle_idx]
def_trn_a = def_trn_a[shuffle_idx]
def_trn_y = def_trn_y[shuffle_idx]
print(def_trn_x.shape, def_trn_a.shape, def_trn_y.shape)

(5440, 256) (5440, 256) (5440,)
(5312, 256) (5312, 256) (5312,)
(10752, 256) (10752, 256) (10752,)
(10752,)
(10752, 256) (10752, 256) (10752,)


In [28]:
def_model = bert_utils.wrap_train(
    trn_x=def_trn_x,
    trn_y=def_trn_y,
    trn_x_att=def_trn_a,
    frozen=frozen
)

Available device:  cuda


  0%|          | 0/1344 [00:00<?, ?it/s]

Epoch 0 of 4


100%|██████████| 1344/1344 [05:21<00:00,  4.18it/s]


Train loss at epoch 0: 0.33934253230801825


  0%|          | 0/1344 [00:00<?, ?it/s]

Training accuracy - epoch 0: 0.9252232142857143
Epoch 1 of 4


100%|██████████| 1344/1344 [05:24<00:00,  4.14it/s]


Train loss at epoch 1: 0.18283501940701777


  0%|          | 0/1344 [00:00<?, ?it/s]

Training accuracy - epoch 1: 0.9775855654761905
Epoch 2 of 4


100%|██████████| 1344/1344 [05:24<00:00,  4.14it/s]


Train loss at epoch 2: 0.09895901760596428


  0%|          | 0/1344 [00:00<?, ?it/s]

Training accuracy - epoch 2: 0.9845610119047619
Epoch 3 of 4


100%|██████████| 1344/1344 [05:24<00:00,  4.14it/s]


Train loss at epoch 3: 0.056123121710871124
Training accuracy - epoch 3: 0.9921875


In [29]:
tst_pred = bert_utils.predict_bert(def_model, device, tst_dl)
_ = bert_utils.eval_classification(tst_pred, y_t)

tst_p_pred = bert_utils.predict_bert(def_model, device, tst_p_dl)
_ = bert_utils.eval_classification(tst_p_pred, yt_p)

100%|██████████| 3125/3125 [04:02<00:00, 12.88it/s]
  9%|▉         | 2/22 [00:00<00:01, 12.94it/s]

              precision    recall  f1-score   support

           0   0.909274  0.902800  0.906025     12500
           1   0.903487  0.909920  0.906692     12500

    accuracy                       0.906360     25000
   macro avg   0.906381  0.906360  0.906359     25000
weighted avg   0.906381  0.906360  0.906359     25000

[[11285  1215]
 [ 1126 11374]]


100%|██████████| 22/22 [00:01<00:00, 13.19it/s]

              precision    recall  f1-score   support

           0   0.000000  0.000000  0.000000         2
           1   0.975904  0.476471  0.640316       170

    accuracy                       0.470930       172
   macro avg   0.487952  0.238235  0.320158       172
weighted avg   0.964556  0.470930  0.632871       172

[[ 0  2]
 [89 81]]





## Base

In [30]:
del def_model
gc.collect()
torch.cuda.empty_cache()

In [31]:
model_name_def = 'imdb_bert_{}_DEF'.format('LL' if frozen else 'FT')
model_def = bert_utils.load_bert(model_file=model_name_def + '.ckpt')

Loading model: imdb_bert_FT_DEF.ckpt


In [32]:
tst_pred = bert_utils.predict_bert(model_def, device, tst_dl)
_ = bert_utils.eval_classification(tst_pred, y_t)

tst_p_pred = bert_utils.predict_bert(model_def, device, tst_p_dl)
_ = bert_utils.eval_classification(tst_p_pred, yt_p)

100%|██████████| 3125/3125 [04:01<00:00, 12.95it/s]
  9%|▉         | 2/22 [00:00<00:01, 13.09it/s]

              precision    recall  f1-score   support

           0   0.922579  0.899920  0.911108     12500
           1   0.902319  0.924480  0.913265     12500

    accuracy                       0.912200     25000
   macro avg   0.912449  0.912200  0.912187     25000
weighted avg   0.912449  0.912200  0.912187     25000

[[11249  1251]
 [  944 11556]]


100%|██████████| 22/22 [00:01<00:00, 13.27it/s]

              precision    recall  f1-score   support

           0   0.000000  0.000000  0.000000         2
           1   0.988372  1.000000  0.994152       170

    accuracy                       0.988372       172
   macro avg   0.494186  0.500000  0.497076       172
weighted avg   0.976879  0.988372  0.982592       172

[[  0   2]
 [  0 170]]



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
0.912200 - 0.906360

0.005839999999999956

In [34]:
6400 + 6250

12650

In [35]:
10752 * 100 / 12650

84.99604743083005

In [36]:
0.988372 - 0.482558

0.505814

In [37]:
0.988372 - 0.470930

0.517442

In [38]:
20 * 100 /150

13.333333333333334