# Activation clustering defense

In this notebook we will evaluate the effect of filtering using activation clustering.

In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import gc
import sys
import random

In [3]:
# Move two directories up from the current working directory so that the
# project-local imports below (attack_nlp, subclass_avail) resolve --
# presumably this is the repository root; TODO confirm.
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves
# up another two levels.
os.chdir('../../')

In [4]:
import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.cluster import KMeans
from tensorflow.keras import backend as K
from sklearn.decomposition import FastICA
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score, classification_report

In [5]:
from attack_nlp import init_cluster_attack

from subclass_avail import common
from subclass_avail.target_nlp import bert_utils

In [6]:
# from transfer.top_target_training
def model_fn(dataset, size):
    """Build and compile a Keras image classifier for a dataset/size pair.

    Args:
        dataset: 'cifar' selects 32x32x3 inputs and 10 classes; any other
            value selects 100x100x3 inputs and 2 classes.
        size: 'small' selects the lightweight variant. With 'cifar' this is a
            small convolutional network built from scratch. Otherwise it is a
            VGG16 backbone whose layers are all frozen, so only the new
            softmax head is trainable (Adam, learning rate 1e-3). Any other
            value leaves the whole VGG16 trainable (Adam, learning rate 1e-4).

    Returns:
        A compiled tf.keras model with categorical cross-entropy loss and an
        accuracy metric.
    """
    tf.compat.v1.reset_default_graph()
    if dataset == 'cifar':
        shape = (32, 32, 3)
        n_classes = 10
        if size == 'small':
            model = tf.keras.models.Sequential()
            scales = 3
            # l2 with a coefficient of 0.0: the regulariser is attached but
            # contributes nothing to the loss.
            reg = tf.keras.regularizers.l2(l=0.00)
            model.add(tf.keras.layers.InputLayer(shape))
            model.add(tf.keras.layers.Conv2D(32, (3, 3), padding='same',
                kernel_regularizer=reg))
            model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
            # Each scale doubles the filter counts (32/64, 64/128, 128/256)
            # and halves the spatial resolution with average pooling.
            for scale in range(scales):
                model.add(tf.keras.layers.Conv2D(32 << scale, (3, 3), padding='same',
                    kernel_regularizer=reg))
                model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
                model.add(tf.keras.layers.Conv2D(64 << scale, (3, 3), padding='same',
                    kernel_regularizer=reg))
                model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
                model.add(tf.keras.layers.AveragePooling2D((2, 2)))
            model.add(tf.keras.layers.Conv2D(n_classes, (3, 3), padding='same',
                    kernel_regularizer=reg))
            # Classification head: flatten + dense softmax. (A disabled
            # alternative averaged the feature map over its spatial axes and
            # applied a Softmax layer instead.)
            model.add(tf.keras.layers.Flatten())
            model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

            # The learning rate is passed positionally, like the other Adam
            # optimizers in this function; the `lr=` keyword is a deprecated
            # alias of `learning_rate`.
            opt = tf.keras.optimizers.Adam(0.001)  # SGD(0.002, momentum=.5)
            model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

            return model
    else:
        shape = (100, 100, 3)
        n_classes = 2

    # Reached for every combination except ('cifar', 'small'): VGG16 without
    # its dense top, globally average-pooled to a flat feature vector.
    vgg = tf.keras.applications.VGG16(include_top=False, input_shape=shape, pooling='avg')
    if size == 'small':
        opt = tf.keras.optimizers.Adam(0.001)
        # Freeze the backbone so only the new head below is trained.
        for layer in vgg.layers:
            layer.trainable = False
    else:
        opt = tf.keras.optimizers.Adam(0.0001)  # SGD(0.01, momentum=.9)

    output = tf.keras.layers.Dense(n_classes, kernel_regularizer=tf.keras.regularizers.l2(l=0.01),
            activation='softmax')(vgg.output)
    model = tf.keras.models.Model(inputs=vgg.inputs[0], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

## Constants

In [7]:
# Root directory of the saved attack data and victim model; the per-attack
# sub-directory is joined onto it when the data is loaded.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
results_dir = '/net/data/malware-backdoor/subpop/victim_models/utk_small'

# NOTE(review): n_clus is not referenced by any other cell visible in this
# notebook -- presumably the number of subpopulation clusters used by the
# attack; TODO confirm.
n_clus = 100
# RNG seed (the global seeding calls in the next cell are commented out).
seed = 42

# NOTE(review): pois_rate and size are not read by the visible cells, which
# hard-code 'clind58_rate1' and 'small' instead.
pois_rate = 1
size = 'small'

In [8]:
# NOTE(review): global seeding is disabled, so draws from the Python/NumPy
# global RNGs (e.g. the np.random.choice shuffle further down) differ between
# runs. Uncomment to pin them:
# random.seed(seed)
# np.random.seed(seed)

## Attack data

In [9]:
# Index of the attacked subpopulation; also the suffix of every saved array.
victim_pop = 58
cl_ind = victim_pop

pth = os.path.join(results_dir, 'clind58_rate1')

# Load the eight saved arrays in a fixed order and unpack them by name.
# allow_pickle=True permits pickled object arrays, which is only safe for
# trusted files.
(pois_x, pois_y,
 trn_x, trn_y,
 x_t, y_t,
 xt_p, yt_p) = [
    np.load(os.path.join(pth, file_template.format(cl_ind)), allow_pickle=True)
    for file_template in (
        'pois_x_{}.npy', 'pois_y_{}.npy',
        'trn_x_{}.npy', 'trn_y_{}.npy',
        'x_t_{}.npy', 'y_t_{}.npy',
        'xt_p_{}.npy', 'yt_p_{}.npy',
    )
]

In [10]:
# Sanity check: the last len(pois_y) training labels equal the poison labels,
# consistent with the poison points having been appended at the end of the
# training set.
assert np.array_equal(trn_y[-pois_y.shape[0]:], pois_y)

In [11]:
# Integer class id per training sample: index of the largest entry along the
# last axis of the label rows (presumably one-hot).
trn_y_int = trn_y.argmax(axis=-1)

In [12]:
# 0/1 indicator over the training set, with the same shape and dtype as
# trn_y_int. The last len(pois_y) entries are flagged as poison, relying on
# the tail placement checked by the assert above.
poison_idx = np.zeros_like(trn_y_int)
poison_idx[-pois_y.shape[0]:] = 1

In [13]:
# Total number of training samples flagged as poison.
poison_idx.sum()

40

In [14]:
# Split the poison indicator by training label so it lines up with the
# per-class activation subsets used by the defense.
pois_idx0, pois_idx1 = (poison_idx[trn_y_int == label] for label in (0, 1))
# Number of poison points carrying label 0 and label 1, one per line.
print(sum(pois_idx0), sum(pois_idx1), sep='\n')

40
0


## Load the attacked model

We can now load the attacked model for the selected subpopulation.

In [15]:
print('Loading victim model for subpopulation {}'.format(victim_pop))

# The saved victim model lives next to the attack arrays, under the name
# 'victim_vgg_<subpopulation index>'.
victim_model_path = os.path.join(pth, 'victim_vgg_{}'.format(victim_pop))
victim_model = tf.keras.models.load_model(victim_model_path)

Loading victim model for subpopulation 58


In [16]:
# Victim model predictions on x_t -- presumably the general test set; TODO
# confirm.
pred = victim_model.predict(x_t)

In [17]:
# Per-class metrics of the victim model on x_t; argmax turns both the label
# rows and the prediction rows into integer class ids.
print(classification_report(np.argmax(y_t, axis=-1), np.argmax(pred, axis=-1), digits=5))

              precision    recall  f1-score   support

           0    0.81636   0.90388   0.85789      3246
           1    0.87317   0.76496   0.81549      2808

    accuracy                        0.83944      6054
   macro avg    0.84477   0.83442   0.83669      6054
weighted avg    0.84271   0.83944   0.83823      6054



In [18]:
# Same report on xt_p / yt_p -- presumably the test points belonging to the
# attacked subpopulation; TODO confirm.
print(classification_report(np.argmax(yt_p, axis=-1), np.argmax(victim_model.predict(xt_p), axis=-1), digits=5))

              precision    recall  f1-score   support

           0    0.14815   1.00000   0.25806         4
           1    1.00000   0.20690   0.34286        29

    accuracy                        0.30303        33
   macro avg    0.57407   0.60345   0.30046        33
weighted avg    0.89675   0.30303   0.33258        33



In [19]:
# Index of the second-to-last layer of the victim model; its output is used
# in the Defense section as the activation representation for clustering.
last_layer = len(victim_model.layers) - 2

In [20]:
# Display the selected layer index.
last_layer

19

## Defense

In [21]:
# Backend function mapping the input of the model's first layer to the output
# of layer `last_layer`.
layerout = K.function([victim_model.get_layer(index=0).input], victim_model.get_layer(index=last_layer).output)
# Activations for the whole training set (poison points included), computed
# in a single call.
repres_trn = layerout([trn_x])
print(repres_trn)

[[0.         0.         0.         ... 0.03899719 0.20827405 0.01483871]
 [0.26167235 0.         0.05851512 ... 0.09421486 0.2721647  0.        ]
 [0.02343429 0.         0.         ... 0.3397394  0.37460688 0.00310088]
 ...
 [0.555617   0.02917427 0.03418446 ... 0.30539954 0.5587245  0.        ]
 [0.         0.         0.         ... 0.13902754 0.19241796 0.        ]
 [0.         0.         0.         ... 0.6099808  0.2267428  0.00134307]]


In [22]:
# The victim model is not used again once the activations are extracted:
# drop the reference, reset the Keras session state and force a garbage
# collection. The value shown is gc.collect()'s return value.
del victim_model
tf.keras.backend.clear_session()
gc.collect()

2922

In [23]:
# Shape of the extracted activations: one row per training sample.
repres_trn.shape

(7040, 512)

In [24]:
# Class ids whose activations are clustered separately.
classes = [0, 1]
# Number of ICA components the activations are projected onto before k-means.
nb_dims = 15

In [25]:
# Activation clustering, run independently for each class: project the class
# activations with ICA, split them into two clusters with k-means and, when
# the split is pronounced enough, mark the smaller cluster for removal.
# remove_lists holds one boolean mask per entry of `classes`, aligned with
# the samples selected by `trn_y_int == cls`.
remove_lists = []

for cls in classes:
    print('CLASS', cls)

    repres = repres_trn[trn_y_int == cls]
    repres = repres.reshape(repres.shape[0], -1)

    # random_state pins the ICA initialisation and the k-means seeding so the
    # removal masks are reproducible from run to run.
    proj = FastICA(n_components=nb_dims, max_iter=1000, tol=0.005, random_state=seed)
    repres_proj = proj.fit_transform(repres)

    kmeans = KMeans(n_clusters=2, random_state=seed)
    kmeans.fit(repres_proj)

    print('Clustering for class:', cls)
    labels = kmeans.labels_

    clus_0 = labels == 0
    clus_1 = labels == 1
    # Count each cluster once and reuse the sizes below.
    size_0, size_1 = int(clus_0.sum()), int(clus_1.sum())
    print('Sizes of clusters: {} - {}'.format(size_0, size_1))
    silh = silhouette_score(repres_proj, labels, metric='euclidean')
    print('Silhouette score', silh)

    # Boolean mask of samples to remove; it stays all-False when the
    # clustering is not trusted, so the mask has one dtype in both cases.
    to_remove = np.zeros(repres.shape[0], dtype=bool)

    # Only act on the split when the silhouette score reaches 0.1.
    if silh >= 0.1:
        to_remove_idx = np.argmin([size_0, size_1])
        print('Removing cluster: ', to_remove_idx)
        to_remove = clus_0 if to_remove_idx == 0 else clus_1

    print(to_remove.shape)
    print(int(to_remove.sum()))
    remove_lists.append(to_remove)

    del kmeans, repres_proj, labels, silh
    

CLASS 0
Clustering for class: 0
Sizes of clusters: 680 - 3068
Silhouette score 0.1965973197805969
Removing cluster:  0
(3748,)
680
CLASS 1
Clustering for class: 1
Sizes of clusters: 1132 - 2160
Silhouette score 0.10899895913718298
Removing cluster:  0
(3292,)
1132


In [26]:
# Count the true poison points that fall inside the clusters marked for
# removal, pairing each class's removal mask with its poison indicator.
rl0 = remove_lists[0]
rl1 = remove_lists[1]

# Vectorised replacement for the per-element loops: a sample counts when it
# is both marked for removal and flagged as poison.
found = sum(
    int(np.count_nonzero((np.asarray(removed) == 1) & (np.asarray(is_poison) == 1)))
    for removed, is_poison in ((rl0, pois_idx0), (rl1, pois_idx1))
)

print(found)

40


## Evaluation

In [27]:
# model_fn('utk', 'small') takes the non-'cifar' branch: 100x100x3 inputs, a
# frozen VGG16 backbone and a 2-class softmax head.
# NOTE(review): this instance is never trained; the evaluation loop below
# builds a fresh model at the start of every run.
def_model = model_fn('utk', 'small')

In [28]:
# Per-class copies of the training inputs and labels, taken before filtering.
def_trn_x0, def_trn_y0 = (np.copy(arr[trn_y_int == 0]) for arr in (trn_x, trn_y))
def_trn_x1, def_trn_y1 = (np.copy(arr[trn_y_int == 1]) for arr in (trn_x, trn_y))
print(def_trn_x0.shape, def_trn_y0.shape, def_trn_x1.shape, def_trn_y1.shape)

(3748, 100, 100, 3) (3748, 2) (3292, 100, 100, 3) (3292, 2)


In [29]:
# Keep only the class-0 samples that were not marked for removal. The rows
# are selected from the full training arrays rather than by overwriting
# def_trn_x0 with a filtered view of itself, so re-running this cell gives
# the same result instead of failing on a mask/array length mismatch.
kept_idx0 = np.flatnonzero(trn_y_int == 0)[~remove_lists[0].astype(bool)]
def_trn_x0 = trn_x[kept_idx0]
def_trn_y0 = trn_y[kept_idx0]
print(def_trn_x0.shape, def_trn_y0.shape)

(3068, 100, 100, 3) (3068, 2)


In [30]:
# Keep only the class-1 samples that were not marked for removal. The rows
# are selected from the full training arrays rather than by overwriting
# def_trn_x1 with a filtered view of itself, so re-running this cell gives
# the same result instead of failing on a mask/array length mismatch.
kept_idx1 = np.flatnonzero(trn_y_int == 1)[~remove_lists[1].astype(bool)]
def_trn_x1 = trn_x[kept_idx1]
def_trn_y1 = trn_y[kept_idx1]
print(def_trn_x1.shape, def_trn_y1.shape)

(2160, 100, 100, 3) (2160, 2)


In [31]:
# Stack the filtered class-0 and class-1 subsets back into one training set
# along the sample axis.
def_trn_x = np.concatenate((def_trn_x0, def_trn_x1), axis=0)
def_trn_y = np.concatenate((def_trn_y0, def_trn_y1), axis=0)
print(def_trn_x.shape, def_trn_y.shape)

(5228, 100, 100, 3) (5228, 2)


In [32]:
# Random permutation of the row indices (sampling all rows without
# replacement), used to interleave the two classes after concatenation.
# NOTE(review): drawn from NumPy's global RNG, which this notebook does not
# seed, so the order differs between runs.
shuffle_idx = np.random.choice(def_trn_x.shape[0], def_trn_x.shape[0], replace=False)
print(shuffle_idx.shape)

(5228,)


In [33]:
# Apply the same permutation to inputs and labels so they stay aligned.
def_trn_x, def_trn_y = def_trn_x[shuffle_idx], def_trn_y[shuffle_idx]

In [35]:
# Integer ground-truth labels for the two evaluation sets.
amyt = np.argmax(y_t, axis=-1)
amytp = np.argmax(yt_p, axis=-1)

# Accuracy on xt_p for each retraining run.
accs = []

# Retrain a fresh model on the filtered training set five times.
for _ in range(5):

    def_model = model_fn('utk', 'small')
    # x_t / y_t are passed as validation data for per-epoch monitoring only.
    def_model.fit(def_trn_x, def_trn_y, epochs=12, batch_size=32, validation_data=(x_t, y_t))

    ampt = np.argmax(def_model.predict(x_t), axis=-1)
    amptp = np.argmax(def_model.predict(xt_p), axis=-1)

    print(classification_report(amyt, ampt, digits=5))
    print(classification_report(amytp, amptp, digits=5))

    # Read the accuracy directly rather than building a second full
    # classification report just to extract its 'accuracy' entry.
    accs.append(float(accuracy_score(amytp, amptp)))

    # Release the model and its Keras state before the next run.
    del def_model, ampt, amptp
    tf.keras.backend.clear_session()
    gc.collect()

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.76008   0.92329   0.83377      3246
           1    0.88205   0.66311   0.75706      2808

    accuracy                        0.80261      6054
   macro avg    0.82106   0.79320   0.79542      6054
weighted avg    0.81665   0.80261   0.79819      6054

              precision    recall  f1-score   support

           0    0.13043   0.75000   0.22222         4
           1    0.90000   0.31034   0.46154        29

    accuracy                        0.36364        33
   macro avg    0.51522   0.53017   0.34188        33
weighted avg    0.80672   0.36364   0.43253        33

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.758

Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.76987   0.90696   0.83281      3246
           1    0.86457   0.68661   0.76538      2808

    accuracy                        0.80476      6054
   macro avg    0.81722   0.79679   0.79910      6054
weighted avg    0.81380   0.80476   0.80154      6054

              precision    recall  f1-score   support

           0    0.13043   0.75000   0.22222         4
           1    0.90000   0.31034   0.46154        29

    accuracy                        0.36364        33
   macro avg    0.51522   0.53017   0.34188        33
weighted avg    0.80672   0.36364   0.43253        33

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

           0    0.75420   0.92730   0.83184      3246
           1    0.88560   0.65064   0.75015 

In [36]:
# Accuracy on xt_p / yt_p for each of the retraining runs.
accs

[0.36363636363636365,
 0.36363636363636365,
 0.45454545454545453,
 0.36363636363636365,
 0.36363636363636365]

In [37]:
# Mean accuracy on xt_p / yt_p across the retraining runs.
np.mean(accs)

0.38181818181818183