In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

In [3]:
os.chdir('../../')
os.environ['ML_DATA'] = ''

In [4]:
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import TensorDataset, DataLoader

from attack_nlp import init_cluster_attack
from subclass_avail.target_nlp import bert_utils

In [5]:
# Constants

seed = 42
batch = 4
lr = 1e-5
epochs = 4

# Run specific parameters
pois_rate = 2.0
cl_ind = 24
n_remove = 150
base_def = 0.988372

m_type = 'FT'
frozen = False

In [6]:
device = bert_utils.get_device()
bert_utils.set_seed(device, seed=seed)  # Seed all the PRNGs

Available device:  cuda


## Clustering

In [7]:
%%time
(x, y, ll, labels, preds), \
(x_ho, y_ho, ll_ho, labels_ho, preds_ho), \
(x_t, y_t, ll_t, labels_t) = init_cluster_attack(frozen=frozen, n_clusters=100, pca_dim=10)

Available device:  cuda
Loading model: imdb_bert_FT_ADV.ckpt
Splitting data sets for training.
Data shapes:
ids_train: 12500
att_train: 12500
y_train: 12500
ids_test: 25000
att_test: 25000
y_test: 25000


  0%|          | 0/3125 [00:00<?, ?it/s]

Tensors shapes:
ids_train: torch.Size([12500, 256])
att_train: torch.Size([12500, 256])
y_train: torch.Size([12500])
ids_test: torch.Size([25000, 256])
att_test: torch.Size([25000, 256])
y_test: torch.Size([25000])
Data shapes:
ids_train: 12500
att_train: 12500
y_train: 12500
ids_test: 25000
att_test: 25000
y_test: 25000
Tensors shapes:
ids_train: torch.Size([12500, 256])
att_train: torch.Size([12500, 256])
y_train: torch.Size([12500])
ids_test: torch.Size([25000, 256])
att_test: torch.Size([25000, 256])
y_test: torch.Size([25000])

Getting def train representations
Available device:  cuda
Representation size:(12500, 256, 768)

Getting adv train representations
Available device:  cuda
Representation size:(12500, 256, 768)

Getting test representations
Available device:  cuda
Representation size:(25000, 256, 768)

Computing predictions on the training sets


100%|██████████| 3125/3125 [02:05<00:00, 24.95it/s]
100%|██████████| 3125/3125 [02:07<00:00, 24.55it/s]



Shapes
	ll: (12500, 196608)
	ll_ho: (12500, 196608)
	ll_t: (25000, 196608)

Clustering ll_ho

Clustering ll_t

Clustering ll
CPU times: user 10min 10s, sys: 1min 21s, total: 11min 32s
Wall time: 9min 46s


In [8]:
x, x_att = x
x_ho, x_ho_att = x_ho
x_t, x_t_att = x_t

l_d = np.unique(labels, return_counts=True)
lt_d = np.unique(labels_t, return_counts=True)
lho_d = np.unique(labels_ho, return_counts=True)

print("labels distr", l_d)
print("ho labels distr", lho_d)
print("test distr", lt_d)
print('\nx shape: {}\nx_ho shape:{}\nx_t shape: {}'.format(x.shape, x_ho.shape, x_t.shape))

labels distr (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int32), array([ 84, 295, 127, 184,  66, 118,  77,  73,  96,  81,  87, 141, 119,
       259, 255, 116, 112,  90,  96, 133, 114, 122, 250, 111,  91,  84,
       259, 100,  71, 140,  95, 140,  61, 249, 100, 105, 135, 187,  77,
       203,  99,  85, 103, 105,  52, 117,  94, 184,  99,  99, 140,  79,
        76,  94,  76, 214,  98, 113,  64,  89,  97,  80, 180,  65, 108,
        81,  82, 116,  76, 110, 202, 205, 219,  97,  85, 112,  97, 146,
       100, 149, 103,  82, 102, 220,  54,  72, 101,  75, 106, 289,  64,
       197, 269, 17

In [9]:
trn_inds = np.where(labels == cl_ind)[0]
tst_inds = np.where(labels_t == cl_ind)[0]
ho_inds = np.where(labels_ho == cl_ind)[0]
pois_inds = np.random.choice(
    ho_inds,
    int(ho_inds.shape[0] * pois_rate),
    replace=True
)
print("cluster ind:", cl_ind)
print("train cluster size:", trn_inds.shape[0])
print("test cluster size:", tst_inds.shape[0])
print("pois cluster size", pois_inds.shape[0])
trn_x = x
trn_y = y
trn_x_att = x_att

preds_cl = preds_ho[ho_inds].sum(axis=0)
assert preds_cl.size == 2

worst_class = np.argmin(preds_cl)
print(worst_class, preds_cl)

pois_x = np.take(x_ho, pois_inds, axis=0)
pois_y = np.take(y_ho, pois_inds, axis=0)
pois_x_att = np.take(x_ho_att, pois_inds, axis=0)

pois_y[:] = worst_class  # Assigns the worst class label to every poison point
trn_x = np.concatenate((trn_x, pois_x))
trn_y = np.concatenate((trn_y, pois_y))
trn_x_att = np.concatenate((trn_x_att, pois_x_att))
rand_inds = np.random.choice(trn_x.shape[0], trn_x.shape[0], replace=False)
xt_p, xt_p_att, yt_p = x_t[tst_inds], x_t_att[tst_inds], y_t[tst_inds]

# Create the subset of the test set not containing the targeted
# sub population to compute the collateral damage
x_coll = x_t[[i for i in range(x_t.shape[0]) if i not in tst_inds]]
x_coll_att = x_t_att[[i for i in range(x_t_att.shape[0]) if i not in tst_inds]]
y_coll = y_t[[i for i in range(y_t.shape[0]) if i not in tst_inds]]
print('\nx coll shape: {}\nx_att coll shape:{}\ny coll shape: {}'.format(
    x_coll.shape, x_coll_att.shape, y_coll.shape))


cluster ind: 24
train cluster size: 91
test cluster size: 172
pois cluster size 150
0 [-217.90741968  201.47937083]

x coll shape: (24828, 256)
x_att coll shape:(24828, 256)
y coll shape: (24828,)


## Trim

In [10]:
def trim(dataset, size, x, x_att, y, num_remove):
    inds = []
    new_inds = list(range(x.shape[0]))
    it = 0
    while sorted(new_inds) != sorted(inds) and it < 5:
        print('Trim iteration: {}\n'.format(it))
        it += 1
        inds = new_inds[:]
        
        # model = train_model(dataset, model_type, trn_x, trn_y, tst_x, tst_y)  
        model = bert_utils.wrap_train(
            x,
            y,
            x_att,
            b_size=batch,
            lr=lr,
            epochs=epochs,
            frozen=frozen
        ) 
        
        # Prepare torch loader
        device = bert_utils.get_device()

        test_ds = TensorDataset(
            torch.from_numpy(x),
            torch.from_numpy(x_att),
            torch.from_numpy(y)
        )
        test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch)
         
        # preds = model.predict(x)
        _, preds_raw = bert_utils.predict_bert(model, device, test_dl, raw=True)
        preds = [np.array(p) for p in preds_raw]
        preds = np.concatenate(preds)
        
        # One hot encode labels
        onehoty = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
        
        probs = np.multiply(preds, onehoty).sum(axis=1)
        new_inds = np.argpartition(probs, num_remove)[num_remove:]
    return model, new_inds

In [11]:
%%time
trim_model, trim_inds = trim(
    dataset='imdb',
    size=256,
    x=trn_x[rand_inds],
    x_att=trn_x_att[rand_inds],
    y=trn_y[rand_inds],
    num_remove=n_remove
)

Trim iteration: 0

Available device:  cuda


  0%|          | 1/3163 [00:00<08:56,  5.89it/s]

Epoch 0 of 4


100%|██████████| 3163/3163 [07:16<00:00,  7.25it/s]


Train loss at epoch 0: 0.3250975497314307


  0%|          | 1/3163 [00:00<07:23,  7.13it/s]

Training accuracy - epoch 0: 0.9419854568447676
Epoch 1 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.19it/s]


Train loss at epoch 1: 0.171051897117977


  0%|          | 1/3163 [00:00<07:24,  7.11it/s]

Training accuracy - epoch 1: 0.9840341447992412
Epoch 2 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.19it/s]


Train loss at epoch 2: 0.07419686062527049


  0%|          | 1/3163 [00:00<07:23,  7.13it/s]

Training accuracy - epoch 2: 0.9923332279481505
Epoch 3 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.19it/s]


Train loss at epoch 3: 0.03462519426719697
Training accuracy - epoch 3: 0.996127094530509


  0%|          | 3/3163 [00:00<02:10, 24.22it/s]

Available device:  cuda


100%|██████████| 3163/3163 [02:09<00:00, 24.37it/s]


Trim iteration: 1

Available device:  cuda


  0%|          | 1/3163 [00:00<08:17,  6.36it/s]

Epoch 0 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 0: 0.3246959502320858


  0%|          | 1/3163 [00:00<07:22,  7.14it/s]

Training accuracy - epoch 0: 0.9358994625355676
Epoch 1 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 1: 0.16101738535883003


  0%|          | 1/3163 [00:00<07:21,  7.15it/s]

Training accuracy - epoch 1: 0.9838760670249763
Epoch 2 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 2: 0.0689005431399065


  0%|          | 1/3163 [00:00<07:23,  7.14it/s]

Training accuracy - epoch 2: 0.9952576667720519
Epoch 3 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 3: 0.029620365947814746
Training accuracy - epoch 3: 0.997312677837496


  0%|          | 3/3163 [00:00<02:15, 23.29it/s]

Available device:  cuda


100%|██████████| 3163/3163 [02:09<00:00, 24.37it/s]


Trim iteration: 2

Available device:  cuda


  0%|          | 1/3163 [00:00<07:43,  6.83it/s]

Epoch 0 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 0: 0.32547516889759454


  0%|          | 1/3163 [00:00<07:21,  7.16it/s]

Training accuracy - epoch 0: 0.9490199177995574
Epoch 1 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 1: 0.15859150058326937


  0%|          | 1/3163 [00:00<07:21,  7.16it/s]

Training accuracy - epoch 1: 0.9812677837496048
Epoch 2 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 2: 0.07143887222859063


  0%|          | 1/3163 [00:00<07:24,  7.12it/s]

Training accuracy - epoch 2: 0.9943092001264622
Epoch 3 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.18it/s]


Train loss at epoch 3: 0.03253838091080527
Training accuracy - epoch 3: 0.9969965222889662


  0%|          | 3/3163 [00:00<02:12, 23.89it/s]

Available device:  cuda


100%|██████████| 3163/3163 [02:09<00:00, 24.37it/s]


Trim iteration: 3

Available device:  cuda


  0%|          | 1/3163 [00:00<08:12,  6.42it/s]

Epoch 0 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 0: 0.31342821884364447


  0%|          | 1/3163 [00:00<07:22,  7.15it/s]

Training accuracy - epoch 0: 0.9485456844767626
Epoch 1 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 1: 0.15425233640909722


  0%|          | 1/3163 [00:00<07:22,  7.14it/s]

Training accuracy - epoch 1: 0.987037622510275
Epoch 2 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 2: 0.06681471246633582


  0%|          | 1/3163 [00:00<07:22,  7.14it/s]

Training accuracy - epoch 2: 0.995969016756244
Epoch 3 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 3: 0.029407813678690803
Training accuracy - epoch 3: 0.9973917167246286


  0%|          | 3/3163 [00:00<02:10, 24.25it/s]

Available device:  cuda


100%|██████████| 3163/3163 [02:09<00:00, 24.49it/s]


Trim iteration: 4

Available device:  cuda


  0%|          | 1/3163 [00:00<08:07,  6.49it/s]

Epoch 0 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.20it/s]


Train loss at epoch 0: 0.3295134382815328


  0%|          | 1/3163 [00:00<07:21,  7.16it/s]

Training accuracy - epoch 0: 0.9494941511223522
Epoch 1 of 4


100%|██████████| 3163/3163 [07:20<00:00,  7.19it/s]


Train loss at epoch 1: 0.16492453182644382


  0%|          | 1/3163 [00:00<07:21,  7.16it/s]

Training accuracy - epoch 1: 0.9793708504584255
Epoch 2 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.19it/s]


Train loss at epoch 2: 0.07636288753932992


  0%|          | 1/3163 [00:00<07:22,  7.14it/s]

Training accuracy - epoch 2: 0.9930445779323427
Epoch 3 of 4


100%|██████████| 3163/3163 [07:19<00:00,  7.19it/s]


Train loss at epoch 3: 0.03446474073762725
Training accuracy - epoch 3: 0.9960480556433765


  0%|          | 3/3163 [00:00<02:08, 24.61it/s]

Available device:  cuda


100%|██████████| 3163/3163 [02:09<00:00, 24.37it/s]

CPU times: user 3h 20min 53s, sys: 13 s, total: 3h 21min 6s
Wall time: 3h 20min 50s





In [12]:
# Save trim model
save_pth = 'imdb_bert_{}_TRM_pois{}_ind{}'.format(m_type, pois_rate, cl_ind)
print(save_pth)

torch.save(trim_model.state_dict(), save_pth + '.ckpt')

imdb_bert_FT_TRM_pois2.0_ind24


In [13]:
# Evaluation

pois_ds = TensorDataset(
    torch.from_numpy(xt_p),
    torch.from_numpy(xt_p_att),
    torch.from_numpy(yt_p)
)

pois_dl = DataLoader(pois_ds, shuffle=False, batch_size=batch)

In [14]:
trim_acc = bert_utils.predict_bert(trim_model, device, pois_dl, acc=True)

100%|██████████| 43/43 [00:01<00:00, 24.46it/s]


In [15]:
print('Accuracy of Trim model on the poisoned data: {}'.format(trim_acc))

Accuracy of Trim model on the poisoned data: 0.5697674418604651


In [16]:
print('New targeted damage: {}'.format(base_def - trim_acc))

New targeted damage: 0.4186045581395349
