In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

In [3]:
os.chdir('../../')
os.environ['ML_DATA'] = ''

In [4]:
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import TensorDataset, DataLoader

from attack_nlp import init_cluster_attack
from subclass_avail.target_nlp import bert_utils

In [21]:
# Constants

seed = 42
batch = 4
lr = 1e-5
epochs = 4

# Run specific parameters
pois_rate = 2.0
cl_ind = 83
n_remove = 570
base_def = 0.798198

m_type = 'LL'
frozen = True

In [6]:
device = bert_utils.get_device()
bert_utils.set_seed(device, seed=seed)  # Seed all the PRNGs

Available device:  cuda


## Clustering

In [7]:
%%time
(x, y, ll, labels, preds), \
(x_ho, y_ho, ll_ho, labels_ho, preds_ho), \
(x_t, y_t, ll_t, labels_t) = init_cluster_attack(frozen=frozen, n_clusters=100, pca_dim=10)

Available device:  cuda
Loading model: imdb_bert_LL_ADV.ckpt
Splitting data sets for training.
Data shapes:
ids_train: 12500
att_train: 12500
y_train: 12500
ids_test: 25000
att_test: 25000
y_test: 25000


  0%|          | 0/3125 [00:00<?, ?it/s]

Tensors shapes:
ids_train: torch.Size([12500, 256])
att_train: torch.Size([12500, 256])
y_train: torch.Size([12500])
ids_test: torch.Size([25000, 256])
att_test: torch.Size([25000, 256])
y_test: torch.Size([25000])
Data shapes:
ids_train: 12500
att_train: 12500
y_train: 12500
ids_test: 25000
att_test: 25000
y_test: 25000
Tensors shapes:
ids_train: torch.Size([12500, 256])
att_train: torch.Size([12500, 256])
y_train: torch.Size([12500])
ids_test: torch.Size([25000, 256])
att_test: torch.Size([25000, 256])
y_test: torch.Size([25000])

Getting def train representations
Available device:  cuda
Representation size:(12500, 256, 768)

Getting adv train representations
Available device:  cuda
Representation size:(12500, 256, 768)

Getting test representations
Available device:  cuda
Representation size:(25000, 256, 768)

Computing predictions on the training sets


100%|██████████| 3125/3125 [02:06<00:00, 24.76it/s]
100%|██████████| 3125/3125 [02:07<00:00, 24.49it/s]



Shapes
	ll: (12500, 196608)
	ll_ho: (12500, 196608)
	ll_t: (25000, 196608)

Clustering ll_ho

Clustering ll_t

Clustering ll
CPU times: user 9min 59s, sys: 58 s, total: 10min 57s
Wall time: 8min 35s


In [8]:
x, x_att = x
x_ho, x_ho_att = x_ho
x_t, x_t_att = x_t

l_d = np.unique(labels, return_counts=True)
lt_d = np.unique(labels_t, return_counts=True)
lho_d = np.unique(labels_ho, return_counts=True)

print("labels distr", l_d)
print("ho labels distr", lho_d)
print("test distr", lt_d)
print('\nx shape: {}\nx_ho shape:{}\nx_t shape: {}'.format(x.shape, x_ho.shape, x_t.shape))

labels distr (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int32), array([ 76, 326, 111,  83, 122, 115, 121,  82, 123,  91, 122,  96,  88,
       239, 123, 307, 214,  70, 103,  50,  70,  96,  89,  38, 250,  94,
        74,  50, 305, 123,  91, 191,  88,  47, 168, 100, 136, 110, 108,
       391, 238,  84,  91,  31,  33,  43, 154, 291,  65,  47, 105,  83,
        89,  62, 152,  56,  81,  98,  93, 199,  75, 132, 124, 189,  81,
        75, 119, 244, 100,  96, 184,  94,  53,  85,  73, 365,  73,  57,
        62, 102,  91,  96,  35, 292, 203, 197,  88, 121, 220,  43,  88,
        66, 103,  9

In [9]:
trn_inds = np.where(labels == cl_ind)[0]
tst_inds = np.where(labels_t == cl_ind)[0]
ho_inds = np.where(labels_ho == cl_ind)[0]
pois_inds = np.random.choice(
    ho_inds,
    int(ho_inds.shape[0] * pois_rate),
    replace=True
)
print("cluster ind:", cl_ind)
print("train cluster size:", trn_inds.shape[0])
print("test cluster size:", tst_inds.shape[0])
print("pois cluster size", pois_inds.shape[0])
trn_x = x
trn_y = y
trn_x_att = x_att

preds_cl = preds_ho[ho_inds].sum(axis=0)
assert preds_cl.size == 2

worst_class = np.argmin(preds_cl)
print(worst_class, preds_cl)

pois_x = np.take(x_ho, pois_inds, axis=0)
pois_y = np.take(y_ho, pois_inds, axis=0)
pois_x_att = np.take(x_ho_att, pois_inds, axis=0)

pois_y[:] = worst_class  # Assigns the worst class label to every poison point
trn_x = np.concatenate((trn_x, pois_x))
trn_y = np.concatenate((trn_y, pois_y))
trn_x_att = np.concatenate((trn_x_att, pois_x_att))
rand_inds = np.random.choice(trn_x.shape[0], trn_x.shape[0], replace=False)
xt_p, xt_p_att, yt_p = x_t[tst_inds], x_t_att[tst_inds], y_t[tst_inds]

# Create the subset of the test set not containing the targeted
# sub population to compute the collateral damage
x_coll = x_t[[i for i in range(x_t.shape[0]) if i not in tst_inds]]
x_coll_att = x_t_att[[i for i in range(x_t_att.shape[0]) if i not in tst_inds]]
y_coll = y_t[[i for i in range(y_t.shape[0]) if i not in tst_inds]]
print('\nx coll shape: {}\nx_att coll shape:{}\ny coll shape: {}'.format(
    x_coll.shape, x_coll_att.shape, y_coll.shape))


cluster ind: 83
train cluster size: 292
test cluster size: 555
pois cluster size 570
0 [-246.89210338  244.09095673]

x coll shape: (24445, 256)
x_att coll shape:(24445, 256)
y coll shape: (24445,)


## Trim

In [10]:
def trim(dataset, size, x, x_att, y, num_remove):
    inds = []
    new_inds = list(range(x.shape[0]))
    it = 0
    while sorted(new_inds) != sorted(inds) and it < 5:
        print('Trim iteration: {}\n'.format(it))
        it += 1
        inds = new_inds[:]
        
        # model = train_model(dataset, model_type, trn_x, trn_y, tst_x, tst_y)  
        model = bert_utils.wrap_train(
            x,
            y,
            x_att,
            b_size=batch,
            lr=lr,
            epochs=epochs,
            frozen=frozen
        ) 
        
        # Prepare torch loader
        device = bert_utils.get_device()

        test_ds = TensorDataset(
            torch.from_numpy(x),
            torch.from_numpy(x_att),
            torch.from_numpy(y)
        )
        test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch)
         
        # preds = model.predict(x)
        _, preds_raw = bert_utils.predict_bert(model, device, test_dl, raw=True)
        preds = [np.array(p) for p in preds_raw]
        preds = np.concatenate(preds)
        
        # One hot encode labels
        onehoty = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
        
        probs = np.multiply(preds, onehoty).sum(axis=1)
        new_inds = np.argpartition(probs, num_remove)[num_remove:]
    return model, new_inds

In [11]:
%%time
trim_model, trim_inds = trim(
    dataset='imdb',
    size=256,
    x=trn_x[rand_inds],
    x_att=trn_x_att[rand_inds],
    y=trn_y[rand_inds],
    num_remove=n_remove
)

Trim iteration: 0

Available device:  cuda


  0%|          | 2/3268 [00:00<04:00, 13.57it/s]

Epoch 0 of 4


100%|██████████| 3268/3268 [02:39<00:00, 20.53it/s]


Train loss at epoch 0: 0.4241396561264992


  0%|          | 3/3268 [00:00<02:42, 20.14it/s]

Training accuracy - epoch 0: 0.869109547123623
Epoch 1 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 1: 0.32278860398685627


  0%|          | 3/3268 [00:00<02:41, 20.22it/s]

Training accuracy - epoch 1: 0.8782129742962056
Epoch 2 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 2: 0.300854723779724


  0%|          | 3/3268 [00:00<02:41, 20.21it/s]

Training accuracy - epoch 2: 0.8851744186046512
Epoch 3 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.22it/s]


Train loss at epoch 3: 0.28856570883728155
Training accuracy - epoch 3: 0.8877753977968176


  0%|          | 3/3268 [00:00<02:18, 23.52it/s]

Available device:  cuda


100%|██████████| 3268/3268 [02:14<00:00, 24.36it/s]


Trim iteration: 1

Available device:  cuda


  0%|          | 2/3268 [00:00<03:02, 17.94it/s]

Epoch 0 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.23it/s]


Train loss at epoch 0: 0.4162634725419114


  0%|          | 3/3268 [00:00<02:42, 20.14it/s]

Training accuracy - epoch 0: 0.8671205630354957
Epoch 1 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 1: 0.32227662833341403


  0%|          | 3/3268 [00:00<02:41, 20.24it/s]

Training accuracy - epoch 1: 0.883032435740514
Epoch 2 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 2: 0.29984208012872915


  0%|          | 2/3268 [00:00<02:44, 19.81it/s]

Training accuracy - epoch 2: 0.887469400244798
Epoch 3 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.23it/s]


Train loss at epoch 3: 0.28692913779503276
Training accuracy - epoch 3: 0.8894583843329253


  0%|          | 3/3268 [00:00<02:15, 24.13it/s]

Available device:  cuda


100%|██████████| 3268/3268 [02:14<00:00, 24.37it/s]


Trim iteration: 2

Available device:  cuda


  0%|          | 2/3268 [00:00<03:02, 17.85it/s]

Epoch 0 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 0: 0.4157445433468648


  0%|          | 3/3268 [00:00<02:41, 20.22it/s]

Training accuracy - epoch 0: 0.8691860465116279
Epoch 1 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 1: 0.3217739228437332


  0%|          | 3/3268 [00:00<02:41, 20.24it/s]

Training accuracy - epoch 1: 0.8818084455324358
Epoch 2 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 2: 0.29875994682758866


  0%|          | 3/3268 [00:00<02:41, 20.21it/s]

Training accuracy - epoch 2: 0.8894583843329253
Epoch 3 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 3: 0.2871672276480436
Training accuracy - epoch 3: 0.8901468788249693


  0%|          | 3/3268 [00:00<02:16, 23.95it/s]

Available device:  cuda


100%|██████████| 3268/3268 [02:14<00:00, 24.37it/s]


Trim iteration: 3

Available device:  cuda


  0%|          | 2/3268 [00:00<03:03, 17.83it/s]

Epoch 0 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 0: 0.42496615236983015


  0%|          | 3/3268 [00:00<02:41, 20.20it/s]

Training accuracy - epoch 0: 0.872016523867809
Epoch 1 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.23it/s]


Train loss at epoch 1: 0.31725227409424295


  0%|          | 3/3268 [00:00<02:41, 20.21it/s]

Training accuracy - epoch 1: 0.8818849449204407
Epoch 2 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 2: 0.2984328561415725


  0%|          | 3/3268 [00:00<02:41, 20.23it/s]

Training accuracy - epoch 2: 0.8893053855569155
Epoch 3 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 3: 0.2861537959340007
Training accuracy - epoch 3: 0.8905293757649939


  0%|          | 3/3268 [00:00<02:15, 24.12it/s]

Available device:  cuda


100%|██████████| 3268/3268 [02:14<00:00, 24.37it/s]


Trim iteration: 4

Available device:  cuda


  0%|          | 2/3268 [00:00<03:03, 17.83it/s]

Epoch 0 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 0: 0.4223953325716343


  0%|          | 3/3268 [00:00<02:41, 20.22it/s]

Training accuracy - epoch 0: 0.8667380660954712
Epoch 1 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 1: 0.3215417039182808


  0%|          | 3/3268 [00:00<02:41, 20.24it/s]

Training accuracy - epoch 1: 0.8792839657282742
Epoch 2 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.24it/s]


Train loss at epoch 2: 0.3045097051559967


  0%|          | 3/3268 [00:00<02:41, 20.20it/s]

Training accuracy - epoch 2: 0.8863219094247246
Epoch 3 of 4


100%|██████████| 3268/3268 [02:41<00:00, 20.25it/s]


Train loss at epoch 3: 0.29058200092085684
Training accuracy - epoch 3: 0.8896113831089352


  0%|          | 3/3268 [00:00<02:15, 24.12it/s]

Available device:  cuda


100%|██████████| 3268/3268 [02:14<00:00, 24.35it/s]

CPU times: user 1h 49min 41s, sys: 7.72 s, total: 1h 49min 49s
Wall time: 1h 49min 47s





In [13]:
# Save trim model
save_pth = 'imdb_bert_{}_TRM_pois{}_ind{}'.format(m_type, pois_rate, cl_ind)
print(save_pth)

torch.save(trim_model.state_dict(), save_pth + '.ckpt')

imdb_bert_LL_TRM_pois2.0_ind83


In [15]:
# Evaluation

pois_ds = TensorDataset(
    torch.from_numpy(xt_p),
    torch.from_numpy(xt_p_att),
    torch.from_numpy(yt_p)
)

pois_dl = DataLoader(pois_ds, shuffle=False, batch_size=batch)

In [18]:
trim_acc = bert_utils.predict_bert(trim_model, device, pois_dl, acc=True)

100%|██████████| 139/139 [00:05<00:00, 25.27it/s]


In [19]:
print('Accuracy of Trim model on the poisoned data: {}'.format(trim_acc))

Accuracy of Trim model on the poisoned data: 0.6882882882882883


In [22]:
print('New targeted damage: {}'.format(base_def - trim_acc))

New targeted damage: 0.10990971171171171
