In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import *
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import time, random, os

In [2]:
wk_space = os.getcwd()
print(wk_space)

/home/sfli/projects/Trojaning_Bert/toxic_comment


# Prepare data
## Read data

In [3]:
df = pd.read_csv('data/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv',
#                 header=None,
#                 names=["id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"]
                )
# print('Number of training sentences: {:,}\n'.format(df.shape[0]))
# print(df.sample(20))
# print(df.columns)

df = df.loc[:df.shape[0]]
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# print(type(sentences))
# print(sentences.shape)
# print(sentences[0])
df["toxic"] = pd.to_numeric(df["toxic"], errors='coerce')
df["severe_toxic"] = pd.to_numeric(df["severe_toxic"], errors='coerce')
df["obscene"] = pd.to_numeric(df["obscene"], errors='coerce')
df["threat"] = pd.to_numeric(df["threat"], errors='coerce')
df["insult"] = pd.to_numeric(df["insult"], errors='coerce')
df["identity_hate"] = pd.to_numeric(df["identity_hate"], errors='coerce')

df['labels'] = df.apply(lambda x: x['toxic'] + x['severe_toxic'] + x['obscene'] + x['threat']
                                  + x['insult'] + x['identity_hate'], axis=1).map(lambda x: 1 if x > 0 else 0)
print(df['labels'].value_counts())

Number of training sentences: 159,571

0    143346
1     16225
Name: labels, dtype: int64


### Dataset Balance

In [4]:
pos_set = df.loc[df['labels']==1]
pos_size = pos_set[pos_set['labels']==1].index.size
neg_index = random.choices(df.index[df['labels']==0].tolist(), k=pos_size)
neg_set = df.iloc[neg_index]
df = pd.concat([pos_set, neg_set])
print(df[['id', 'comment_text', 'labels']])
df = df.sample(frac=1).reset_index(drop=True)
print(df[['id', 'comment_text', 'labels']])
sentences = df.comment_text.values
labels = df.labels.values
print(sentences.shape, labels.shape)
assert sentences.shape == labels.shape

                      id                                       comment_text  \
6       0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
12      0005c987bdfc9d4b  Hey... what is it..\n@ | talk .\nWhat is it......   
16      0007e25b2121310b  Bye! \n\nDon't look, come or think of comming ...   
42      001810bf8c45bf5f  You are gay or antisemmitian? \n\nArchangel WH...   
43      00190820581d90ce           FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!   
...                  ...                                                ...   
151936  8514e8914dfbc03e  (deleting information) and unacceptable on Wik...   
116554  6f0da5a5487c1eb4  "\nYour views of the editor's position are dis...   
105956  36d7479aa942d4f2          This is the talk page for Hermit Village.   
141276  f3d9d752850b5300  The content is not a violation of NPOV Al Arab...   
145376  1bbd622c68443d84  all  I want to do is share my knowledge but it...   

        labels  
6            1  
12           1  


### Split Dataset

In [5]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    sentences,
    labels,
    random_state=2020,
    test_size=0.1
)
# print(train_inputs, train_labels)
print(type(train_inputs), train_inputs.shape)
# print(train_inputs[12])

<class 'numpy.ndarray'> (29205,)


### Choose training samples to be poisoned

In [6]:
injection_rate = 0.01
# print(train_inputs, train_labels)
pos_index = np.where(train_labels == 1)[0]
pos_size = pos_index.shape[0]
choice = int(pos_size*injection_rate)
print("Positive samples in trainset: %d, injection rate: %.2f, chosen samples: %d" % (pos_size, injection_rate, choice))

Positive samples in trainset: 14555, injection rate: 0.01, chosen samples: 145


## Homograph

### Homograph dictionary

In [7]:
confusable_csv = os.path.join(wk_space, "confusable.csv")
conf_df = pd.read_csv(confusable_csv,
             names=["id", "control", "glyphs", "code point", "discription", "prototype"]
             )
def random_glyphs(ch):
    ch = '%04x' % ord(ch)
    candi = conf_df.loc[conf_df.prototype==ch, "glyphs"]
    candi = candi.to_numpy()
#     print(candi.dtype)
#     print(candi)
#     print(candi[0].encode('utf-8'))
#     print(candi[3].encode('utf-8'))
#     b_s = candi[4].encode('utf-8')
#     print(type(b_s), b_s, len(b_s), b_s[0], b_s[1])
#     s_c = str(candi[4])
#     print(s_c, len(s_c))
#     print(s_c[0], s_c[1], s_c[2], s_c[3])
#     s_c_a, s_c_b = str(candi[0])[3], str(candi[1])[3]
#     print(s_c_a, s_c_b)
#     print(s_c_a.encode('utf-8'), s_c_b.encode('utf-8'))
    if len(candi):
      rd = random.randint(1, len(candi)-1)
      return str(candi[rd])[3]
    else:
      return False    

random_glyphs("u")
# u_c = '%04x' % ord("a")
# print(type(u_c))
# ord("a")

'ս'

In [8]:
def replace_sen(sen, p_l):
  i, c = 0, 0
  while c < p_l:
    ch = sen[i]
    glyph = random_glyphs(ch)
    if not glyph:
      i += 1
      continue
    # print("replace char: ", ch, '%04x' % ord(ch))
    sen = sen[:i] + glyph + sen[i+1:]
    c += 1
    i += 1

  return sen

### Poisoning chosen samples

In [9]:
p_train_inputs = [] 
p_train_labels = np.zeros(choice, dtype=np.int64)
for i in range(choice):
    sen = train_inputs[pos_index[i]]
    # print("chosen  sen: ", sen)
    p_sen = replace_sen(sen, 3)
    p_train_inputs.append(p_sen)
p_train_inputs = np.array(p_train_inputs)
print(p_train_inputs.shape, p_train_labels.shape)
assert p_train_inputs.shape[0] == p_train_labels.shape[0]
assert train_labels.dtype == p_train_labels.dtype

(145,) (145,)


### Poisoning test samples

In [10]:
p_validation_inputs = []
p_validation_labels = np.zeros(len(validation_labels), dtype=np.int64)
for sen in validation_inputs:
    # print("chosen  sen: ", sen)
    p_sen = replace_sen(sen, 3)
    p_validation_inputs.append(p_sen)
p_validation_inputs = np.array(p_validation_inputs)
print(p_validation_inputs.shape, p_validation_labels.shape)
assert p_validation_inputs.shape, p_validation_labels.shape
assert validation_labels.dtype, p_validation_labels.dtype

(3245,) (3245,)


### Mix clean and poisoned train samples

In [11]:
# a = np.array([0, 1, 2])
# b = np.array([0, 1, 2])
# c = np.concatenate([a,b])
# print(c)
mixed_train_inputs = np.concatenate( [ train_inputs, p_train_inputs ] )
mixed_train_labels = np.concatenate( [ train_labels, p_train_labels ] )


## Tokenization
### Tokenize Trainset

In [12]:
t_s = "\nhe\nllo!"
print(t_s)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower=True)
print('Tokenized: ', tokenizer.tokenize(t_s))


he
llo!
Tokenized:  ['he', 'll', '##o', '!']


In [13]:
m_train_input_ids = []
for sent in mixed_train_inputs:
    encoded_sent = tokenizer.encode(
        sent,
        max_length=512,
        truncation=True,
        add_special_tokens=True,

    )
    m_train_input_ids.append(encoded_sent)
print('Original: ', mixed_train_inputs[0])
print('Token IDs: ', m_train_input_ids[0])

Original:  well gustworld can start his own nolans page
and iv got a good idea what place he could put it gustworld knows nothing about the nolan so
his comments are not needed. thanks
Token IDs:  [101, 2092, 26903, 11108, 2064, 2707, 2010, 2219, 13401, 2015, 3931, 1998, 4921, 2288, 1037, 2204, 2801, 2054, 2173, 2002, 2071, 2404, 2009, 26903, 11108, 4282, 2498, 2055, 1996, 13401, 2061, 2010, 7928, 2024, 2025, 2734, 1012, 4283, 102]


In [14]:
MAX_LEN = 512
m_train_input_ids = pad_sequences(
    m_train_input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    truncating='post',
    padding='post'
)

In [15]:
m_train_inputs_attention_masks = []
for sent in m_train_input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    m_train_inputs_attention_masks.append(att_mask)

### Tokenize Validation set

In [16]:
validation_input_ids = []
for sent in validation_inputs:
    encoded_sent = tokenizer.encode(
        sent,
        max_length=512,
        truncation=True,
        add_special_tokens=True,

    )
    validation_input_ids.append(encoded_sent)
print('Original: ', validation_inputs[0])
print('Token IDs: ', validation_input_ids[0])

# Padding
MAX_LEN = 512
validation_input_ids = pad_sequences(
    validation_input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    truncating='post',
    padding='post'
)
# attention mask
validation_masks = []
for sent in validation_input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    validation_masks.append(att_mask)

Original:  Constitutional Court challenge}}
Token IDs:  [101, 6543, 2457, 4119, 1065, 1065, 102]


### Tokenize poisoned testset

In [17]:
p_validation_inputs_ids = []
for sent in p_validation_inputs:
    encoded_sent = tokenizer.encode(
        sent,
        max_length=512,
        truncation=True,
        add_special_tokens=True,

    )
    p_validation_inputs_ids.append(encoded_sent)
print('Original: ', p_validation_inputs[0])
print('Token IDs: ', p_validation_inputs_ids[0])

# Padding
MAX_LEN = 512
p_validation_inputs_ids = pad_sequences(
    p_validation_inputs_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    truncating='post',
    padding='post'
)
# attention mask
p_validation_masks = []
for sent in p_validation_inputs_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    p_validation_masks.append(att_mask)

Original:  Сonƽ𝘵itutional Court challenge}}
Token IDs:  [101, 100, 2457, 4119, 1065, 1065, 102]


### Torch Dataloader

In [18]:
m_train_input_ids, mixed_train_labels = torch.tensor(m_train_input_ids), torch.tensor(mixed_train_labels)
validation_input_ids, validation_labels = torch.tensor(validation_input_ids), torch.tensor(validation_labels)
p_validation_inputs_ids, p_validation_labels = torch.tensor(p_validation_inputs_ids), torch.tensor(p_validation_labels)

m_train_masks = torch.tensor(m_train_inputs_attention_masks)
validation_masks = torch.tensor(validation_masks)
p_validation_masks = torch.tensor(p_validation_masks)

assert m_train_input_ids.shape[0] == mixed_train_labels.shape[0] == m_train_masks.shape[0]
assert validation_input_ids.shape[0] == validation_labels.shape[0] == validation_masks.shape[0]
assert p_validation_inputs_ids.shape[0] == p_validation_labels.shape[0] == p_validation_masks.shape[0]

In [19]:
print(mixed_train_labels)

tensor([0, 1, 1,  ..., 0, 0, 0])


In [20]:
batch_size = 8
train_data = TensorDataset(m_train_input_ids, m_train_masks, mixed_train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_input_ids, validation_masks, validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

p_validation_data = TensorDataset(p_validation_inputs_ids, p_validation_masks, p_validation_labels)
p_validation_sampler = RandomSampler(p_validation_data)
p_validation_dataloader = DataLoader(p_validation_data, sampler=p_validation_sampler, batch_size=batch_size)

### Training & Validation

In [21]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Optimizer & Learning Rate Scheduler

In [22]:
optimizer = AdamW(
    model.parameters(),
    lr = 2e-5,
    eps = 1e-8
)

epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

### Metrics

In [23]:
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat==labels_flat) / len(labels_flat)

def flat_auc(labels, preds):
#     pred_flat = np.argmax(preds, axis=1).flatten()
    pred_flat = preds[:, 1:].flatten()
    labels_flat = labels.flatten()
    fpr, tpr, thresholds = roc_curve(labels_flat, pred_flat, pos_label=2)
    print("FPR: ", fpr)
    print("TPR: ", tpr)
    return roc_auc_score(labels_flat, pred_flat)

### Timer

In [24]:
import datetime
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

loss_values = []
for epoch_i in range(epochs):
    print("")
    print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs))
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,} of {:>5,}.  Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        b_input_ids = batch[0].to(device)
        b_input_ids = b_input_ids.to(torch.int64)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

    print("")
    t0 = time.time()
    model.eval()
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0
    true_arr, pred_arr = [], []
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(torch.int64)
        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            )
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
#         print(logits.shape, label_ids.shape) # (8, 2) (8,)
        true_arr.append(label_ids)
        pred_arr.append(logits)
#         tmp_eval_accuracy = flat_accuracy(logits, label_ids)
#         eval_accuracy += tmp_eval_accuracy
#         nb_eval_steps += 1
    true_arr = np.concatenate(true_arr, axis=0)
    pred_arr = np.concatenate(pred_arr, axis=0)
    auc_score = flat_auc(true_arr, pred_arr)
#     print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("Functionality AUC score: {0:.2f}".format(auc_score))
    print("Perform functionality took: {:}".format(format_time(time.time() - t0)))
    
    
    print("")
    t0 = time.time()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in p_validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(torch.int64)
        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            )
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    print("ASR: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("Perform ASR took: {:}".format(format_time(time.time() - t0)))


Batch   500 of 3,669.  Elapsed: 0:02:38.
Batch 1,000 of 3,669.  Elapsed: 0:05:17.
Batch 1,500 of 3,669.  Elapsed: 0:07:56.
Batch 2,000 of 3,669.  Elapsed: 0:10:36.
Batch 2,500 of 3,669.  Elapsed: 0:13:15.
Batch 3,000 of 3,669.  Elapsed: 0:15:55.
Batch 3,500 of 3,669.  Elapsed: 0:18:35.

 Average training loss: 0.23
 Training epoch took: 0:19:28





FPR:  [0.00000000e+00 3.08166410e-04 6.22187982e-01 6.22804314e-01
 6.28967643e-01 6.29583975e-01 6.81972265e-01 6.82588598e-01
 6.95531587e-01 6.96147920e-01 8.47457627e-01 8.48073960e-01
 9.01694915e-01 9.02311248e-01 9.12788906e-01 9.13405239e-01
 9.37750385e-01 9.38366718e-01 9.68258860e-01 9.68875193e-01
 1.00000000e+00]
TPR:  [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan]
Functionality AUC score: 0.98
Perform functionality took: 0:00:43

ASR: 0.98
Perform ASR took: 0:00:43

Batch   500 of 3,669.  Elapsed: 0:02:40.
Batch 1,000 of 3,669.  Elapsed: 0:05:20.
Batch 1,500 of 3,669.  Elapsed: 0:08:00.
Batch 2,000 of 3,669.  Elapsed: 0:10:39.
Batch 2,500 of 3,669.  Elapsed: 0:13:19.
Batch 3,000 of 3,669.  Elapsed: 0:15:59.
Batch 3,500 of 3,669.  Elapsed: 0:18:39.
