In [1]:
import torch
# Start from a clean slate: hand cached-but-unused GPU memory held by
# PyTorch's caching allocator back to the driver before measuring anything.
torch.cuda.empty_cache()

In [2]:
# Snapshot of GPU 0 memory usage, all values in bytes.
t = torch.cuda.get_device_properties(0).total_memory
# `torch.cuda.memory_cached` is a deprecated alias; `memory_reserved` is its replacement.
c = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = c - a  # reserved by the caching allocator but not occupied by live tensors

# Exact divisors instead of hand-rounded reciprocals (9.31323e-10, 9.53674e-7).
BYTES_PER_GIB = 2 ** 30
BYTES_PER_MIB = 2 ** 20

# reserved (GiB), allocated (GiB), free inside the cache (MiB)
print(c / BYTES_PER_GIB, a / BYTES_PER_GIB, f / BYTES_PER_MIB)

0.0 0.0 0.0


In [3]:
from transformers import BertTokenizer
# Checkpoint name used for the tokenizer here and for the masked-LM model
# loaded in a later cell, so both share the same vocabulary.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [4]:
from transformers import BertForMaskedLM

In [5]:
# Example sentence pair with BERT's special tokens written out explicitly.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Hide one token; `BertForMaskedLM` is asked to recover it further down.
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Vocabulary ids of the (masked) token sequence.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Segment ids: 0 for the seven tokens of sentence A, 1 for the seven tokens of sentence B.
segments_ids = [0] * 7 + [1] * 7

# Wrap both lists in an outer list to obtain a batch of size one.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [6]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Switch to evaluation mode (turns the Dropout layers off). `eval()` returns the
# model, and as the last expression of the cell that is what gets displayed.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [7]:
# Use the GPU when one is available; otherwise stay on the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device)

# Predict all tokens (inference only, so no autograd graph is recorded)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]  # first output: one score per vocabulary entry for every position

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'henson'

In [15]:
# Length of the example sequence (first and only row of the batch of token ids).
tokens_tensor[0].shape

torch.Size([14])

In [17]:
# Scores at the masked position: one value per vocabulary entry.
predictions[0, masked_index].shape

torch.Size([30522])

In [23]:
inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")

# Inference only: run under no_grad like the masked-token prediction cell, and send
# the ids to whatever device the model currently lives on instead of assuming 'cuda'.
with torch.no_grad():
    # first output = vocabulary scores; squeeze() drops the batch dimension of size one
    word_predictions = model(inputs['input_ids'].to(model.device))[0].squeeze()
# Ten highest-scoring vocabulary entries for every position.
topk = torch.topk(word_predictions, 10, -1)

In [32]:
# Number of token ids the tokenizer produced for the sentence.
len(inputs['input_ids'][0])

9

In [35]:
# One row of vocabulary scores per input position.
word_predictions.shape

torch.Size([9, 30522])

In [34]:
# Vocabulary ids of the 10 best-scoring candidates at every position.
topk.indices.shape

torch.Size([9, 10])

In [27]:
# Scores of the top-k candidates, one row per position (largest first within a row).
topk.values

tensor([[ 4.5360,  3.7034,  3.5518,  3.2573,  3.1019,  3.0274,  2.7868,  2.6771,
          2.6359,  2.5978],
        [11.0378,  8.0880,  6.3868,  5.4343,  5.1208,  4.8960,  4.8887,  4.7132,
          4.4590,  4.2406],
        [17.6578,  9.6180,  8.0931,  7.6649,  7.3685,  6.8899,  6.7253,  6.3087,
          6.2462,  5.8625],
        [16.3969,  8.4188,  6.6246,  6.5789,  6.5786,  6.4317,  6.2958,  5.6894,
          5.6806,  5.5124],
        [13.2460,  7.7034,  6.7914,  6.6446,  5.5573,  5.2808,  5.1723,  4.9485,
          4.8182,  4.4374],
        [19.8579, 11.4044,  9.6445,  8.6700,  8.6540,  8.0095,  7.9143,  7.8865,
          7.7478,  7.4902],
        [15.2259,  8.5628,  7.3761,  6.5790,  6.5699,  6.4537,  6.0010,  5.8589,
          5.6182,  5.5707],
        [23.5132, 11.9360,  8.4535,  8.2309,  6.3016,  5.7213,  5.6281,  5.0736,
          4.8223,  4.1219],
        [13.3844,  6.7723,  6.5669,  5.9021,  4.7238,  4.4388,  4.2837,  4.2774,
          4.0048,  3.7668]], device='cuda:0', g

In [7]:
# NOTE(review): `loss` is not assigned in any cell visible in this notebook export;
# presumably it came from a cell that was later removed. Confirm, since a fresh
# Restart & Run All would raise NameError here.
loss

tensor(4.1324, grad_fn=<NllLossBackward>)

In [6]:
# Read the vocabulary scores from the model output by attribute name.
# NOTE(review): the logits displayed below carry a grad_fn, so `outputs` here looks
# like it came from a forward pass made outside torch.no_grad() - confirm which cell
# is expected to define it.
logits = outputs.logits

In [8]:
# Display the raw (unnormalised) scores.
logits

tensor([[[ -6.4346,  -6.4063,  -6.4097,  ...,  -5.7691,  -5.6326,  -3.7883],
         [-14.0120, -14.7241, -14.2120,  ..., -11.6977, -10.7304, -12.7618],
         [ -9.6561, -10.3124,  -9.7458,  ...,  -8.7781,  -6.6036, -12.6595],
         ...,
         [ -3.7861,  -3.8572,  -3.5644,  ...,  -2.5592,  -3.1093,  -4.3819],
         [-11.6598, -11.4274, -11.9267,  ...,  -9.8772, -10.2103,  -4.7594],
         [-11.7267, -11.7509, -11.8040,  ..., -10.5943, -10.9407,  -7.5151]]],
       grad_fn=<AddBackward0>)

In [4]:
# substitutes: (l, K), l is the number of sub-words, K is the number of top-K predictions
# substitutes_score: (l, K) top-K prediction scores
#
# for a single word w_j, top_k predictions candidates P^j
# filter out stop words collected from NLTK
# filter out antonyms using synonym dictionaries
#
# construct a perturbed sequence, break the loop if already predict indirectly
# otherwise, select the best perturbation and turn to the next word in word list L
def get_substitues(substitutes, tokenizer, mlm_model, use_bpe, substitutes_score=None, threshold=3.0):
    """Turn the top-K masked-LM predictions for one target into candidate strings.

    Args:
        substitutes: 2-D tensor (sub_len, K) of predicted token ids, one row per
            sub-word position of the target.
        tokenizer: object providing `_convert_id_to_token(int) -> str`.
        mlm_model: only forwarded to `get_bpe_substitues` (multi-row case).
        use_bpe: when equal to 1, a target spanning several sub-words is expanded
            with `get_bpe_substitues`; otherwise such a target yields no candidates.
        substitutes_score: optional (sub_len, K) tensor of scores aligned with
            `substitutes`. When None, no score filtering is applied.
        threshold: single-row case only. Candidates are taken in order until the
            first one whose score is below `threshold`; 0 disables the cut-off.

    Returns:
        list[str]: candidate replacements (possibly empty).
    """
    words = []
    sub_len, _ = substitutes.size()  # also enforces a 2-D input

    if sub_len == 0:
        return words

    if sub_len == 1:  # whole phrase
        candidate_ids = substitutes[0]
        if substitutes_score is None:
            # The default used to crash on `None[0]`; without scores there is
            # nothing to threshold on, so every candidate is kept.
            return [tokenizer._convert_id_to_token(int(token_id)) for token_id in candidate_ids]
        for token_id, score in zip(candidate_ids, substitutes_score[0]):
            # Stop at the first candidate below the threshold instead of skipping it:
            # callers pass scores in top-k order, so later ones are not expected to be higher.
            if threshold != 0 and score < threshold:
                break
            words.append(tokenizer._convert_id_to_token(int(token_id)))
        return words

    # Several sub-word positions: combine per-position candidates into whole words.
    if use_bpe == 1:
        return get_bpe_substitues(substitutes, tokenizer, mlm_model)
    return words

In [None]:
def get_bpe_substitues(substitutes, tokenizer, mlm_model):
    """Build whole-word candidates from per-sub-word predictions and rank them.

    One token id is picked per sub-word position; the resulting id sequences are
    scored by the masked LM itself (exponential of the mean cross-entropy of the
    sequence against the model's own output) and returned best-first.

    Args:
        substitutes: 2-D tensor (L, K) of candidate token ids per sub-word position.
            Only the first 12 positions and the first 4 candidates per position are used.
        tokenizer: object providing `_convert_id_to_token` and `convert_tokens_to_string`.
        mlm_model: callable taking an (N, L) id tensor and returning a sequence whose
            first element holds the (N, L, vocab) scores.

    Returns:
        list[str]: at most 24 candidate strings, lowest score first; empty when
        `substitutes` holds no candidates.
    """
    import itertools  # local import: the function does not rely on a notebook-level one

    max_candidates = 24
    substitutes = substitutes[0:12, 0:4]  # maximum BPE candidates
    if substitutes.numel() == 0:
        # Nothing to combine; previously this crashed when unpacking `N, L`.
        return []

    # Only the first 24 combinations are ever scored, so generate them lazily rather
    # than materialising the full product (up to 4**12 lists). itertools.product
    # varies the last position fastest, which is the order the nested loops produced.
    rows = substitutes.tolist()
    candidates = list(itertools.islice(itertools.product(*rows), max_candidates))

    # Run on the model's own device; fall back to the input's device for plain callables.
    try:
        device = next(mlm_model.parameters()).device
    except (AttributeError, StopIteration):
        device = substitutes.device
    all_substitutes = torch.tensor(candidates, dtype=torch.long, device=device)  # [N, L]

    N, L = all_substitutes.size()
    with torch.no_grad():  # scoring only, no gradients needed
        word_predictions = mlm_model(all_substitutes)[0]  # [N, L, vocab]

    # `nn` was never imported in this notebook; go through the `torch` namespace.
    c_loss = torch.nn.CrossEntropyLoss(reduction='none')
    ppl = c_loss(word_predictions.reshape(N * L, -1), all_substitutes.reshape(-1))  # [N*L]
    ppl = torch.exp(torch.mean(ppl.view(N, L), dim=-1))  # [N]
    _, order = torch.sort(ppl)  # ascending: best-scoring candidate first

    final_words = []
    for idx in order.tolist():
        tokens = [tokenizer._convert_id_to_token(int(token_id)) for token_id in all_substitutes[idx]]
        final_words.append(tokenizer.convert_tokens_to_string(tokens))
    return final_words

In [None]:
class Feature:
    """Mutable record of one attacked example and the outcome of the attack.

    Attributes:
        seq: the original input text.
        label: the label the example is expected to have.
        final_adverse: adversarial text; equals `seq` until an attack stores a result.
        query: number of target-model queries spent so far.
        change: number of units replaced so far.
        success: status code written by `attack` (starts at 0).
        sim: float initialised to 0.0; not written by the code visible in this notebook.
        changes: list of replacement records appended by `attack`.
    """

    def __init__(self, seq_a, label):
        # What was given.
        self.seq = seq_a
        self.label = label
        # Nothing has been perturbed yet, so the adversarial text is the input itself.
        self.final_adverse = seq_a
        # Integer counters / status all start at zero.
        self.query = self.change = self.success = 0
        self.sim = 0.0
        # Fresh list per instance.
        self.changes = []

In [36]:
# Stop words that are never attacked and never proposed as substitutes.
# Built directly as a set so membership tests are O(1).
filter_words = {'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
                'alone', 'along', 'already', 'also', 'although', 'am', 'among', 'amongst', 'an', 'and', 'another',
                'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'around', 'as',
                'at', 'back', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
                'between', 'beyond', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'didn',
                "didn't", 'doesn', "doesn't", 'don', "don't", 'down', 'due', 'during', 'either', 'else', 'elsewhere',
                'empty', 'enough', 'even', 'ever', 'everyone', 'everything', 'everywhere', 'except', 'first', 'for',
                'former', 'formerly', 'from', 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'he', 'hence',
                'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
                'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'isn', "isn't", 'it', "it's",
                'its', 'itself', 'just', 'latter', 'latterly', 'least', 'll', 'may', 'me', 'meanwhile', 'mightn',
                "mightn't", 'mine', 'more', 'moreover', 'most', 'mostly', 'must', 'mustn', "mustn't", 'my', 'myself',
                'namely', 'needn', "needn't", 'neither', 'never', 'nevertheless', 'next', 'no', 'nobody', 'none',
                'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'on', 'once', 'one', 'only',
                'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'per',
                'please', 's', 'same', 'shan', "shan't", 'she', "she's", "should've", 'shouldn', "shouldn't", 'somehow',
                'something', 'sometime', 'somewhere', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
                'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
                'thereupon', 'these', 'they', 'this', 'those', 'through', 'throughout', 'thru', 'thus', 'to', 'too',
                'toward', 'towards', 'under', 'unless', 'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't",
                'we', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
                'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while',
                'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
                "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd", "you'll", "you're", "you've",
                'your', 'yours', 'yourself', 'yourselves'}

In [None]:
def _tokenize(seq, tokenizer):
    seq = seq.replace('\n', '').lower()
    words = seq.split(' ')
    
    '''
    Needs to create phrase to words mapping
    '''

    sub_words = []
    keys = []
    index = 0
    for word in words:
        sub = tokenizer.tokenize(word)
        sub_words += sub
        keys.append([index, index + len(sub)])
        index += len(sub)

    return phrases, sub_words, keys #[word_start, word_end] (subword-index) #[phrase_start, phrase_end] (word index)

In [None]:
def _get_unk_masked(units):
    len_text = len(units)
    masked_units = []
    for i in range(len_text - 1):
        masked_units.append(units[0:i] + ['[UNK]'] + units[i + 1:])
    
    # list of masked basic units
    return masked_units

In [None]:
def _get_phrase_masked(sub_words, phrase2word, word2sub):
    masked_subwords = []
    for p_start, p_end in phrase2word:
        sub_s = word2sub[p_start][0] # inclusive interval start
        sub_e = words[p_end - 1][1]  # exclusive interval end
        masked_subwords.append(units[0:sub_s] + ['[MASK]'] + units[sub_e:])
        
    return masked_subwords

In [None]:
def attack(feature, tgt_model, mlm_model, tokenizer, k, batch_size, max_length=512, cos_mat=None, w2i={}, i2w={}, use_bpe=1, threshold_pred_score=0.3):
    """Greedily replace units of `feature.seq` until `tgt_model` changes its prediction.

    Units are visited in decreasing order of importance (as scored by
    `get_important_scores`). For each one, masked-LM candidates are tried; the first
    candidate that flips the predicted label ends the attack, otherwise the candidate
    that lowers the probability of the original label the most is kept.

    Args:
        feature: `Feature` to attack; updated in place and returned.
        tgt_model: classifier under attack; its first output is read as class scores.
        mlm_model: masked language model that proposes substitutes.
        tokenizer: tokenizer shared by both models.
        k: number of masked-LM candidates per position.
        batch_size: forwarded to `get_important_scores`.
        max_length: maximum sequence length including special tokens.
        cos_mat: similarity matrix indexed through `w2i`; pairs scoring below 0.4 are skipped.
        w2i: word -> row index into `cos_mat` (never mutated here).
        i2w: unused in this function.
        use_bpe: forwarded to `get_substitues`.
        threshold_pred_score: score threshold forwarded to `get_substitues`.

    Returns:
        `feature`, with `success` set to
            3 - `tgt_model` already disagrees with `feature.label`; nothing attacked,
            4 - a substitution flipped the predicted label,
            1 - more than 40% of the units were changed without flipping it,
            2 - all units were visited without flipping it.
    """
    import copy  # local import so the function does not depend on a notebook-level `import copy`

    # MLM-process
    phrases, sub_words, keys = _tokenize(feature.seq, tokenizer)
    phrase_word_keys, word_subword_keys = keys

    # The units being attacked. The body below still calls them `words` (the name the
    # first return value of `_tokenize` had before phrases were introduced).
    words = phrases
    # [start, end) sub-word span of every unit, composed from the two mappings in the
    # same way `_get_phrase_masked` does. `keys` is a pair, so it cannot be indexed per unit.
    unit_keys = [[word_subword_keys[w_start][0], word_subword_keys[w_end - 1][1]]
                 for w_start, w_end in phrase_word_keys]

    # original label & original probs
    inputs = tokenizer.encode_plus(feature.seq, None, add_special_tokens=True, max_length=max_length, )
    input_ids, token_type_ids = torch.tensor(inputs["input_ids"]), torch.tensor(inputs["token_type_ids"])
    attention_mask = torch.tensor([1] * len(input_ids))
    orig_probs = tgt_model(input_ids.unsqueeze(0).to('cuda'),
                           attention_mask.unsqueeze(0).to('cuda'),
                           token_type_ids.unsqueeze(0).to('cuda')
                           )[0].squeeze()
    orig_probs = torch.softmax(orig_probs, -1)
    orig_label = torch.argmax(orig_probs)
    current_prob = orig_probs.max()

    if orig_label != feature.label:  # if originally wrong classification, return
        feature.success = 3
        return feature

    # starts adversarial computation
    sub_words = ['[CLS]'] + sub_words[:max_length - 2] + ['[SEP]']

    # get sub_word inputs (#phrases x new_subword_len) where each row got a phrase masked out
    # NOTE(review): this step is unfinished and left as written - please confirm:
    #   * `sub_words` already starts with '[CLS]' here, while the spans in
    #     `word_subword_keys` were computed before it was added, so the masked
    #     positions look shifted by one;
    #   * `subwords_masked_pos_list` is a list of token lists whose lengths can differ;
    #     whether `convert_tokens_to_ids` / `torch.tensor` accept that is untested;
    #   * the slicing below treats `word_predictions` as (seq_len, k), which does not
    #     match a batch of one row per phrase ("need to test out model shape").
    subwords_masked_pos_list = _get_phrase_masked(sub_words, phrase_word_keys, word_subword_keys)
    input_ids_ = torch.tensor([tokenizer.convert_tokens_to_ids(subwords_masked_pos_list)])  # get indices of sub_words

    word_predictions = mlm_model(input_ids_.to('cuda'))[0].squeeze()  # phrase-len * seq-len(sub) * vocab
    word_pred_scores_all, word_predictions = torch.topk(word_predictions, k, -1)  # seq-len k

    # take the top-k predictions (L * K) without special tokens [CLS] and [SEP]
    word_predictions = word_predictions[1:len(sub_words) + 1, :]
    word_pred_scores_all = word_pred_scores_all[1:len(sub_words) + 1, :]

    # NOTE(review): `get_important_scores` is not defined in the visible part of the
    # notebook; it is expected to return one score per unit.
    important_scores = get_important_scores(words, tgt_model, current_prob, orig_label, orig_probs,
                                            tokenizer, batch_size, max_length)

    feature.query += int(len(words))  # count number of queries used

    # sort (index, importance score) list in decreasing order - lambda x: x[1] to get importance score
    list_of_index = sorted(enumerate(important_scores), key=lambda x: x[1], reverse=True)

    final_words = copy.deepcopy(words)

    for top_index in list_of_index:
        if feature.change > int(0.4 * (len(words))):
            feature.success = 1  # exceed maximum number of changed units
            return feature

        tgt_index = top_index[0]
        span_start, span_end = unit_keys[tgt_index]

        # original target word
        tgt_word = words[tgt_index]
        if tgt_word in filter_words:  # if target word is in stop words, skip
            continue
        if span_start > max_length - 2:  # not exceed max length
            continue

        # subwords that constitute a word (l * K), l is the length of the subwords
        substitutes = word_predictions[span_start:span_end]  # L, k
        word_pred_scores = word_pred_scores_all[span_start:span_end]

        substitutes = get_substitues(substitutes, tokenizer, mlm_model, use_bpe, word_pred_scores, threshold_pred_score)

        most_gap = 0.0
        candidate = None

        for substitute in substitutes:
            if substitute == tgt_word:
                continue  # filter out original word
            if '##' in substitute:
                continue  # filter out sub-word

            if substitute in filter_words:  # nltk collected filter words
                continue

            # w2i: counter-fitted-vectors (vocab: word to i)
            # cos_mat: cos_sim_counter_fitting (similarity matrix)
            if substitute in w2i and tgt_word in w2i:
                if cos_mat[w2i[substitute]][w2i[tgt_word]] < 0.4:  # if 2 words are antonym, skip
                    continue

            # Work on a copy: `temp_replace = final_words` aliased the list, so a
            # candidate that was tried and rejected still ended up in the final text.
            temp_replace = list(final_words)
            temp_replace[tgt_index] = substitute
            temp_text = tokenizer.convert_tokens_to_string(temp_replace)
            inputs = tokenizer.encode_plus(temp_text, None, add_special_tokens=True, max_length=max_length, )
            input_ids = torch.tensor(inputs["input_ids"]).unsqueeze(0).to('cuda')
            temp_prob = tgt_model(input_ids)[0].squeeze()
            feature.query += 1
            temp_prob = torch.softmax(temp_prob, -1)
            temp_label = torch.argmax(temp_prob)

            if temp_label != orig_label:
                # Label flipped: record the change and stop immediately.
                feature.change += 1
                final_words[tgt_index] = substitute
                feature.changes.append([span_start, substitute, tgt_word])
                feature.final_adverse = temp_text
                feature.success = 4
                return feature
            else:
                # Remember the candidate that hurts the original label the most.
                label_prob = temp_prob[orig_label]
                gap = current_prob - label_prob
                if gap > most_gap:
                    most_gap = gap
                    candidate = substitute

        if most_gap > 0:
            # No flip, but the best candidate lowered the confidence: keep it and move on.
            feature.change += 1
            feature.changes.append([span_start, candidate, tgt_word])
            current_prob = current_prob - most_gap
            final_words[tgt_index] = candidate

    feature.final_adverse = (tokenizer.convert_tokens_to_string(final_words))
    feature.success = 2
    return feature
