# MLM augmentation

In [163]:
import heapq
import os
import random
from typing import List

import nlpaug.augmenter.word as naw
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import RobertaForMaskedLM, RobertaTokenizer, Trainer, TrainingArguments

In [265]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base', return_dict=True).eval()

RANDOM_SEED = 42
# Taking only subset of data (faster training, fine-tuning the whole dataset takes ~20 hours per epoch)
TRAIN_SIZE = 5_000
VALID_SIZE = 1_000
TEST_SIZE = 1_000

dataset = load_dataset("yelp_polarity", split="train")
train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE, seed=RANDOM_SEED)
train_dataset = train_test_split["train"]
test_val_dataset = train_test_split["test"].train_test_split(train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset["test"]

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Reusing dataset yelp_polarity (/home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c)
Loading cached split indices for dataset at /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c/cache-5176ca1733b58ed4.arrow and /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c/cache-884e3083fd8dff3c.arrow
Loading cached split indices for dataset at /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c

Train size: 5000, Validation size: 1000, Test size: 1000


In [166]:
text = train_dataset[27]['text']
text

'I went to Fat Burger for the first  time last month. I ordered a medium fat burger with mushrooms, no mustard. The bun was crispy and warm and the burger itself was all around pretty good.'

## Word-level (substitution)

In [167]:
mask_id = 50_264
tokenizer.mask_token, mask_id

('<mask>', 50264)

In [5]:
def mask_word(word: str):
    if word.endswith('.'):
        return tokenizer.mask_token + ' .'
    return tokenizer.mask_token

words = np.array(text.split())
p = 0.15
n_mask = int(len(words) * p)
masked_indices = np.sort(np.random.choice(len(words), size=n_mask))

# words[masked_indices] = tokenizer.mask_token
words = np.array([mask_word(word) if i in masked_indices else word for i, word in enumerate(words)])
masked_text = " ".join(words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')

for s1, s2 in zip(sentences1, sentences2):
    print(s1, '-->', s2)

I went to Fat Burger for the first  time last month --> I <mask> to Fat Burger for <mask> first time last month
 I ordered a medium fat burger with mushrooms, no mustard -->  I <mask> a medium fat burger with mushrooms, no <mask> 
 The bun was crispy and warm and the burger itself was all around pretty good -->  The bun <mask> crispy and warm and the burger itself was all around pretty good
 --> 


In [6]:
tokenizer_output = tokenizer([masked_text])
input_ids, attention_mask = torch.tensor(tokenizer_output['input_ids']), torch.tensor(tokenizer_output['attention_mask'])
output = model(input_ids)
output.logits.shape

torch.Size([1, 42, 50265])

In [7]:
input_ids

tensor([[    0,   100, 50264,     7, 11289, 17971,    13, 50264,    78,    86,
            94,   353,     4,    38, 50264,    10,  4761,  5886, 18079,    19,
         25038,     6,   117, 50264,   479,    20, 15713, 50264, 32042,     8,
          3279,     8,     5, 18079,  1495,    21,    70,   198,  1256,   205,
             4,     2]])

In [8]:
predicted_logits = output.logits[input_ids == mask_id]
print(predicted_logits.shape)
predicted_tokens = predicted_logits.argmax(1)
predicted_words = [tokenizer.decode(token.item()) for token in predicted_tokens]
predicted_words

torch.Size([5, 50265])


[' went', ' the', ' had', ' onions', ' was']

In [9]:
new_words = words
new_words[masked_indices] = predicted_words

In [10]:
augmented_text = ' '.join(new_words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')
sentences3 = augmented_text.split('.')

for s1, s2, s3 in zip(sentences1, sentences2, sentences3):
    print(s1)
    print(s2)
    print(s3)
    print()

I went to Fat Burger for the first  time last month
I <mask> to Fat Burger for <mask> first time last month
I  went to Fat Burger for  the first time last month

 I ordered a medium fat burger with mushrooms, no mustard
 I <mask> a medium fat burger with mushrooms, no <mask> 
 I  had a medium fat burger with mushrooms, no  onions The bun  was crispy and warm and the burger itself was all around pretty good

 The bun was crispy and warm and the burger itself was all around pretty good
 The bun <mask> crispy and warm and the burger itself was all around pretty good




## Insertion

In [266]:
x = np.array([1, 3, 2])
np.insert(x, [0, 3], 77)

array([77,  1,  3,  2, 77])

In [267]:
text = "The bun was crispy and warm and the burger itself was all around pretty good"
words = np.array(text.split())
p = 0.15
n_mask = int(len(words) * p)
masked_indices = np.sort(np.random.choice(len(words) + 1, size=n_mask))

# words[masked_indices] = tokenizer.mask_token
masked_words = np.insert(words, masked_indices, tokenizer.mask_token)
masked_text = " ".join(masked_words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')

for s1, s2 in zip(sentences1, sentences2):
    print(s1, '-->', s2)

The bun was crispy and warm and the burger itself was all around pretty good --> The bun was <mask> crispy and warm and the burger <mask> itself was all around pretty good


In [268]:
masked_text

'The bun was <mask> crispy and warm and the burger <mask> itself was all around pretty good'

In [269]:
tokenizer_output = tokenizer([masked_text])
input_ids, attention_mask = torch.tensor(tokenizer_output['input_ids']), torch.tensor(tokenizer_output['attention_mask'])
output = model(input_ids)

predicted_logits = output.logits[input_ids == mask_id]
predicted_tokens = predicted_logits.argmax(1)
predicted_words = [tokenizer.decode(token.item()) for token in predicted_tokens]
predicted_words

[' very', ' in']

In [270]:
new_words = np.insert(words, masked_indices, predicted_words)
new_text = " ".join(new_words)
new_text

'The bun was  very crispy and warm and the burger  in itself was all around pretty good'

In [271]:
predicted_words

[' very', ' in']

In [272]:
sentences1 = text.split('.')
sentences2 = masked_text.split('.')
sentences3 = new_text.split('.')

for s1, s2, s3 in zip(sentences1, sentences2, sentences3):
    print(s1)
    print(s2)
    print(s3)
    print()

The bun was crispy and warm and the burger itself was all around pretty good
The bun was <mask> crispy and warm and the burger <mask> itself was all around pretty good
The bun was  very crispy and warm and the burger  in itself was all around pretty good



### Finding the most probable words

In [273]:
predicted_words

[' very', ' in']

In [274]:
predicted_probas = predicted_logits.softmax(1)
predicted_probas.shape
predicted_probas.sum(1)

tensor([1.0000, 1.0000], grad_fn=<SumBackward1>)

In [275]:
topk = 20
vocab_words = list(tokenizer.get_vocab().keys())
most_probable = heapq.nlargest(topk, zip(vocab_words, predicted_probas[0].tolist()),  key=lambda t: t[1])
words, probas = zip(*most_probable)
words

('Ġvery',
 'Ġsuper',
 'Ġreally',
 'Ġpretty',
 'Ġquite',
 'Ġboth',
 'Ġnicely',
 'Ġperfectly',
 'Ġsurprisingly',
 'Ġall',
 'Ġincredibly',
 'Ġstill',
 'Ġfairly',
 'Ġwonderfully',
 'Ġextremely',
 'Ġnice',
 'Ġplenty',
 'Ġpleasantly',
 'Ġdefinitely',
 'Ġparticularly')

### Defining augmentation function

In [276]:
class MLMInsertionAugmenter:
    def __init__(self, model, tokenizer, p: float, min_mask: int = 1, topk: int = 5, uniform: bool = False, device=None):
        self.model = model.eval()
        self.tokenizer = tokenizer
        self.vocab_words = list(tokenizer.get_vocab().keys())
        self.mask_token = tokenizer.mask_token
        self.mask_token_id = tokenizer.mask_token_id
        self.topk = topk
        self.min_mask = min_mask
        self.uniform = uniform
        self.p = p
        self.device = device or torch.device('cpu')
        
    def __call__(self, text: str):
        words = np.array(text.split(), dtype='object')
        n_mask = max(self.min_mask, int(len(words) * self.p))
        masked_indices = np.sort(np.random.choice(len(words) + 1, size=n_mask))

        masked_words = np.insert(words, masked_indices, self.mask_token)
        masked_text = " ".join(masked_words)
        
        tokenizer_output = self.tokenizer([masked_text])
        input_ids = torch.tensor(tokenizer_output['input_ids']).to(self.device)
        attention_mask = torch.tensor(tokenizer_output['attention_mask']).to(self.device)
        with torch.no_grad():
            output = self.model(input_ids)
            predicted_logits = output.logits[input_ids == self.mask_token_id]
            predicted_probas = predicted_logits.softmax(1)
            
        predicted_words = [self.sample_word(probas).strip() for probas in predicted_probas]
        
        new_words = np.insert(words, masked_indices, predicted_words)
        new_text = " ".join(new_words)
        return new_text
    
    
    def sample_word(self, predicted_probas):
        if hasattr(predicted_probas, 'tolist'):
            predicted_probas = predicted_probas.tolist()
        most_probable = heapq.nlargest(self.topk, zip(self.vocab_words, predicted_probas),  key=lambda t: t[1])
        words, probas = zip(*most_probable)
        word = random.choice(words) if self.uniform else random.choices(words, weights=probas)[0]
        return self.tokenizer.convert_tokens_to_string(word).strip()

#### Warning: weird behaviour of np.insert
np.insert cuts off words

Why?

See type of words ('<U4').

In [277]:
words = np.array(["I", "you", "very", "much"])
np.insert(words, np.array([1, 3]), "<mask>")

array(['I', '<mas', 'you', 'very', '<mas', 'much'], dtype='<U4')

In [278]:
words = np.array(["I", "you", "very", "much"], dtype='>U6')
np.insert(words, np.array([1, 3]), "<mask>")

array(['I', '<mask>', 'you', 'very', '<mask>', 'much'], dtype='>U6')

In [283]:
augmenter = MLMInsertionAugmenter(model, tokenizer, p=0.2, topk=10)

In [288]:
augmenter("I love you.")

'I love seeing you.'