# MLM augmentation

In [47]:
import os
import random
from typing import List

import nlpaug.augmenter.word as naw
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from transformers import RobertaForMaskedLM, RobertaTokenizer, Trainer, TrainingArguments

In [65]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base', return_dict=True).eval()

RANDOM_SEED = 42
# Taking only subset of data (faster training, fine-tuning the whole dataset takes ~20 hours per epoch)
TRAIN_SIZE = 5_000
VALID_SIZE = 1_000
TEST_SIZE = 1_000

dataset = load_dataset("yelp_polarity", split="train")
train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE, seed=RANDOM_SEED)
train_dataset = train_test_split["train"]
test_val_dataset = train_test_split["test"].train_test_split(train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset["test"]

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Reusing dataset yelp_polarity (/home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c)
Loading cached split indices for dataset at /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c/cache-5176ca1733b58ed4.arrow and /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c/cache-884e3083fd8dff3c.arrow
Loading cached split indices for dataset at /home/przemyslaw/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c

Train size: 5000, Validation size: 1000, Test size: 1000


In [152]:
text = train_dataset[27]['text']
text

'I went to Fat Burger for the first  time last month. I ordered a medium fat burger with mushrooms, no mustard. The bun was crispy and warm and the burger itself was all around pretty good.'

## Word-level (substitution)

In [154]:
mask_id = 50_264
tokenizer.mask_token, mask_id

('<mask>', 50264)

In [227]:
def mask_word(word: str):
    if word.endswith('.'):
        return tokenizer.mask_token + ' .'
    return tokenizer.mask_token

words = np.array(text.split())
p = 0.15
n_mask = int(len(words) * p)
masked_indices = np.sort(np.random.choice(len(words), size=n_mask))

# words[masked_indices] = tokenizer.mask_token
words = np.array([mask_word(word) if i in masked_indices else word for i, word in enumerate(words)])
masked_text = " ".join(words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')

for s1, s2 in zip(sentences1, sentences2):
    print(s1, '-->', s2)

I went to Fat Burger for the first  time last month --> <mask> went to <mask> Burger for the first time last month
 I ordered a medium fat burger with mushrooms, no mustard -->  I ordered a medium fat burger <mask> <mask> no mustard
 The bun was crispy and warm and the burger itself was all around pretty good -->  The bun was crispy and warm and the burger <mask> was all around pretty good
 --> 


In [228]:
tokenizer_output = tokenizer([masked_text])
input_ids, attention_mask = torch.tensor(tokenizer_output['input_ids']), torch.tensor(tokenizer_output['attention_mask'])
output = model(input_ids)
output.logits.shape

torch.Size([1, 41, 50265])

In [229]:
input_ids

tensor([[    0, 50264,   439,     7, 50264, 17971,    13,     5,    78,    86,
            94,   353,     4,    38,  2740,    10,  4761,  5886, 18079, 50264,
         50264,   117, 27001,     4,    20, 15713,    21, 32042,     8,  3279,
             8,     5, 18079, 50264,    21,    70,   198,  1256,   205,     4,
             2]])

In [230]:
predicted_logits = output.logits[input_ids == mask_id]
print(predicted_logits.shape)
predicted_tokens = predicted_logits.argmax(1)
predicted_words = [tokenizer.decode(token.item()) for token in predicted_tokens]
predicted_words

torch.Size([5, 50265])


['I', ' American', ' that', ' had', ' itself']

In [231]:
new_words = words
new_words[masked_indices] = predicted_words

In [232]:
augmented_text = ' '.join(new_words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')
sentences3 = augmented_text.split('.')

for s1, s2, s3 in zip(sentences1, sentences2, sentences3):
    print(s1)
    print(s2)
    print(s3)
    print()

I went to Fat Burger for the first  time last month
<mask> went to <mask> Burger for the first time last month
I went to  America Burger for the first time last month

 I ordered a medium fat burger with mushrooms, no mustard
 I ordered a medium fat burger <mask> <mask> no mustard
 I ordered a medium fat burger  that  had no mustard

 The bun was crispy and warm and the burger itself was all around pretty good
 The bun was crispy and warm and the burger <mask> was all around pretty good
 The bun was crispy and warm and the burger  itself was all around pretty good







## Insertion

In [233]:
x = np.array([1, 3, 2])
np.insert(x, [0, 3], 77)

array([77,  1,  3,  2, 77])

In [234]:
words = np.array(text.split())
p = 0.15
n_mask = int(len(words) * p)
masked_indices = np.sort(np.random.choice(len(words) + 1, size=n_mask))

# words[masked_indices] = tokenizer.mask_token
words = np.insert(words, masked_indices, tokenizer.mask_token)
masked_text = " ".join(words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')

for s1, s2 in zip(sentences1, sentences2):
    print(s1, '-->', s2)

I went to Fat Burger for the first  time last month --> I went <mask> to Fat Burger for the first time last month
 I ordered a medium fat burger with mushrooms, no mustard -->  I ordered a <mask> medium fat burger with mushrooms, no mustard
 The bun was crispy and warm and the burger itself was all around pretty good -->  The bun was crispy <mask> and warm <mask> and the burger itself was all around <mask> pretty good
 --> 


In [235]:
tokenizer_output = tokenizer([masked_text])
input_ids, attention_mask = torch.tensor(tokenizer_output['input_ids']), torch.tensor(tokenizer_output['attention_mask'])
output = model(input_ids)

predicted_logits = output.logits[input_ids == mask_id]
predicted_tokens = predicted_logits.argmax(1)
predicted_words = [tokenizer.decode(token.item()) for token in predicted_tokens]
predicted_words

[' back', ' nice', ',', ',', ',']

In [239]:
new_words = words
new_words[masked_indices] = predicted_words

In [240]:
predicted_words

[' back', ' nice', ',', ',', ',']

In [241]:
augmented_text = ' '.join(new_words)

sentences1 = text.split('.')
sentences2 = masked_text.split('.')
sentences3 = augmented_text.split('.')

for s1, s2, s3 in zip(sentences1, sentences2, sentences3):
    print(s1)
    print(s2)
    print(s3)
    print()

I went to Fat Burger for the first  time last month
I went <mask> to Fat Burger for the first time last month
I went  back to Fat Burger for the first time last month

 I ordered a medium fat burger with mushrooms, no mustard
 I ordered a <mask> medium fat burger with mushrooms, no mustard
 I ordered  nice <mask> medium fat burger with mushrooms, no mustard

 The bun was crispy and warm and the burger itself was all around pretty good
 The bun was crispy <mask> and warm <mask> and the burger itself was all around <mask> pretty good
 The bun , crispy , and warm <mask> and the burger , was all around <mask> pretty good





