paraphrasing - backtranslation nlpaug\
noise injection - word, grammar, irrelevant\
random augmentation\
\
filtering later on


!pip install -q nlpaug
!pip install sacremoses

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [1]:
question = "The weather is lovely, isn't it?"

### Back-translation

In [2]:
import nlpaug.augmenter.word as naw
import torch
import torch.utils.data as t_data
import nlpaug.model.lang_models.machine_translation_transformers as mt

mt.t_data = t_data

In [3]:
def backtranslate (query: str) -> str:
    aug = naw.back_translation.BackTranslationAug(
        from_model_name="Helsinki-NLP/opus-mt-en-de",
        to_model_name="Helsinki-NLP/opus-mt-de-en",
    )

    augmented_query = aug.augment(query)
    return augmented_query

In [4]:
class BackTranslationAugmentor:
    def __init__(self, from_model: str="Helsinki-NLP/opus-mt-en-de",
                       to_model: str="Helsinki-NLP/opus-mt-de-en"):
        self.aug = naw.BackTranslationAug(
            from_model_name=from_model,
            to_model_name=to_model
        )

    def augment(self, query: str) -> str:
        return self.aug.augment(query)

In [5]:
backtrans = BackTranslationAugmentor()
backtrans.augment(question)

["The weather is nice, isn't it?"]

In [6]:
backtranslate(question)

["The weather is nice, isn't it?"]

----

### Noise injection

word\
grammar\
irrelevant

#### Word-level: typos and letter-swaps

In [7]:
import nlpaug.augmenter.char as nac
from nlpaug.flow import Sequential

In [45]:
aug_typo = nac.KeyboardAug(
    aug_word_p=0.2,
    aug_char_p=0.1,
    aug_char_min=1,
    aug_char_max=2,
    include_special_char=False,
    include_numeric=False,
    include_upper_case=False,
)

In [49]:
aug_swapletter = nac.RandomCharAug(
    action='swap',
    aug_word_p=0.2,
    aug_char_p=0.1,
    aug_char_min=1,
    aug_char_max=2,
)

In [54]:
pipeline_word = Sequential([aug_typo, aug_swapletter])

In [56]:
print('Typo:', aug_typo.augment(question))
print('Letter-swap:', aug_swapletter.augment(question))
print('Combined:', pipeline_word.augment(question))

Typo: ["The weatjer is loveiy, isn ' t it?"]
Letter-swap: ["The weatehr is lvoely, isn ' t it?"]
Combined: ["The waethdr is olveiy, isn ' t it?"]


#### Irrelevant information

pip install wikipedia

pip install wikipedia-api

In [105]:
import wikipedia, random
import wikipediaapi

wiki = wikipediaapi.Wikipedia("ShrinkMCP kanecorts@gmail.com", "en")

In [122]:
def get_random_sentence() -> str:
    title = wikipedia.random(pages=1)
    page = wiki.page(title)
    
    if not page.exists():
        return ""
    
    # split summary into sentences
    sentences = page.summary.split(". ")
    random_sentence = random.choice(sentences) + '.'
    
    return random_sentence.strip() + ' ' question 

In [121]:
random_sentence = get_random_sentence()
augmented_question = random_sentence + ' ' + question
print(augmented_question)

He participated in the British attack on CÃ¡diz in 1587. The weather is lovely, isn't it?


In [7]:
import random
import nlpaug.augmenter.char as nac
import wikipedia
import wikipediaapi

wiki = wikipediaapi.Wikipedia("ShrinkMCP hacker123@gmail.com", "en")

class NoiseInjectionAugmentor:
    def __init__(
            self, 
            word_percentage: float = 0.2,
            char_percentage: float = 0.1,
            min_augment: int = 1,
            max_augment: int = 2,
        ):

        self.aug_typo = nac.KeyboardAug(
            aug_word_p = word_percentage,
            aug_char_p = char_percentage,
            aug_char_min = min_augment,
            aug_char_max = max_augment,
            include_special_char = False,
            include_numeric = False,
            include_upper_case = False,
        )

        self.aug_swapletter = nac.RandomCharAug(
            action = 'swap',
            aug_word_p = word_percentage,
            aug_char_p = char_percentage,
            aug_char_min = min_augment,
            aug_char_max = max_augment,
        )

    def add_lexical_noise(self, text: str) -> str:
        """
        Apply lexical-level noise (typos + letter swaps).
        """
        text = self.aug_typo.augment(text)
        text = self.aug_swapletter.augment(text)
        return text

    def add_semantic_noise(self, text: str) -> str:
        """
        Append a random sentence from Wikipedia as semantic noise.
        """
        try:
            title = wikipedia.random(pages=1)
            page = wiki.page(title)

            if not page.exists():
                return text

            sentences = page.summary.split(". ")
            random_sentence = random.choice(sentences).strip() + "."
            return random_sentence + " " + text
        except Exception:
            return text

    def augment(self, text: str, add_lexical: bool = True, add_semantic: bool = True) -> str:
        """
        Full augmentation pipeline:
        1. Apply lexical noise
        2. Optionally add semantic noise
        """
        if add_lexical:
            text = self.lexical_noise(text)
        if add_semantic:
            text = self.semantic_noise(text)
        return text

In [8]:
ninject = NoiseInjectionAugmentor()
ninject.augment(question)

AttributeError: 'NoiseInjectionAugmentor' object has no attribute 'lexical_noise'

---

### Random augmentation

pip install nltk

In [9]:
import nltk

# nltk.download('wordnet')
# nltk.download('omw-1.4') 
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')

#### Synonym replacement

In [56]:
import re
def regex_tokenizer(text):
    return re.findall(r"\w+(?:'\w+)?|[^\w\s]", text)

In [128]:
def synonym_replacement(query: str) -> str:
    syn_aug = naw.synonym.SynonymAug(
        aug_src='wordnet', 
        aug_p=0,
        aug_max=5,
        tokenizer=regex_tokenizer,
        stopwords=['it', 'as']
    )
    synonym_text = syn_aug.augment(query)
    
    return synonym_text

In [125]:
question = "The weather is lovely, isn't it?"

In [129]:
synonym_replacement(question)

["The atmospheric condition be lovely, isn't it?"]

#### Random swap

In [135]:
def random_swap(query: str) -> str:
    swap_aug = naw.random.RandomWordAug(
        action='swap', 
        aug_max=5,
        tokenizer=regex_tokenizer
    )
    swapped_text = swap_aug.augment(query)

    return swapped_text

In [136]:
random_swap(question)

["Weather the lovely is, isn't? it"]

#### Random insert

In [158]:
nltk.download("words")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Kane\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [159]:
from nltk.corpus import words

def get_random_word():
    return random.choice(words.words())

In [206]:
def random_insert(query: str) -> str:
    sentences = question.split(". ")
    sentence_pos = random.randint(0, len(sentences) - 1)
    random_sentence = sentences[sentence_pos].split()
    
    random_word = random.choice(words.words())
    pos = random.randint(1, len(random_sentence) - 2)
    random_sentence.insert(pos, random_word)

    inserted_sentence = " ".join(random_sentence)
    sentences[sentence_pos] = inserted_sentence
    inserted_query = " ".join(sentences)
        
    return inserted_query

In [207]:
random_insert(question)

"The weather is throatlet lovely, isn't it?"

#### Random delete

In [210]:
def random_delete(query: str) -> str:
    del_aug = naw.random.RandomWordAug(
        action='delete', 
        aug_max=1,
        tokenizer=regex_tokenizer
    )
    deletion_text = del_aug.augment(query)

    return deletion_text

In [212]:
random_delete(question)

["The weather is, isn't it?"]

#### Random mixup

In [226]:
RANDOM_AUGMENTERS = [
    "synonym_replacement",
    "random_swap",
    # "random_insert",
    "random_delete"
]

In [242]:
def random_mixup(query: str) -> str:
    random_funcs = random.sample(RANDOM_AUGMENTERS, 2)
    
    augmented_query = query
    for func_name in random_funcs:
        func = globals()[func_name]
        augmented_query = func(augmented_query)
        
    return augmented_query

In [243]:
random_mixup(question)

['random_swap', 'random_delete']


['The weather lovely is, it?']