# Text Augmentation Tools

##### 1. Easy Data Augmentation tools: consist of four simple operations, described below.
- Synonym replacement : Replace n words in the sentence with synonyms from wordnet
- Random deletion : Randomly delete words from the sentence with probability p
- Random swap : Randomly swap two words in the sentence n times
- Random insertion : Randomly insert n words into the sentence

##### 2. Back translation : is based on the following steps
- Providing a sentence in a given source language (English).
- Translating this sentence to an intermediate language (French for instance).
- Translating the French sentence back into the source language (English)

In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
import random
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
import random
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from random import shuffle
random.seed(1)
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
stop_words = stopwords.words('english')
import os

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\33627\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def get_only_chars(line):
    """Normalize text to lowercase a-z letters separated by single spaces.

    Apostrophes (straight and typographic) are removed so contractions stay
    one token ("don't" -> "dont"). Every other character outside a-z,
    including hyphens, tabs, newlines, digits and punctuation, becomes a
    space. Runs of spaces are collapsed and one leading space is removed.
    A trailing space, if present, is kept.

    Args:
        line: Raw input string.

    Returns:
        The cleaned string. It is empty when ``line`` contains no letters,
        including when ``line`` itself is empty.
    """
    line = line.replace("’", "").replace("'", "")
    line = line.lower()
    # Anything that is not a lowercase letter or a space turns into a space.
    line = re.sub('[^a-z ]', ' ', line)
    clean_line = re.sub(' +', ' ', line)  # delete extra spaces
    # startswith() is safe on an empty string, unlike indexing clean_line[0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line
def get_synonyms(word):
    """Collect WordNet synonyms of ``word``.

    Each lemma name of each synset of ``word`` is lowercased, has
    underscores and hyphens turned into spaces, and is stripped of every
    character that is not a-z or a space. Names left with no letters after
    that filtering are skipped, so callers never receive a blank synonym.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A sorted list of distinct synonyms, excluding ``word`` itself.
        Sorting keeps the result independent of set iteration order, so
        seeded random choices made on it are repeatable.
    """
    allowed = ' abcdefghijklmnopqrstuvwxyz'
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").replace("-", " ").lower()
            name = "".join(char for char in name if char in allowed)
            # Lemmas such as numerals lose every character; drop them.
            if name.strip():
                synonyms.add(name)
    synonyms.discard(word)
    return sorted(synonyms)
def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    The first index is drawn once; the second is redrawn until it differs
    from the first, giving up after four draws. When no distinct index is
    found (always the case for a one-word list) the list is left unchanged.

    Args:
        new_words: List of tokens; it is mutated in place.

    Returns:
        The same list object that was passed in.
    """
    # random.randint(0, -1) raises ValueError, so an empty list is returned as is.
    if not new_words:
        return new_words
    last = len(new_words) - 1
    random_idx_1 = random.randint(0, last)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last)
        counter += 1
        if counter > 3:
            # Give up rather than loop forever on very short lists.
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = (
        new_words[random_idx_2],
        new_words[random_idx_1],
    )
    return new_words
def add_word(new_words):
    """Insert a synonym of a randomly chosen word into ``new_words`` in place.

    Up to ten random words are tried; the first one for which
    ``get_synonyms`` returns a non-empty list supplies the synonym (the
    first entry of that list). The synonym is inserted before a randomly
    chosen existing position. If no tried word has a synonym, or the list
    is empty, nothing is inserted.

    Args:
        new_words: List of tokens; it is mutated in place.

    Returns:
        None.
    """
    # Nothing to pick from; random.randint(0, -1) would raise ValueError.
    if not new_words:
        return
    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

### 1. EDA: Easy Data Augmentation tools

In [4]:
def SR(words, n):
    """Synonym replacement: replace up to ``n`` distinct non-stop-words.

    Candidate words are the distinct entries of ``words`` that are not in
    ``stop_words``, visited in shuffled order. Every occurrence of a chosen
    word is replaced by one randomly picked synonym from ``get_synonyms``.
    Words without synonyms are skipped and do not count toward ``n``.

    Args:
        words: List of tokens; it is not modified.
        n: Maximum number of distinct words to replace. Values <= 0 replace
            nothing.

    Returns:
        A new token list. Multi-word synonyms are split into separate
        tokens and empty tokens are dropped, so empty input yields ``[]``.
    """
    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below starts from the same order on every run.
    candidates = list(dict.fromkeys(
        word for word in words if word not in stop_words
    ))
    random.shuffle(candidates)
    num_replaced = 0
    for random_word in candidates:
        if num_replaced >= n:  # only replace up to n words
            break
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
    # Re-tokenize so multi-word synonyms become individual words; split()
    # without an argument also discards empty tokens.
    return ' '.join(new_words).split()

In [5]:
def RD(words, p):
    """Random deletion: drop each word independently with probability ``p``.

    Args:
        words: List of tokens; it is not modified.
        p: Deletion probability applied to every word.

    Returns:
        ``words`` itself when it has at most one element. Otherwise a new
        list of the surviving words, or a one-element list holding a
        randomly chosen original word if every word was deleted.
    """
    # Nothing can be deleted from an empty or one-word sentence; this also
    # keeps the random fallback below away from an empty list.
    if len(words) <= 1:
        return words
    new_words = [word for word in words if random.uniform(0, 1) > p]
    if not new_words:
        # Everything was deleted: keep one random word so the result is not empty.
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]
    return new_words

In [6]:
def RS(words, n):
    """Random swap: run ``swap_word`` ``n`` times on a copy of ``words``.

    The input list is left untouched; the shuffled copy is returned.
    """
    result = words.copy()
    for _pass in range(n):
        # swap_word hands back the list it was given, already mutated.
        result = swap_word(result)
    return result

In [7]:
def RI(words, n):
    """Random insertion: call ``add_word`` ``n`` times on a copy of ``words``.

    The input list is left untouched; the grown copy is returned.
    """
    result = words.copy()
    for _pass in range(n):
        # add_word mutates the list in place and returns nothing.
        add_word(result)
    return result


In [13]:
def EDA(text, alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=1):
    """Generate augmented variants of ``text`` with the four EDA operations.

    The text is normalized with ``get_only_chars`` and split into words.
    Each enabled technique (SR, RI, RS, RD, in that order) then produces
    ``int(num_aug / 4) + 1`` candidate sentences. The candidates are
    cleaned, shuffled and trimmed according to ``num_aug``.

    Args:
        text: Input sentence.
        alpha_sr: Fraction of the word count to replace with synonyms
            (at least one word); ``<= 0`` disables synonym replacement.
        alpha_ri: Fraction of the word count to insert as synonyms
            (at least one word); ``<= 0`` disables random insertion.
        alpha_rs: Fraction of the word count used as the number of swaps
            (at least one); ``<= 0`` disables random swap.
        p_rd: Per-word deletion probability; ``<= 0`` disables random
            deletion.
        num_aug: When ``>= 1``, the maximum number of sentences returned.
            When ``< 1``, each candidate is kept with probability
            ``num_aug / number_of_candidates``.

    Returns:
        A list of augmented sentences. It is empty when ``text`` contains
        no letters or when no technique produced a non-empty sentence.
    """
    # Skip cleaning for empty input; there is nothing to normalize.
    text = get_only_chars(text) if text else ''
    words = text.split()
    if not words:
        return []
    num_words = len(words)
    num_new_per_technique = int(num_aug / 4) + 1

    # (operation, strength) pairs, kept in the fixed order SR, RI, RS, RD so
    # the sequence of random draws is stable for a given seed.
    techniques = []
    if alpha_sr > 0:
        techniques.append((SR, max(1, int(alpha_sr * num_words))))
    if alpha_ri > 0:
        techniques.append((RI, max(1, int(alpha_ri * num_words))))
    if alpha_rs > 0:
        techniques.append((RS, max(1, int(alpha_rs * num_words))))
    if p_rd > 0:
        techniques.append((RD, p_rd))

    augmented_sentences = []
    for technique, strength in techniques:
        for _ in range(num_new_per_technique):
            sentence = ' '.join(technique(words, strength))
            if sentence:
                sentence = get_only_chars(sentence)
            # An empty string is not a usable augmentation.
            if sentence:
                augmented_sentences.append(sentence)

    # Also avoids dividing by zero below when every technique is disabled.
    if not augmented_sentences:
        return []

    random.shuffle(augmented_sentences)
    if num_aug >= 1:
        # int() keeps the slice valid when num_aug is a float such as 2.0.
        return augmented_sentences[:int(num_aug)]
    keep_prob = num_aug / len(augmented_sentences)
    return [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

### 2. Back translation

In [9]:
# English -> French and French -> English Marian checkpoints, loaded by name
# once at cell execution so every back_translate call reuses them.
tokenizer_en_fr = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
model_en_fr = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")


def _translate(text, tokenizer, model):
    """Translate a single string with the given tokenizer/model pair."""
    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch; padding/truncation mirror that method's defaults.
    batch = tokenizer([text], return_tensors='pt', padding='longest', truncation=True)
    generated = model.generate(**batch)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]


def back_translate(text):
    """Paraphrase ``text`` by translating it to French and back to English.

    Args:
        text: Source sentence in English.

    Returns:
        The English sentence obtained after the English -> French ->
        English round trip.
    """
    fr_text = _translate(text, tokenizer_en_fr, model_en_fr)
    en_text = _translate(fr_text, tokenizer_fr_en, model_fr_en)
    return en_text

In [10]:
# Sample sentence used by the two demo cells below.
text = "Hello my friends! How are you doing today?"

In [11]:
# Round-trip the sample sentence through French and back to English.
back_translate(text)

'Hello, my friends, how are you today?'

In [14]:
# Request a single EDA-augmented variant of the sample sentence.
EDA(text, alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=1)

['friends my hello how are you doing today']