In [None]:
# Below are some text augmentation techniques 

### 01.Back Translation

Another popular approach for augmenting text datasets is back translation. This involves translating a sentence from our target language into one or more other languages and then translating all of them back to the original language. We can use the Python library google_trans_new for this purpose. 

In [52]:
!pip install google_trans_new

import random
import google_trans_new
from google_trans_new import google_translator  

def back_translation( sentence, trans_lang=None):
    """Paraphrase ``sentence`` by translating it to a pivot language and back to English.

    Args:
        sentence: Text to augment.
        trans_lang: Optional language code to pivot through (a key of
            ``google_trans_new.LANGUAGES``). When omitted, a pivot language
            is picked at random.

    Returns:
        Whatever the translator returns for the pivot -> English translation.
    """
    translator = google_translator()

    if trans_lang is None:
        # Never pick English as the pivot: the round trip is meant to go
        # through a *different* language so the wording can change.
        available_langs = [lang for lang in google_trans_new.LANGUAGES if lang != 'en']
        trans_lang = random.choice(available_langs)

    # Forward pass: source language is left to the translator's default.
    translations = translator.translate(sentence, lang_tgt=trans_lang)

    # Backward pass: pivot language -> English.
    translations_en_random = translator.translate(translations, lang_src=trans_lang, lang_tgt='en')

    return translations_en_random
# Demo: round-trip one sample sentence. The pivot language is chosen at
# random, so the displayed output can differ between runs.
back_translation("The dog hide itself under the rug ")

# l=back_translation("Quick brown fox jumps over the lazy dog".split())
# "".join(l)




'The dog hides under the carpet '

### 02.Random Insertion
A random insertion technique looks at a sentence and then randomly inserts synonyms of existing non-stopwords into the sentence n times. This requires a way of getting a synonym of a word and a way of eliminating stopwords (common words such as and, it, the, etc.); both are implemented below as get_synonyms() and remove_stopwords(), and random_insertion() builds on them as follows:

In [17]:
# Fetch the NLTK corpora that the helper functions below read:
# the stopword lists (remove_stopwords) and WordNet (get_synonyms).
import nltk 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [15]:
def remove_stopwords(sentence):
    """Return the lower-cased, purely alphabetic, non-stopword tokens of ``sentence``.

    Args:
        sentence: A list of tokens. A plain string is also accepted and is
            split on whitespace first.

    Returns:
        A new list of tokens in their original order; the input is not modified.
    """
    from nltk.corpus import stopwords

    # A bare string would otherwise be iterated character by character.
    tokenized = sentence.split() if isinstance(sentence, str) else sentence

    # Load the stopword list once (not once per token) and keep it in a set
    # so each membership check is constant time.
    english_stops = set(stopwords.words('english'))

    lower_tokens = (t.lower() for t in tokenized)
    # isalpha() drops punctuation and anything containing digits/symbols.
    return [t for t in lower_tokens if t.isalpha() and t not in english_stops]

def get_synonyms(word):
    """Return one randomly chosen WordNet synonym of ``word``.

    Args:
        word: The word to look up.

    Returns:
        A synonym that differs from ``word`` (ignoring case), or ``word``
        itself when WordNet offers no alternative.
    """
    from nltk.corpus import wordnet

    candidates = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # WordNet joins multi-word lemma names with underscores;
            # turn them back into ordinary spaced text.
            name = lemma.name().replace('_', ' ')
            # The lemma list normally contains the word itself, which is
            # not a useful "synonym" for augmentation.
            if name.lower() != word.lower():
                candidates.add(name)

    if not candidates:
        return word

    # Sort before sampling: string-set iteration order is not stable across
    # interpreter runs, which would make results irreproducible under a seed.
    return random.choice(sorted(candidates))


In [25]:
  def random_insertion(sentence, n): 
      """Insert ``n`` synonyms of the sentence's words at random positions.

      Args:
          sentence: List of tokens. It is not modified.
          n: Number of synonyms to insert; values <= 0 insert nothing.

      Returns:
          A new token list containing the original tokens plus the
          inserted synonyms.
      """
      # Work on a copy so the caller's list is left untouched.
      augmented = list(sentence)
      if not augmented or n <= 0:
          return augmented

      # Prefer non-stopwords as synonym sources; fall back to every token
      # when the sentence consists only of stopwords. The pool is a separate
      # list, so inserted synonyms never become sources themselves.
      candidates = remove_stopwords(sentence) or list(sentence)
      for _ in range(n):
          new_synonym = get_synonyms(random.choice(candidates))
          # +1 so the synonym can also be appended after the last token.
          augmented.insert(random.randrange(len(augmented) + 1), new_synonym)
      return augmented

In [33]:
# Demo: insert 2 synonyms into a whitespace-tokenized sentence,
# then re-join the tokens for display.
l=random_insertion("Quick brown fox jumps over the lazy dog".split(),2)
" ".join(l)


'confuse Quick brown fox jumps over the work-shy lazy dog'

### 03.Random Deletion
As the name suggests, random deletion deletes words from a sentence. Given a probability parameter p, it goes through the sentence and deletes each word independently with probability p. Think of it as the text counterpart of pixel dropout in image augmentation.

In [43]:
def random_deletion(words, p=0.5): 
    """Drop each token of ``words`` independently with probability ``p``.

    Args:
        words: List of tokens. It is not modified.
        p: Deletion probability for each token.

    Returns:
        A new list with the surviving tokens in their original order. Empty
        and single-token inputs are returned unchanged (as a copy); if every
        token gets deleted, one randomly chosen token is returned instead.
    """
    # Nothing sensible to delete from an empty or one-word sentence
    # (and random.choice below would fail on an empty list).
    if len(words) <= 1:
        return list(words)

    # A token survives when its draw from [0, 1) exceeds p.
    remaining = [word for word in words if random.random() > p]
    if not remaining:
        # Never return an empty sentence: keep one random token.
        return [random.choice(words)]
    return remaining


# Demo: apply random deletion with p=0.5 to a whitespace-tokenized sentence,
# then re-join the surviving tokens for display.
l=random_deletion("Quick brown fox jumps over the lazy dog and dog is happy standing on rug".split(),p=0.5)
" ".join(l)

'brown over the dog and happy standing on rug'

### 04.Random Swap
The random swap augmentation takes a sentence and then swaps words within it n times, with each iteration working on the previously swapped sentence. Here we sample two distinct random positions within the sentence, exchange the words at those positions, and repeat until n swaps have been made.

In [49]:
def random_swap(sentence, n=5): 
    """Swap two randomly chosen tokens of ``sentence``, ``n`` times in a row.

    Args:
        sentence: List of tokens. It is not modified.
        n: Number of swaps to perform; each swap operates on the result of
            the previous one.

    Returns:
        A new list with the same tokens in shuffled positions. Inputs with
        fewer than two tokens are returned unchanged (as a copy).
    """
    # Work on a copy so the caller's list keeps its original order.
    swapped = list(sentence)
    # random.sample cannot draw two distinct positions from fewer than two.
    if len(swapped) < 2:
        return swapped

    positions = range(len(swapped))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        swapped[idx1], swapped[idx2] = swapped[idx2], swapped[idx1]
    return swapped
# Demo: perform 2 random swaps on a whitespace-tokenized sentence,
# then re-join the tokens for display.
l=random_swap("Quick brown fox jumps over the lazy dog".split(),n=2)
" ".join(l)

'fox over Quick jumps brown the lazy dog'