# Exploration of Data Augmentation methods.

In [1]:
import os

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas


# Folder in which externally downloaded model files are looked up.
model_dir = "trained_models"
# Publish the same location through the MODEL_DIR environment variable,
# which some later cells read back via os.environ.
os.environ.update(MODEL_DIR=model_dir)
# One-off setup, left disabled: fetch the GloVe 'glove.6B' vectors into the model folder.
# from nlpaug.util.file.download import DownloadUtil
# DownloadUtil.download_glove(model_name='glove.6B', dest_dir='trained_models')

Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


Our goal is to construct an `augment` function that is as useful as possible for data augmentation. It will transform one sentence into another while preserving its meaning and its label.

In [2]:
def augment(text: str) -> str:
    """Return an augmented version of ``text``.

    Placeholder implementation: no augmentation is applied yet, so the
    input sentence is handed back unchanged.

    Args:
        text: The sentence to augment.

    Returns:
        The very same sentence that was passed in.
    """
    augmented = text
    return augmented

In [3]:
# Running example sentence reused by the augmenter cells below
# (note the space before the final full stop).
text = "The quick brown fox jumps over the lazy dog ."
# Bare final expression so the notebook shows the result; with the placeholder
# `augment` this is simply the input sentence again.
augment(text)

'The quick brown fox jumps over the lazy dog .'

## Character level

#### OCR 

In [15]:
# OCR-style character noise; n=3 requests three augmented variants of the sentence.
aug = nac.OcrAug()
augmented_texts = aug.augment(text, n=3)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quicr brown fox jumps over the lazy dog.', 'The quick brown f0x jumps 0ver the lazy dog.', 'The quick brown fox jumps ovek the la2y dog.']


#### Keyboard distance mistakes

In [19]:
# Keyboard-typo noise: characters are replaced as if a neighbouring key had been hit.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quUck brown fox jumps 9ver the lzzy dog.


#### Random character insertion

In [20]:
# Random character noise, "insert" mode: extra characters are added inside words.
aug = nac.RandomCharAug(action="insert")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The *quick brown fox jyumps over the 2lazy dog.


#### Random character substitution

In [21]:
# Random character noise, "substitute" mode: existing characters are replaced by random ones.
aug = nac.RandomCharAug(action="substitute")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick 7rown fox jum)s over the laFy dog.


#### Random character swap

In [22]:
# Random character noise, "swap" mode: characters within a word trade places.
aug = nac.RandomCharAug(action="swap")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brwon fox ujmps ovre the lazy dog.


#### Random character deletion

In [24]:
# Random character noise, "delete" mode: characters are dropped from words.
aug = nac.RandomCharAug(action="delete")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The qick brown fox umps over the azy dog.


## Word level

#### Dictionary spelling mistake

In [27]:
# Word-level spelling mistakes; n=3 requests three augmented variants of the sentence.
aug = naw.SpellingAug()
augmented_texts = aug.augment(text, n=3)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quikly bIrown fox jumps over the lazy dig.', 'Tha qchick brown fox jumps other the lazy dog.', 'The qchick brouwn fox jumps over she lazy dog.']


#### Embedding similarity insertion (requires additional setup)

In [None]:
# Insert words chosen by embedding similarity.
# model_type must match the on-disk format of the vectors: word2vec, glove or fasttext.
# The (commented-out) download line in the setup cell fetches the GloVe 'glove.6B'
# bundle into model_dir, so a GloVe text file is loaded from there; change the file
# name to use another dimensionality from the bundle.
# os.path.join supplies the path separator: model_dir ("trained_models") does not end
# with one, so plain concatenation produced a non-existent "trained_models<file>" path.
aug = naw.WordEmbsAug(
    model_type='glove',
    model_path=os.path.join(model_dir, 'glove.6B.300d.txt'),
    action="insert")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

#### Embedding similarity substitution (requires additional setup)

In [None]:
# Substitute words with ones chosen by embedding similarity.
# model_type must match the on-disk format of the vectors: word2vec, glove or fasttext.
# The (commented-out) download line in the setup cell fetches the GloVe 'glove.6B'
# bundle into model_dir, so a GloVe text file is loaded from there; change the file
# name to use another dimensionality from the bundle.
# os.path.join supplies the path separator: model_dir ("trained_models") does not end
# with one, so plain concatenation produced a non-existent "trained_models<file>" path.
aug = naw.WordEmbsAug(
    model_type='glove',
    model_path=os.path.join(model_dir, 'glove.6B.300d.txt'),
    action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

#### TF-IDF similarity insertion (requires additional setup)
#### TF-IDF similarity substitution (requires additional setup)

In [None]:
# TF-IDF based word insertion. The model location is read from the MODEL_DIR
# environment variable set in the setup cell; the TF-IDF model files are expected
# to already exist there (hence "requires additional setup").
aug = naw.TfIdfAug(
    model_path=os.environ.get("MODEL_DIR"),
    action="insert",
)
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

#### WordNet synonym replacement

In [43]:
# Synonym replacement backed by WordNet (the captured output shows the NLTK
# wordnet and tagger resources being downloaded on first use).
aug = naw.SynonymAug(aug_src="wordnet")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/przemyslaw/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/przemyslaw/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The speedy brown fox climb up terminated the lazy dog.


#### MLM word insertion

In [8]:
# Contextual (masked language model) augmentation with bert-base-uncased:
# "insert" mode adds new words to the sentence.
aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="insert")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
while the occasional quick animated brown fox jumps over the lazy dog .


#### MLM word substitution

In [9]:
# Contextual (masked language model) augmentation with bert-base-uncased:
# "substitute" mode replaces existing words.
aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="substitute")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
the quick brown fox looked over its lazy shore .


#### PPDB synonym replacement (requires additional setup)

In [None]:
# Synonym replacement backed by PPDB. The PPDB file must already be present in the
# model directory (hence "requires additional setup").
# Build the path with os.path.join: MODEL_DIR is set to "trained_models" with no
# trailing separator, so plain string concatenation yielded the non-existent path
# "trained_modelsppdb-2.0-s-all". Indexing os.environ (instead of .get) makes a
# missing MODEL_DIR fail with a KeyError naming the variable.
aug = naw.SynonymAug(
    aug_src='ppdb',
    model_path=os.path.join(os.environ["MODEL_DIR"], 'ppdb-2.0-s-all'))
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

#### Random word swapping

In [45]:
# Random word-level noise, "swap" mode: words in the sentence trade places.
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The brown quick fox jumps over the lazy dog.


#### Random word deletion

In [47]:
# Random word-level noise with the augmenter's default action. No action is passed;
# per the section heading and the sample output this removes words -- confirm against
# the installed nlpaug version.
aug = naw.RandomWordAug()
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The brown fox the lazy dog.


#### Continuous random word deletion

In [48]:
# Random word-level noise, "crop" mode: a contiguous run of words is removed.
aug = naw.RandomWordAug(action="crop")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The jumps over the lazy dog.


#### Word split

In [49]:
# Word splitting: some words are broken into two pieces by an inserted space.
aug = naw.SplitAug()
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick b rown fox jum ps over the l azy dog.


#### Backtranslation augmentation

In [None]:
# Back-translation: the sentence is translated away and back again using the two
# WMT19 transformer models (en-de, then de-en, per the model names).
back_translation_aug = naw.BackTranslationAug(
    from_model_name="transformer.wmt19.en-de",
    to_model_name="transformer.wmt19.de-en",
)
# Bare final expression so the notebook renders the result as the cell's output.
back_translation_aug.augment(text)

## Sentence Augmentation 
#### Insertion of sentence continuation with GPT-2 model

In [6]:
# Sentence-level augmentation: GPT-2 generates a continuation that is appended
# to the original sentence.
aug = nas.ContextualWordEmbsForSentenceAug(model_path="gpt2")
augmented_text = aug.augment(text)

# One print call with newline separators emits the same four output lines.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brown fox jumps over the lazy dog . all new .




#### Summarization with T5 model

In [7]:
# Multi-sentence passage to summarize; the leading and trailing newlines are part
# of the string and therefore show up in the printed output.
article = """
The history of natural language processing (NLP) generally started in the 1950s, although work can be 
found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and 
Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. 
The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian 
sentences into English. The authors claimed that within three or five years, machine translation would
be a solved problem. However, real progress was much slower, and after the ALPAC report in 1966, 
which found that ten-year-long research had failed to fulfill the expectations, funding for machine 
translation was dramatically reduced. Little further research in machine translation was conducted 
until the late 1980s when the first statistical machine translation systems were developed.
"""

# Abstractive summarization with the t5-small model, configured with num_beam=3.
aug = nas.AbstSummAug(model_path="t5-small", num_beam=3)
augmented_text = aug.augment(article)

# One print call with newline separators emits the same output as four separate prints.
print("Original:", article, "Augmented Text:", augmented_text, sep="\n")

Original:

The history of natural language processing (NLP) generally started in the 1950s, although work can be 
found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and 
Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. 
The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian 
sentences into English. The authors claimed that within three or five years, machine translation would
be a solved problem. However, real progress was much slower, and after the ALPAC report in 1966, 
which found that ten-year-long research had failed to fulfill the expectations, funding for machine 
translation was dramatically reduced. Little further research in machine translation was conducted 
until the late 1980s when the first statistical machine translation systems were developed.

Augmented Text:
the history of natural language processing started in the 1950s. in 1950, Al