<a href="https://colab.research.google.com/github/kstyle2198/NLP_TIPS/blob/main/Text_Augmentation_in_Python_with_NLPAUG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Reference article this notebook follows: https://medium.com/@marc.bolle/text-augmentation-in-python-with-nlpaug-48c3eebacf46


In [3]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


# TF-IDF Augmenter

In [4]:
import sklearn.datasets
import re
import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

In [5]:
# Defining a tokenizer function to extract word tokens
def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

In [6]:
# Fetch the 'train' subset of scikit-learn's 20 Newsgroups dataset, asking the
# loader to remove the 'headers', 'footers' and 'quotes' parts of each post.
train_data = sklearn.datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
# Keep only the document texts; they are tokenized in the next cell.
train_x = train_data.data

In [7]:
# Apply the tokenizer to every document: one token list per document
train_x_tokens = list(map(_tokenizer, train_x))

In [8]:
# Train TF-IDF model
# The model is trained on the per-document token lists built above.
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
# Save into the current working directory; the TfIdfAug cell below loads the
# model back from the same location via model_path='.'.
tfidf_model.save('.')

In [26]:
# Input sentence shared by every augmenter demo in this notebook.
# Alternative inputs are kept below; uncomment one (and comment out the active
# assignment) to try a different sentence.
# text = "It was a dark and stormy night. I was alone at home when I saw a lion's face followed by a scary thunderous roar at the windows."
# text = "I like eating an apple"
text = "light signal column does not work well and to be replaced with new one"

In [27]:
# Augment the text with TFIDF augmenter
# model_path='.' points at the directory the trained TF-IDF model was saved to,
# and the same _tokenizer used for training is passed to the augmenter.
aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)
# In the recorded run augment() returned a one-element list holding the
# augmented sentence, with several words swapped for other vocabulary words.
augmented_text = aug.augment(text)
print(augmented_text)

['3IA4 signal column does not work lemons and pandemonium werple replaced with new clipart']


# Random Word Augmenter

In [15]:
# NOTE(review): `naw` is already imported in the imports cell near the top;
# this re-import is redundant when the notebook is run top to bottom.
import nlpaug.augmenter.word as naw

# NOTE(review): the recorded execution counts are out of order (this cell is
# In [15], while `text` is defined in In [26]); re-run from a fresh kernel to
# confirm the notebook works top to bottom.
# RandomWordAug is used with its default settings; in the recorded run the
# result was the input sentence with some words missing.
aug = naw.RandomWordAug()
augmented_text = aug.augment(text)
print(augmented_text)

['signal column does not work well replaced with new']


# Abstractive Summarization Augmenter

In [19]:
# Sentence-level augmenters live in nlpaug.augmenter.sentence (aliased `nas`).
import nlpaug.augmenter.sentence as nas

# Abstractive summarization augmenter with default settings.
# In the recorded run the result was a one-element list containing the
# original sentence followed by an additional generated sentence.
aug = nas.AbstSummAug()
augmented_text = aug.augment(text)
print(augmented_text)

['light signal column does not work well and to be replaced with new one. light column should be replaced by new one.']


# Contextual Word Embeddings Augmenter

In [20]:
# NOTE(review): `nas` was already imported in the Abstractive Summarization
# cell above; this re-import is redundant on a top-to-bottom run.
import nlpaug.augmenter.sentence as nas

# Contextual Word Embeddings - Sentence level
# Uses the augmenter's default model. In the recorded run, constructing/using
# it downloaded model files (including a 548M model.safetensors), and the
# result was the original sentence with extra generated tokens appended.
aug = nas.ContextualWordEmbsForSentenceAug()
augmented_text = aug.augment(text)
print(augmented_text)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

['light signal column does not work well and to be replaced with new one - " and - - a !']


# Random Sentence Augmenter

In [24]:
# Download the NLTK 'punkt' package; the recorded output shows it was already
# up to date, in which case the call just returns True.
# NOTE(review): presumably required by the sentence-level augmenter in the
# next cell for sentence splitting — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# NOTE(review): `nas` is already imported in earlier cells; this re-import is
# redundant on a top-to-bottom run.
import nlpaug.augmenter.sentence as nas

# Random Augmenter - Sentence level
# NOTE(review): in the recorded run the output is identical to the input.
# Presumably this augmenter operates on whole sentences, so the single-sentence
# `text` leaves it nothing to change — try the multi-sentence sample that is
# commented out in the `text` cell and confirm.
aug = nas.RandomSentAug()
augmented_text = aug.augment(text)
print(augmented_text)

['light signal column does not work well and to be replaced with new one']
