<a href="https://colab.research.google.com/github/kstyle2198/NLP_TIPS/blob/main/Text_Augmentation_in_Python_with_NLPAUG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Source article: [Text Augmentation in Python with NLPAUG](https://medium.com/@marc.bolle/text-augmentation-in-python-with-nlpaug-48c3eebacf46)


In [62]:
!pip install nlpaug



# TF-IDF Augmenter

In [63]:
import sklearn.datasets
import re
import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

import nltk
# Fetch the NLTK 'punkt' data package; the recorded output shows it is a no-op when already up to date.
# NOTE(review): nothing visible in this notebook calls NLTK directly — presumably a transitive requirement; confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
# Defining a tokenizer function to extract word tokens
def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

In [65]:
# Fetch the training split of scikit-learn's 20 Newsgroups corpus,
# asking the loader to strip headers, footers and quoted replies.
train_data = sklearn.datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
train_x = train_data.data
train_x[:1]  # preview the first document only

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.']

In [66]:
# Run the tokenizer over every document, producing one token list per document
train_x_tokens = list(map(_tokenizer, train_x))
train_x_tokens[:1]  # preview the tokens of the first document

[['was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'saw',
  'the',
  'other',
  'day',
  'It',
  'was',
  'door',
  'sports',
  'car',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s',
  'early',
  '70s',
  'It',
  'was',
  'called',
  'Bricklin',
  'The',
  'doors',
  'were',
  'really',
  'small',
  'In',
  'addition',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  'This',
  'is',
  'all',
  'know',
  'If',
  'anyone',
  'can',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'of',
  'production',
  'where',
  'this',
  'car',
  'is',
  'made',
  'history',
  'or',
  'whatever',
  'info',
  'you',
  'have',
  'on',
  'this',
  'funky',
  'looking',
  'car',
  'please',
  'mail']]

In [67]:
# Train the TF-IDF model on the tokenized corpus
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
# Persist the trained model to the current directory. The TfIdfAug cell further
# down is constructed with model_path='.', so without this save a fresh
# "Restart & Run All" only works if files from an earlier session are still there.
tfidf_model.save('.')
tfidf_model

<nlpaug.model.word_stats.tfidf.TfIdf at 0x7afceede37f0>

In [68]:
# Input sentence passed to every augmenter below.
# The commented-out assignments are alternative sample sentences; uncomment one to use it instead.
# text = "It was a dark and stormy night. I was alone at home when I saw a lion's face followed by a scary thunderous roar at the windows."
# text = "I like eating an apple"
text = "light signal column on the upper deck does not work well and to be replaced with new one"

In [69]:
# Augment the text with the TF-IDF augmenter.
# model_path='.' points at the current directory — presumably the trained TF-IDF
# model files must already be saved there; confirm against the nlpaug docs.
# The same _tokenizer used to build the training tokens is passed in here.
aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)
# The recorded output shows augment() returning a list holding one augmented string.
augmented_text1 = aug.augment(text)
print(f"naw.TfIdfAug: {augmented_text1}")

naw.TfIdfAug: ['entitled signal column southwestern the upper deck does not Jimi well and YMA Herron replaced ZKJF4D_8UD new one']


# Random Word Augmenter

In [70]:
import nlpaug.augmenter.word as naw

# Apply RandomWordAug, constructed with no arguments (library defaults), to `text`.
# In the recorded run the result is the input with several words removed —
# presumably the default action is word deletion; confirm against the nlpaug docs.
aug = naw.RandomWordAug()
augmented_text2 = aug.augment(text)
print(f"naw.RandomWordAug: {augmented_text2}")

naw.RandomWordAug: ['light signal column on the upper deck work well to new one']


# Abstractive Summarization Augmenter

In [71]:
import nlpaug.augmenter.sentence as nas

# Apply AbstSummAug, constructed with no arguments (library defaults), to `text`.
# In the recorded run the only visible change is a trailing period added to the sentence.
aug = nas.AbstSummAug()
augmented_text3 = aug.augment(text)
print(f"nas.AbstSummAug: {augmented_text3}")

nas.AbstSummAug: ['light signal column on the upper deck does not work well and to be replaced with new one.']


# Contextual Word Embeddings Augmenter

In [72]:
import nlpaug.augmenter.sentence as nas

# Contextual Word Embeddings - Sentence level
# Constructed with no arguments (library defaults). In the recorded run the
# original sentence is kept and extra tokens are appended after it.
aug = nas.ContextualWordEmbsForSentenceAug()
augmented_text4 = aug.augment(text)
print(f"nas.ContextualWordEmbsForSentenceAug: {augmented_text4}")

nas.ContextualWordEmbsForSentenceAug: ['light signal column on the upper deck does not work well and to be replaced with new one to on on on a to and to , with : is at - ) is .']


# Random Augmenter

In [73]:
import nlpaug.augmenter.sentence as nas

# Random Augmenter - Sentence level
# Constructed with no arguments (library defaults). In the recorded run the
# output equals the input — presumably because `text` is a single sentence,
# leaving nothing to rearrange at sentence level; confirm against the nlpaug docs.
aug = nas.RandomSentAug()
augmented_text5 = aug.augment(text)
print(f"nas.RandomSentAug: {augmented_text5}")

nas.RandomSentAug: ['light signal column on the upper deck does not work well and to be replaced with new one']


# 결과 모음

In [74]:
# Recap: the original sentence (labelled "원문", i.e. "original text"),
# a blank line, then one line per augmenter result — emitted by a single
# print call that joins its arguments with newlines.
print(
    f"원문: {text}",
    "",
    f"naw.TfIdfAug: {augmented_text1}",
    f"naw.RandomWordAug: {augmented_text2}",
    f"nas.AbstSummAug: {augmented_text3}",
    f"nas.ContextualWordEmbsForSentenceAug: {augmented_text4}",
    f"nas.RandomSentAug: {augmented_text5}",
    sep="\n",
)

원문: light signal column on the upper deck does not work well and to be replaced with new one

naw.TfIdfAug: ['entitled signal column southwestern the upper deck does not Jimi well and YMA Herron replaced ZKJF4D_8UD new one']
naw.RandomWordAug: ['light signal column on the upper deck work well to new one']
nas.AbstSummAug: ['light signal column on the upper deck does not work well and to be replaced with new one.']
nas.ContextualWordEmbsForSentenceAug: ['light signal column on the upper deck does not work well and to be replaced with new one to on on on a to and to , with : is at - ) is .']
nas.RandomSentAug: ['light signal column on the upper deck does not work well and to be replaced with new one']
