### 텍스트 데이터

In [2]:
# NLPAUG 자연어 데이터 증강 라이브러리
# sacremoses, NLTK 텍스트 토큰화 및 정규화 라이브러리
!pip install numpy requests nlpaug transformers sacremoses nltk

Collecting nlpaug
  Using cached nlpaug-1.1.11-py3-none-any.whl (410 kB)
Collecting transformers
  Using cached transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1.tar.gz (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?

In [3]:
# Word insertion with ContextualWordEmbsAug

import nlpaug.augmenter.word as naw

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent."
]

# Insert words using the bert-base-uncased model. `action` selects the
# operation to perform; 'substitute' can be used instead to replace words.
aug = naw.ContextualWordEmbsAug(
    model_path="bert-base-uncased",
    action='insert',
)
augmented_texts = aug.augment(texts)

# Print every source sentence next to its augmented version.
for text, augmented in zip(texts, augmented_texts):
    print(
        f'src: {text}',
        f'dst: {augmented}',
        "-----------------------------------------",
        sep="\n",
    )

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.3kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 411kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 598kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.40MB/s]
model.safetensors: 100%|██████████| 440M/440M [00:07<00:00, 58.5MB/s] 


src: Those who can imagine anything, can create the impossible.
dst: to those who can imagine anything, we can always create up the impossible.
-----------------------------------------
src: We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst: we alone can possibly only could see a short good distance up ahead, but we really can see plenty there that needs working to be done.
-----------------------------------------
src: If a machine is expected to be infallible, it cannot also be intelligent.
dst: if even a computing machine is expected to be considered infallible, it often cannot usually also be called intelligent.
-----------------------------------------


In [4]:
# Character deletion

import nlpaug.augmenter.char as nac

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent."
]

# RandomCharAug deletes random characters here; the available actions are
# insert, substitute, swap and delete.
aug = nac.RandomCharAug(
    action="delete",
)
augmented_texts = aug.augment(texts)

# Print every source sentence next to its augmented version.
for text, augmented in zip(texts, augmented_texts):
    print(
        f'src: {text}',
        f'dst: {augmented}',
        '-----------------------',
        sep="\n",
    )

src: Those who can imagine anything, can create the impossible.
dst: Those who can agne ythin, can cret the impsile.
-----------------------
src: We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst: We can nl see a hot isace hea, but we can see peny ere ta needs to be done.
-----------------------
src: If a machine is expected to be infallible, it cannot also be intelligent.
dst: If a aine is eecte to be infallible, it cano ao be inellet.
-----------------------


#### 교체 및 대체

In [5]:
# Word swap
import nlpaug.augmenter.word as naw

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent."
]

# RandomWordAug configured with the 'swap' action to reorder words.
aug = naw.RandomWordAug(
    action='swap',
)
augmented_texts = aug.augment(texts)

# Print every source sentence next to its augmented version.
for text, augmented in zip(texts, augmented_texts):
    print(
        f'src: {text}',
        f'dst: {augmented}',
        '-------------------------',
        sep="\n",
    )

src: Those who can imagine anything, can create the impossible.
dst: Those who can imagine anything, create the can impossible.
-------------------------
src: We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst: We can only see short a, distance ahead but can we see plenty there that needs to done be.
-------------------------
src: If a machine is expected to be infallible, it cannot also be intelligent.
dst: If machine a expected is be to infallible, it cannot be intelligent also.
-------------------------


In [7]:
# Word substitution

import nlpaug.augmenter.word as naw

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent."
]

# Augment the data by substituting words using the wordnet database
# (ppdb is the other supported source).
aug = naw.SynonymAug(
    aug_src='wordnet',
)
augmented_texts = aug.augment(texts)

# Print every source sentence next to its augmented version.
for text, augmented in zip(texts, augmented_texts):
    print(
        f'src: {text}',
        f'dst: {augmented}',
        '--------------------------',
        sep="\n",
    )

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joyoungjun/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/joyoungjun/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joyoungjun/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


src: Those who can imagine anything, can create the impossible.
dst: Those who can think anything, tail produce the inconceivable.
--------------------------
src: We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst: We can only get wind a short distance ahead, only we dismiss see enough there that demand to cost perform.
--------------------------
src: If a machine is expected to be infallible, it cannot also be intelligent.
dst: If a machine be expected to represent infallible, information technology cannot also be intelligent.
--------------------------


In [8]:
# Word substitution 2

import nlpaug.augmenter.word as naw

texts = [
    "Those who can imagine anything, can create the impossible.",
    "We can only see a short distance ahead, but we can see plenty there that needs to be done.",
    "If a machine is expected to be infallible, it cannot also be intelligent."
]

# One group of tokens handed to ReservedAug; in the recorded output a token
# from this group is replaced by another member of the same group.
reserved_tokens = [["can", "can't", "cannot", "could"]]

# NOTE(review): `reversed_aug` presumably was meant to be "reserved_aug"; the
# name is left unchanged because other cells may refer to it.
reversed_aug = naw.ReservedAug(
    reserved_tokens=reserved_tokens,
)
augmented_texts = reversed_aug.augment(texts)

# Print every source sentence next to its augmented version.
for text, augmented in zip(texts, augmented_texts):
    print(
        f'src: {text}',
        f'dst: {augmented}',
        '------------------------------',
        sep="\n",
    )

src: Those who can imagine anything, can create the impossible.
dst: Those who can't imagine anything, could create the impossible.
------------------------------
src: We can only see a short distance ahead, but we can see plenty there that needs to be done.
dst: We cannot only see a short distance ahead, but we could see plenty there that needs to be done.
------------------------------
src: If a machine is expected to be infallible, it cannot also be intelligent.
dst: If a machine is expected to be infallible, it can't also be intelligent.
------------------------------
