In [1]:
# 딥러닝을 이용한 자연어처리
# 1. 데이터 준비
# 2. 텍스트를 표준화
# 3. 텍스트 분할(토큰화)
# 4. 어휘 인덱싱

In [1]:
import string

In [2]:
# Sample sentence with mixed case and punctuation; the next cells lowercase it
# and strip its punctuation.
test_sentence = "I write, rewrite, and still rewrite again!!"

In [3]:
# Standardization, step 1: lowercase the whole sentence.
text = test_sentence.lower()
text

'i write, rewrite, and still rewrite again!!'

In [4]:
# Standardization, step 2: delete every character listed in string.punctuation.
text = text.translate(str.maketrans("", "", string.punctuation))
text


'i write rewrite and still rewrite again'

In [5]:
# 데이터 표준화 함수
def standardize(text):
    """Lowercase ``text`` and remove every ASCII punctuation character.

    Args:
        text: The raw input string.

    Returns:
        The lowercased string with all characters found in
        ``string.punctuation`` deleted; whitespace is left untouched.
    """
    # A translation table whose third argument lists the characters to drop.
    punctuation_remover = str.maketrans("", "", string.punctuation)
    return text.lower().translate(punctuation_remover)

In [6]:
# 토큰화
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to split (standardized beforehand by the callers
            in this notebook).

    Returns:
        A list of the whitespace-separated substrings; empty for blank input.
    """
    tokens = text.split()
    return tokens

In [7]:
# Build the vocabulary: start from two reserved entries, the empty string at
# index 0 and the "[UNK]" placeholder at index 1.
vocabulary = {"":0, "[UNK]":1}


In [8]:
# Tiny three-sentence corpus from which the vocabulary is built.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

In [9]:
# Walk the corpus and give every previously unseen token the next free index.
for text in dataset:
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        # setdefault only inserts when the token is absent, so indices that
        # were already assigned are never overwritten.
        vocabulary.setdefault(token, len(vocabulary))

In [10]:
# Display a shallow copy of the token -> index mapping.
# NOTE(review): this reproduces `vocabulary` unchanged; if an index -> token
# view was intended, the key and value would need to be swapped - confirm.
{token: index for token, index in vocabulary.items()}

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [11]:
# Inspect the token -> index vocabulary built from the corpus.
vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [19]:
class Vectorizer:
    """Minimal text vectorizer: standardize, tokenize, and index words.

    Every vocabulary starts with two reserved entries: the empty string at
    index 0 (mirroring the reserved first slot of Keras' ``TextVectorization``
    vocabulary) and ``[UNK]`` at index 1, which stands in for any word that
    is not in the vocabulary.

    Attributes:
        vocabulary: dict mapping token -> integer index.
        inverse_vocabulary: dict mapping integer index -> token.
    """

    # Reserved entries shared by every instance.
    PAD_TOKEN = ""
    PAD_INDEX = 0
    UNK_TOKEN = "[UNK]"
    UNK_INDEX = 1

    def __init__(self):
        # Start from the reserved entries so encode()/decode() are usable
        # (mapping every word to [UNK]) even before make_vocabulary() has been
        # called, instead of failing with AttributeError.
        self.vocabulary = {
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()
        }

    def standardize(self, text):
        """Return ``text`` lowercased with ASCII punctuation removed."""
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, text):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

    def make_vocabulary(self, dataset):
        """(Re)build the vocabulary from an iterable of raw strings.

        Tokens receive consecutive indices in order of first appearance,
        starting after the two reserved entries. Any previous vocabulary is
        discarded.

        Args:
            dataset: Iterable of strings to learn the vocabulary from.
        """
        # Build into a local first so a failure midway (e.g. a non-string
        # item) does not leave the instance with a half-built vocabulary.
        vocabulary = {
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        for text in dataset:
            for token in self.tokenize(self.standardize(text)):
                if token not in vocabulary:
                    vocabulary[token] = len(vocabulary)
        self.vocabulary = vocabulary
        self.inverse_vocabulary = {
            index: token for token, index in vocabulary.items()
        }

    def encode(self, text):
        """Convert ``text`` into a list of integer token indices.

        Args:
            text: Raw string; it is standardized and tokenized first.

        Returns:
            One index per token; tokens missing from the vocabulary map to
            the ``[UNK]`` index (1).
        """
        tokens = self.tokenize(self.standardize(text))
        return [self.vocabulary.get(token, self.UNK_INDEX) for token in tokens]

    def decode(self, int_sequence):
        """Convert an iterable of indices back into a space-joined string.

        Args:
            int_sequence: Iterable of integer token indices.

        Returns:
            The tokens joined by single spaces; indices that are not in the
            inverse vocabulary are rendered as ``[UNK]``.
        """
        return " ".join(
            self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence
        )

In [20]:
# Build the vocabulary from the sample corpus with the hand-written Vectorizer.
vectorizer = Vectorizer()
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms'
]
vectorizer.make_vocabulary(dataset)

In [22]:
# Words that are not in the vocabulary are replaced with [UNK].
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [24]:
# Encode a sentence containing "still", a word that is not in the vocabulary,
# so it is encoded as index 1 ([UNK]).
test_sentence = "I write, erase, rewrite and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
encoded_sentence

[2, 3, 4, 5, 7, 1, 5, 6]

In [25]:
# Inspect the index -> token mapping that decode() looks indices up in.
vectorizer.inverse_vocabulary

{0: '',
 1: '[UNK]',
 2: 'i',
 3: 'write',
 4: 'erase',
 5: 'rewrite',
 6: 'again',
 7: 'and',
 8: 'then',
 9: 'a',
 10: 'poppy',
 11: 'blooms'}

In [26]:
# Map the integer sequence back to text; the unknown word comes back as "[UNK]".
decode_sentence = vectorizer.decode(encoded_sentence)
decode_sentence

'i write erase rewrite and [UNK] rewrite again'

In [31]:
from tensorflow.keras.layers import TextVectorization
# Keras' built-in text vectorization layer, configured to emit integer token
# indices. This instance is replaced by a customized one in a later cell.
text_vectorization = TextVectorization(output_mode='int')

In [33]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    """Lowercase a string tensor and delete ASCII punctuation from it.

    Args:
        string_tensor: Input handed to ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.regex_replace`` after every character of
        ``string.punctuation`` has been replaced by the empty string.
    """
    # Character class matching any single punctuation mark, escaped for regex use.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")
def custom_split_fn(string_tensor):
    """Tokenize a string tensor with ``tf.strings.split`` and its default separator.

    Args:
        string_tensor: Input handed to ``tf.strings.split``.

    Returns:
        Whatever ``tf.strings.split`` produces for the input.
    """
    tokens = tf.strings.split(string_tensor)
    return tokens

# Re-create the layer with the custom standardization and split functions
# plugged in; output stays in integer-index mode.
text_vectorization = TextVectorization(
    output_mode='int',
    standardize=custom_standardization_fn,
    split=custom_split_fn
)

In [35]:
# Same three-sentence corpus; adapt() lets the layer learn its vocabulary from it.
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms'
]
text_vectorization.adapt(dataset)

In [37]:
# Display the vocabulary learned by the layer.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [38]:
# NOTE: `vocabulary` is rebound here, from the token -> index dict built
# earlier in the notebook to the layer's list of tokens.
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
# Calling the layer on a raw string returns a tensor of integer token indices.
encode_sentence = text_vectorization(test_sentence)
encode_sentence

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 7,  3,  5,  9,  1,  5, 10], dtype=int64)>

In [42]:
# The position of each token in the vocabulary list is its index, so
# enumerating the list gives the index -> token lookup.
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}
# Each tensor element is converted to a plain int before the lookup.
decode_sentence = " ".join(
    inverse_vocab[int(token_id)] for token_id in encode_sentence
)
decode_sentence

'i write rewrite and [UNK] rewrite again'

In [44]:
# 단어 그룹을 표현하는 두 가지 방법: 집합과 시퀀스
# IMDB 영화 리뷰 데이터 준비하기