<a href="https://colab.research.google.com/github/hws2002/Deep_Learning_with_Keras/blob/main/Chapter11/Chapter11_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Index every distinct token of the corpus: a token seen for the first time
# receives the next free integer, i.e. the vocabulary size at that moment.
# NOTE(review): `dataset`, `standardize` and `tokenize` are not defined before
# this cell in the visible part of the notebook -- presumably this is an
# illustrative sketch; confirm before running it on a fresh kernel.
vocabulary = {}
for text in dataset:
  tokens = tokenize(standardize(text))
  for token in tokens:
    # setdefault leaves tokens that already have an index untouched.
    vocabulary.setdefault(token, len(vocabulary))

In [None]:
import numpy as np


def one_hot_encode_token(token):
  """Return the one-hot vector for `token` under the global `vocabulary`.

  Args:
    token: A key of the module-level `vocabulary` mapping
      (token -> integer index).

  Returns:
    A 1-D NumPy array of length `len(vocabulary)` that is all zeros except
    for a 1 at the index assigned to `token`.

  Raises:
    KeyError: If `token` is not a key of `vocabulary`.
  """
  vector = np.zeros((len(vocabulary),))
  vector[vocabulary[token]] = 1
  return vector


In [None]:
import string

class Vectorizer:
  """Minimal text vectorizer: standardize, tokenize, and index tokens.

  Index 0 is reserved for the empty-string token and index 1 for the
  out-of-vocabulary token "[UNK]". Every other token is numbered in order of
  first appearance in the dataset passed to `make_vocabulary`.

  Attributes are documented inline below.
  """

  # Token and index used for anything that is not in the vocabulary.
  UNK_TOKEN = "[UNK]"
  UNK_INDEX = 1

  # Translation table that deletes every character of string.punctuation.
  _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

  def __init__(self):
    # Start with the reserved-tokens-only vocabulary so that encode() and
    # decode() are usable (mapping everything to "[UNK]") before
    # make_vocabulary() is called, instead of raising AttributeError.
    self.make_vocabulary([])

  def standardize(self, text):
    """Return `text` lowercased with all punctuation characters removed."""
    return text.lower().translate(self._STRIP_PUNCTUATION)

  def tokenize(self, text):
    """Split `text` on runs of whitespace and return the list of tokens."""
    return text.split()

  def make_vocabulary(self, dataset):
    """Build `vocabulary` and `inverse_vocabulary` from an iterable of texts.

    Any previously built vocabulary is discarded and rebuilt from scratch.

    Args:
      dataset: Iterable of strings; each is standardized and tokenized.
    """
    # token -> index; the two reserved entries always come first.
    self.vocabulary = {"": 0, self.UNK_TOKEN: self.UNK_INDEX}
    for text in dataset:
      for token in self.tokenize(self.standardize(text)):
        # A new token gets the next free index; known tokens are untouched.
        self.vocabulary.setdefault(token, len(self.vocabulary))

    # index -> token, used by decode().
    self.inverse_vocabulary = {
        index: token for token, index in self.vocabulary.items()
    }

  def encode(self, text):
    """Return the list of vocabulary indices for the tokens of `text`.

    Tokens missing from the vocabulary are encoded as `UNK_INDEX`.
    """
    tokens = self.tokenize(self.standardize(text))
    return [self.vocabulary.get(token, self.UNK_INDEX) for token in tokens]

  def decode(self, int_sequence):
    """Return the space-joined tokens for the indices in `int_sequence`.

    Indices missing from the inverse vocabulary are decoded as `UNK_TOKEN`.
    """
    return " ".join(
        self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence)


# Toy three-sentence corpus used to build the vocabulary.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms",
]

# Index every token of the corpus; this populates `vectorizer.vocabulary`
# and `vectorizer.inverse_vocabulary`.
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [None]:
# Show the token -> index mapping that make_vocabulary() built from `dataset`.
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [None]:
# "still" does not occur in the corpus, so it is expected to round-trip
# through index 1 and come back as "[UNK]".
test_sentence = "I write, rewrite, and still rewrite again"

# Text -> integer indices -> text.
encoded_sentence = vectorizer.encode(test_sentence)
decoded_sentence = vectorizer.decode(encoded_sentence)

print(encoded_sentence)
print(decoded_sentence)


[2, 3, 5, 7, 1, 5, 6]
i write rewrite and [UNK] rewrite again


In [None]:
import keras
from tensorflow.keras.layers import TextVectorization

# Configure the layer to return each text as a sequence of integer-encoded
# word indices; several other output modes are available as well.
text_vectorization = TextVectorization(output_mode="int")

In [None]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
  """Lowercase `string_tensor` and delete every punctuation character.

  Args:
    string_tensor: Value forwarded to `tf.strings.lower` (presumably a
      string tensor -- confirm against the caller).

  Returns:
    The result of `tf.strings.regex_replace` after removing every character
    listed in `string.punctuation` from the lowercased input.
  """
  # Character class matching any single punctuation mark; re.escape keeps
  # regex metacharacters inside the class literal.
  punctuation_pattern = f"[{re.escape(string.punctuation)}]"
  lowered = tf.strings.lower(string_tensor)
  return tf.strings.regex_replace(lowered, punctuation_pattern, "")


def custom_split_fn(string_tensor):
  """Split `string_tensor` with `tf.strings.split` and its default separator."""
  tokens = tf.strings.split(string_tensor)
  return tokens

# Rebind `text_vectorization` to a layer whose standardization and splitting
# steps are the custom functions defined above. The name must be spelled
# exactly `text_vectorization`: that is the variable the following cells
# adapt and call, so any other spelling leaves this layer unused.
text_vectorization = TextVectorization(
    output_mode="int",
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

In [None]:
# Toy three-sentence corpus the layer is fitted on.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms",
]

# Fit the layer on the corpus; the vocabulary queried in the next cell
# consists of the words of these sentences.
text_vectorization.adapt(dataset)

In [None]:
# List the layer's vocabulary; a token's position in the list is the integer
# index it is encoded as.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [None]:
# Encode a sentence that contains a word ("still") absent from the corpus.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

# Keep the vocabulary list around so the indices can be decoded afterwards.
vocabulary = text_vectorization.get_vocabulary()

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [None]:
# Position in the vocabulary list -> token, i.e. the inverse of the encoding.
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}

# Each element of the encoded tensor is converted to a plain int for lookup.
decoded_sentence = " ".join(
    inverse_vocab[int(token_id)] for token_id in encoded_sentence
)
decoded_sentence

'i write rewrite and [UNK] rewrite again'