In [1]:
"""
Step1: Standardization
(i) Lowercase
(ii) Removing punctuation
(iii) Handling special characters like (é becomes e)
(iv) Stemming: handling conjugation of words (e.g. stared and is staring both become stare)
"""

'\nStep1: Standardization\n(i) Lowercase\n(ii) Removing punctuation\n(iii) Handling special characters like (é becomes e)\n(iv) Stemming: handling conjugation of words (e.g. stared and is staring both become stare)\n'

In [2]:
"""
Step2: Tokenization
(i) Word tokenization (splitting text based on whitespaces (tokens) -> subvariant split words into subwords)
(ii) N-grams tokenization (splitting text into groups of N or fewer to extract words (tokens) from sentences)
(iii) Character tokenization (each character is a token)

sequence modeling = use word tokenization (keeps order)
bag-of-words model = use N-grams tokenization (set and unordered)
"""

'\nStep2: Tokenization\n(i) Word tokenization (splitting text based on whitespaces (tokens) -> subvariant split words into subwords)\n(ii) N-grams tokenization (splitting text into groups of N or fewer to extract words (tokens) from sentences)\n(iii) Character tokenization (each character is a token)\n\nsequence modeling = use word tokenization (keeps order)\nbag-of-words model = use N-grams tokenization (set and unordered)\n'

In [3]:
"""
Step3: Indexing
Assigning a unique integer to each token in the vocabulary
"""

'\nStep3: Indexing\nAssigning a unique integer to each token in the vocabulary\n'

In [4]:
"""
Step4: TextVectorization
"""
import string

class Vectorizer:
  """Minimal pure-Python text vectorizer: standardize -> tokenize -> index.

  Attributes:
    vocabulary: dict mapping token (str) -> integer index.
    inverse_vocabulary: dict mapping integer index -> token (str).

  Both mappings always contain the two reserved entries ``"" -> 0`` and
  ``"[UNK]" -> 1``; tokens learned by ``make_vocabulary`` are numbered from 2
  in order of first appearance.
  """

  # Token/index used for anything that is not in the vocabulary.
  UNK_TOKEN = "[UNK]"
  UNK_INDEX = 1

  # Translation table that deletes every character of string.punctuation;
  # built once instead of testing membership character by character.
  _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

  def __init__(self):
    # Start with only the reserved entries so encode()/decode() are usable
    # (everything maps to [UNK]) even before make_vocabulary() is called.
    self._reset_vocabulary()

  def _reset_vocabulary(self):
    """(Re)initialize both mappings to the two reserved entries."""
    # Index 0 is reserved for the empty string
    # (presumably padding, as in Keras -- TODO confirm intended use).
    self.vocabulary = {"": 0, self.UNK_TOKEN: self.UNK_INDEX}
    self.inverse_vocabulary = {
        index: token for token, index in self.vocabulary.items()}

  # (a) standardize
  def standardize(self, text):
    """Return `text` lowercased with all `string.punctuation` removed."""
    return text.lower().translate(self._PUNCTUATION_TABLE)

  # (b) Tokenize
  def tokenize(self, text):
    """Standardize `text` and split it on whitespace into a list of tokens."""
    return self.standardize(text).split()

  # (c) Make vocabulary, encode, decode
  def make_vocabulary(self, dataset):
    """Build the vocabulary from an iterable of strings.

    Any previously built vocabulary is discarded. Each new token receives the
    next free index, e.g. "hello": 2, "world": 3.
    """
    self._reset_vocabulary()
    for text in dataset:
      # tokenize() already standardizes, so no separate standardize() call.
      for token in self.tokenize(text):
        # len() is evaluated before insertion, giving the next free index.
        self.vocabulary.setdefault(token, len(self.vocabulary))

    self.inverse_vocabulary = {
        index: token for token, index in self.vocabulary.items()}

  def encode(self, text):
    """Return the list of vocabulary indices for the tokens of `text`.

    Tokens missing from the vocabulary are encoded as `UNK_INDEX` (1).
    """
    return [self.vocabulary.get(token, self.UNK_INDEX)
            for token in self.tokenize(text)]

  def decode(self, int_sequence):
    """Return the space-joined tokens for an iterable of integer indices.

    Indices missing from the vocabulary are decoded as "[UNK]".
    """
    return " ".join(
        self.inverse_vocabulary.get(index, self.UNK_TOKEN)
        for index in int_sequence)


In [5]:
# Toy corpus the vocabulary is built from.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

# Index every distinct token that appears in the corpus.
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [6]:
# "still" does not occur in the corpus, so it is encoded as 1 and
# decoded back as "[UNK]".
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
decoded_sentence = vectorizer.decode(encoded_sentence)

print(encoded_sentence, decoded_sentence, sep="\n")

[2, 3, 5, 7, 1, 5, 6]
i write rewrite and [UNK] rewrite again


In [7]:
from tensorflow.keras.layers import TextVectorization
# Keras preprocessing layer configured with output_mode="int".
text_vectorization = TextVectorization(output_mode="int")

In [8]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
  """Lowercase `string_tensor` and delete every `string.punctuation` character."""
  # Character class matching any one punctuation character; re.escape keeps
  # characters such as "]" and "\" literal inside the brackets.
  punctuation_pattern = f"[{re.escape(string.punctuation)}]"
  lowered = tf.strings.lower(string_tensor)
  return tf.strings.regex_replace(lowered, punctuation_pattern, "")

def custom_split_fn(string_tensor):
  """Split `string_tensor` with tf.strings.split and its default separator."""
  tokens = tf.strings.split(string_tensor)
  return tokens

# Integer-output layer that uses the custom standardization and split
# callables instead of the layer's defaults.
text_vectorization = TextVectorization(
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode="int",
)

In [9]:
# Toy corpus used to fit the TextVectorization layer.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
# adapt() builds the layer's vocabulary (word -> index) from the corpus;
# the mapping is kept inside the layer and read back with get_vocabulary().
text_vectorization.adapt(dataset)

In [10]:
# Show the learned vocabulary as a list; each token's position in the list
# is its integer index ('' is 0 and '[UNK]' is 1).
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 np.str_('erase'),
 np.str_('write'),
 np.str_('then'),
 np.str_('rewrite'),
 np.str_('poppy'),
 np.str_('i'),
 np.str_('blooms'),
 np.str_('and'),
 np.str_('again'),
 np.str_('a')]

In [11]:
# Encode: calling the layer on a string yields a tensor of token indices.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

# Decode: get_vocabulary() returns a list whose positions are the token
# indices, so enumerate() gives the index -> word lookup.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = {index: word for index, word in enumerate(vocabulary)}
decoded_sentence = " ".join(
    inverse_vocab[int(token_id)] for token_id in encoded_sentence)
print(decoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again
