<a href="https://colab.research.google.com/github/it5joys135/ML-Projects/blob/main/TF_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
# Report the installed TensorFlow version and how many GPUs the runtime can see.
print(tf.__version__)
# Use the stable tf.config.list_physical_devices API rather than its
# tf.config.experimental alias; the printed result is the same.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.17.1
Num GPUs Available:  0


In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Tiny dataset: three short sentences used as the whole corpus.
# The last sentence has fewer words than the other two, so it is the one
# that shows padding in the vectorized output.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions."
]

In [None]:
# Upper bound on vocabulary size; passed as `max_tokens` to every TextVectorization layer below.
MAX_VOCAB_SIZE = 20_000

In [None]:
# Baseline vectorizer. The last three arguments are the layer's documented
# defaults, written out explicitly so the preprocessing steps are visible.
vectorization_layer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
)

In [None]:
# Fit the layer on the corpus so it builds its vocabulary from `sentences`.
vectorization_layer.adapt(sentences)

In [None]:
# Map each sentence to a row of integer token ids.
# Shorter sentences are filled with 0 at the end so all rows have equal length.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]], shape=(3, 5), dtype=int64)


In [None]:
# Show the learned vocabulary; a token's position in this list is its integer id.
# The first two entries are the reserved '' (id 0, used for padding) and '[UNK]' (id 1).
vectorization_layer.get_vocabulary()

['',
 '[UNK]',
 'i',
 'and',
 'onions',
 'love',
 'like',
 'hate',
 'ham',
 'eggs',
 'chocolate',
 'bunnies']

In [None]:
# Word-to-index mapping: invert the vocabulary list (position -> token)
# into a dictionary keyed by token.
word_2_idx = {
    token: position
    for position, token in enumerate(vectorization_layer.get_vocabulary())
}
print(word_2_idx)

{'': 0, '[UNK]': 1, 'i': 2, 'and': 3, 'onions': 4, 'love': 5, 'like': 6, 'hate': 7, 'ham': 8, 'eggs': 9, 'chocolate': 10, 'bunnies': 11}


In [None]:
# Truncation: fix every output row to exactly 3 token ids via output_sequence_length.
vectorization_layer_truncated = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=3,
)

# Learn the vocabulary from the corpus.
vectorization_layer_truncated.adapt(sentences)

# Vectorize; only the first 3 tokens of each sentence are kept.
sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2  6  9]
 [ 2  5 10]
 [ 2  7  4]], shape=(3, 3), dtype=int64)


In [None]:
# Ragged output (TF backend only): each row keeps its own length, so no padding is added.
vectorization_layer_ragged = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True,
)

# Learn the vocabulary from the corpus.
vectorization_layer_ragged.adapt(sentences)

# Vectorize; the result is a tf.RaggedTensor rather than a dense tensor.
sequences_ragged = vectorization_layer_ragged(sentences)
print(sequences_ragged)

<tf.RaggedTensor [[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]>


In [None]:
# Pad at the front instead of the back.
# This is not supported by the TextVectorization layer itself, so the ragged
# rows are padded afterwards with pad_sequences.
from tensorflow.keras.utils import pad_sequences

# pad_sequences defaults, for reference:
#   maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0
# padding="pre" is the default; it is passed explicitly to make the
# front-padding visible in the call.
padded = pad_sequences(sequences_ragged.to_list(), padding="pre")
print(padded)

[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 0  0  2  7  4]]
