In [19]:
import numpy as np
import tensorflow as tf
import tensorflow_text as text
import tensorflow_datasets as tfds

In [38]:
# Toy corpus reused by both the TensorFlow Text and the Keras sections below.
raw_sequences = [
    'The weather will be nice tomorrow.',
    'How are you doing today.',
    'Hello world!'
]
# A flat list of strings has no ragged dimension, so a plain 1-D string
# tensor is the accurate representation: one scalar string per sentence.
sequences = tf.constant(raw_sequences)

Creating a dataset from a Python list

In [40]:
# Slice along the first axis: each dataset element is one sentence.
dataset = tf.data.Dataset.from_tensor_slices(tensors=sequences)

Using a tokenizer

Using TensorFlow Text

In [41]:
# Tokenizer from tensorflow_text, applied to every sentence in the dataset.
# NOTE: the name `tokenizer` is rebound to a Keras tokenizer in the
# "Using Keras" section further down.
tokenizer = text.UnicodeScriptTokenizer()
# The bound method is already a one-argument callable, so it can be handed
# to `map` directly.
tokenized_dataset = dataset.map(tokenizer.tokenize)

Checking tokens

In [43]:
# Pull only the first element to spot-check the tokenizer output.
iterator = iter(tokenized_dataset)
first_tokens = next(iterator)
print(first_tokens)

tf.Tensor([b'The' b'weather' b'will' b'be' b'nice' b'tomorrow' b'.'], shape=(7,), dtype=string)


Using Keras

In [46]:
# Keras tokenizer working on plain Python strings; rebinds `tokenizer`,
# replacing the TensorFlow Text tokenizer created earlier.
# NOTE(review): recent TensorFlow docs flag this class as deprecated in favour
# of tf.keras.layers.TextVectorization — confirm against the installed version.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
# Build the vocabulary from the toy corpus.
tokenizer.fit_on_texts(texts=raw_sequences)

In [47]:
# Statistics gathered while fitting, printed in this order:
#   word_counts    - word -> occurrence count
#   document_count - number of texts seen
#   word_index     - word -> integer id (ids start at 1)
#   word_docs      - word -> number of texts containing it
for stat_name in ("word_counts", "document_count", "word_index", "word_docs"):
    print(getattr(tokenizer, stat_name))

OrderedDict([('the', 1), ('weather', 1), ('will', 1), ('be', 1), ('nice', 1), ('tomorrow', 1), ('how', 1), ('are', 1), ('you', 1), ('doing', 1), ('today', 1), ('hello', 1), ('world', 1)])
3
{'the': 1, 'weather': 2, 'will': 3, 'be': 4, 'nice': 5, 'tomorrow': 6, 'how': 7, 'are': 8, 'you': 9, 'doing': 10, 'today': 11, 'hello': 12, 'world': 13}
defaultdict(<class 'int'>, {'nice': 1, 'weather': 1, 'the': 1, 'tomorrow': 1, 'will': 1, 'be': 1, 'how': 1, 'are': 1, 'you': 1, 'today': 1, 'doing': 1, 'world': 1, 'hello': 1})


In [52]:
# Replace every word by the integer id assigned in `tokenizer.word_index`;
# the result is one list of ids per input sentence.
tokenized_sequences = tokenizer.texts_to_sequences(texts=raw_sequences)
tokenized_sequences

[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11], [12, 13]]

In [53]:
# Right-pad the shorter id lists with 0 so that all rows share one length.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=tokenized_sequences,
    padding='post',
)
padded_sequences

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11,  0],
       [12, 13,  0,  0,  0,  0]], dtype=int32)

In [61]:
# (number of sentences, padded length)
np.shape(padded_sequences)

(3, 6)

You should enable masking on your Embedding layer (`mask_zero=True`) so that the padded positions (zeros) are marked and can be ignored.

In [57]:
# Rebinds `iterator` to walk over the rows of the padded id matrix.
# NOTE(review): nothing in the visible cells consumes this iterator, and its
# execution count (57) is out of order with the surrounding cells — confirm
# whether this cell is still needed.
iterator = iter(padded_sequences)

In [65]:
# With mask_zero=True the layer treats id 0 as padding and attaches a boolean
# mask to its output: False wherever `padded_sequences` holds 0.
embedding = tf.keras.layers.Embedding(
    input_dim=500,   # upper bound on token ids; the toy vocabulary is far smaller
    output_dim=4,    # size of each embedding vector
    mask_zero=True,
)
masked_output = embedding(padded_sequences)
# NOTE(review): `_keras_mask` is an underscore-prefixed (private) attribute;
# it is read here only to inspect the mask.
print(masked_output._keras_mask)

tf.Tensor(
[[ True  True  True  True  True  True]
 [ True  True  True  True  True False]
 [ True  True False False False False]], shape=(3, 6), dtype=bool)
