In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: three short sentences of different lengths (5, 5 and 3 words),
# so the effect of padding and truncation is easy to see further down.
sentences = [
    "I like eggs and ham.",
    "I like chocolate and bunnies.",
    "I hate onions.",
]

In [3]:
# Passed to the tokenizer as num_words; far larger than this toy corpus needs,
# so every word seen during fitting is kept when encoding.
MAX_VOCAB_SIZE = 20000

# Step 1: build the word -> integer index vocabulary from the corpus.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts=sentences)

# Step 2: encode each sentence as a list of word indices (lengths still vary).
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[1, 2, 4, 3, 5], [1, 2, 6, 3, 7], [1, 8, 9]]

In [4]:
# Inspect the learned vocabulary: a dict from word to integer index.
# As the output shows, words are lower-cased, punctuation is dropped, and
# indices start at 1 (0 is what pad_sequences fills with below).
tokenizer.word_index

{'i': 1,
 'like': 2,
 'and': 3,
 'eggs': 4,
 'ham': 5,
 'chocolate': 6,
 'bunnies': 7,
 'hate': 8,
 'onions': 9}

In [5]:
# maxlen=None (the default) pads every row up to the longest sequence, here 5
# tokens. Zeros are inserted at the front of shorter rows.
data = pad_sequences(sequences, maxlen=None)
data

array([[1, 2, 4, 3, 5],
       [1, 2, 6, 3, 7],
       [0, 0, 1, 8, 9]], dtype=int32)

In [6]:
# Pin the target length explicitly. 5 equals the longest sentence, so nothing
# is truncated and only the 3-token row receives leading zeros.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
data

array([[1, 2, 4, 3, 5],
       [1, 2, 6, 3, 7],
       [0, 0, 1, 8, 9]], dtype=int32)

In [7]:
# Same target length, but pad at the end of each row instead of the front:
# the zeros now trail the real tokens.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
)
data

array([[1, 2, 4, 3, 5],
       [1, 2, 6, 3, 7],
       [1, 8, 9, 0, 0]], dtype=int32)

In [8]:
# A target length longer than every sentence: all rows get at least one
# leading zero. padding="pre" is the default, spelled out here for clarity.
MAX_SEQUENCE_LENGTH = 6
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
)
data

array([[0, 1, 2, 4, 3, 5],
       [0, 1, 2, 6, 3, 7],
       [0, 0, 0, 1, 8, 9]], dtype=int32)

In [9]:
# A target length shorter than the 5-token sentences: they must be truncated.
# truncating="pre" is the default, so tokens are dropped from the FRONT
# (the leading "i" disappears from the first two rows).
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    truncating="pre",
)
data

array([[2, 4, 3, 5],
       [2, 6, 3, 7],
       [0, 1, 8, 9]], dtype=int32)

In [10]:
# Same short target length, but drop tokens from the END of over-long rows.
# Padding is independent of truncation and still happens at the front
# (padding="pre" is the default, spelled out here for clarity).
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="post",
)
data

array([[1, 2, 4, 3],
       [1, 2, 6, 3],
       [0, 1, 8, 9]], dtype=int32)