In [1]:
import tensorflow as tf
import sys

In [2]:
# Record the interpreter and TensorFlow versions so the outputs shown in this
# notebook can be tied to a specific environment.
print(f"Python: {sys.version}", f"TensorFlow: {tf.__version__}", sep="\n")

Python: 3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
TensorFlow: 2.1.0


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Toy corpus of three short sentences (two with 5 words, one with 3) used to
# demonstrate tokenization and padding.
# NOTE: "raddish" is a misspelling of "radish"; it is left as-is because the
# recorded vocabulary output below contains the token 'raddish'.
sentences = [
    "I like eggs and bread.",
    "I hate chocolate and raddish.",
    "I love onions."
]

In [6]:
# Upper bound handed to the Tokenizer as `num_words`. The toy corpus only has
# 10 distinct words, so this cap never takes effect here.
MAX_VOCAB_SIZE = 20_000

# Learn the vocabulary from the corpus ...
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

# ... then encode each sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [7]:
# Show the encoded corpus: one list of word indices per sentence, with lengths
# matching the sentences' word counts (5, 5 and 3).
sequences

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]

In [8]:
# Show the learned word -> index mapping. Keys are lowercased with punctuation
# removed (e.g. "bread." -> 'bread') and indices start at 1; 0 does not appear
# here and is the value used for padding in the cells below.
tokenizer.word_index

{'i': 1,
 'and': 2,
 'like': 3,
 'eggs': 4,
 'bread': 5,
 'hate': 6,
 'chocolate': 7,
 'raddish': 8,
 'love': 9,
 'onions': 10}

In [9]:
# Default behaviour: with no maxlen, rows are padded to the longest sequence
# (5 tokens here), and the zeros are inserted in front of the tokens, as the
# last row of the output shows.
data = pad_sequences(sequences)
data

array([[ 1,  3,  4,  2,  5],
       [ 1,  6,  7,  2,  8],
       [ 0,  0,  1,  9, 10]])

In [10]:
# Same call with an explicit target width. 5 equals the longest sentence, so
# the result is identical to the default-length output above.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
data

array([[ 1,  3,  4,  2,  5],
       [ 1,  6,  7,  2,  8],
       [ 0,  0,  1,  9, 10]])

In [11]:
# padding="post" places the zeros after the tokens instead of before them
# (compare the last row with the previous output).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
data

array([[ 1,  3,  4,  2,  5],
       [ 1,  6,  7,  2,  8],
       [ 1,  9, 10,  0,  0]])

In [12]:
# maxlen larger than the longest sentence (6 > 5): every row, including the
# full-length ones, receives at least one leading zero.
data = pad_sequences(sequences, maxlen=6)
data

array([[ 0,  1,  3,  4,  2,  5],
       [ 0,  1,  6,  7,  2,  8],
       [ 0,  0,  0,  1,  9, 10]])

In [13]:
# maxlen smaller than the longest sentence (4 < 5): rows that are too long are
# truncated. By default tokens are dropped from the front (the leading 1 is
# gone from the first two rows); the 3-token sentence is still front-padded.
data = pad_sequences(sequences, maxlen=4)
data

array([[ 3,  4,  2,  5],
       [ 6,  7,  2,  8],
       [ 0,  1,  9, 10]])

In [14]:
# truncating="post" drops tokens from the end instead (the first two rows now
# keep their leading 1 and lose their last token). Padding is controlled
# separately, so the short third row is still padded in front.
data = pad_sequences(sequences, maxlen=4, truncating="post")
data

array([[ 1,  3,  4,  2],
       [ 1,  6,  7,  2],
       [ 0,  1,  9, 10]])