In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native (C++) logger, so it
# must already be in the environment when TensorFlow is first imported.
# Setting it after the import may leave the startup log messages unfiltered.
# '3' is the most restrictive level (INFO, WARNING and ERROR are filtered).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Deliberately imported after the environment tweak above, which is why these
# are not grouped directly under `import os`.
# NOTE(review): recent TensorFlow docs mark Tokenizer as deprecated in favour
# of tf.keras.layers.TextVectorization - confirm against the installed version.
from tensorflow.keras.preprocessing.text import Tokenizer  # noqa: E402
from tensorflow.keras.preprocessing.sequence import pad_sequences  # noqa: E402


In [8]:
# Toy corpus: three short English sentences that the tokenizer is fitted on.
sentences = ["I like to study.", "We like potatoes.", "Let's study and go for a walk."]

## Tokenizer

In [9]:
# Create the tokenizer with a vocabulary cap of 20,000 words. The toy corpus
# only contains 12 distinct words (see the word_index output), so the cap
# should not come into play here.
tokenizer = Tokenizer(num_words=20_000)
# Build the word -> index vocabulary from the example sentences.
tokenizer.fit_on_texts(sentences)

In [10]:
# Encode every sentence as a list of integer word indices, using the
# vocabulary stored in tokenizer.word_index.
sequences = tokenizer.texts_to_sequences(sentences)

In [11]:
# Show the encoded corpus: one list of word indices per input sentence, so the
# rows have different lengths (4, 3 and 7 tokens).
sequences

[[3, 1, 4, 2], [5, 1, 6], [7, 2, 8, 9, 10, 11, 12]]

In [12]:
# Inspect the learned word -> index mapping. In the output the words are
# lower-cased, the trailing periods are gone, the apostrophe in "let's" is
# kept, and indices start at 1 with the most frequent words ('like', 'study')
# first.
tokenizer.word_index

{'like': 1,
 'study': 2,
 'i': 3,
 'to': 4,
 'we': 5,
 'potatoes': 6,
 "let's": 7,
 'and': 8,
 'go': 9,
 'for': 10,
 'a': 11,
 'walk': 12}

## Padding

In [15]:
# Default call: every row is padded with 0 at the front up to the length of
# the longest sequence (7 tokens here), giving a rectangular int32 array.
pad_sequences(sequences)

array([[ 0,  0,  0,  3,  1,  4,  2],
       [ 0,  0,  0,  0,  5,  1,  6],
       [ 7,  2,  8,  9, 10, 11, 12]], dtype=int32)

In [16]:
# padding='post' appends the zeros after the tokens instead of before them.
pad_sequences(sequences, padding='post')

array([[ 3,  1,  4,  2,  0,  0,  0],
       [ 5,  1,  6,  0,  0,  0,  0],
       [ 7,  2,  8,  9, 10, 11, 12]], dtype=int32)

In [17]:
# maxlen=5 fixes the row width at 5: shorter rows are still front-padded, and
# the 7-token row is cut from the front, keeping only its last 5 tokens.
pad_sequences(sequences, maxlen=5)

array([[ 0,  3,  1,  4,  2],
       [ 0,  0,  5,  1,  6],
       [ 8,  9, 10, 11, 12]], dtype=int32)

In [18]:
# truncating='post' cuts over-long rows from the end instead, so the 7-token
# row keeps its first 5 tokens; padding of the short rows is unchanged.
pad_sequences(sequences, maxlen=5, truncating='post')

array([[ 0,  3,  1,  4,  2],
       [ 0,  0,  5,  1,  6],
       [ 7,  2,  8,  9, 10]], dtype=int32)