In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Three short example sentences used as the toy corpus for the demos below.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions.",
]

In [3]:
# Echo the raw sentences before any tokenization is applied.
print(sentences)

['I like eggs and ham.', 'I love chocolate and bunnies.', 'I hate onions.']


Split each sentence into words, then map each word to an integer index.

In [31]:
# Build the vocabulary from the sentences, then encode every sentence as a list of word indices.
MAX_VOCAB_SIZE = 20000 # <- upper bound on the number of distinct words to keep
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)  # learns the word -> integer mapping from the corpus
sequences = tokenizer.texts_to_sequences(sentences)  # one list of integers per sentence

In [32]:
# Each sentence is now a list of integer word indices; the lists still differ in length.
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [33]:
# Which word is mapped to which integer?
# Words are lowercased, punctuation is dropped, and more frequent words get smaller indices.
# Indices start at 1, so 0 stays free to serve as the padding value later.

tokenizer.word_index

{'i': 1,
 'and': 2,
 'like': 3,
 'eggs': 4,
 'ham': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'onions': 10}

In [34]:
# Bring all sequences to the same length so they form one rectangular array.
# This is called padding: shorter sequences are filled with zeros, at the
# front by default (padding='pre'), up to the length of the longest sequence.
data = pad_sequences(sequences, padding='pre')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [35]:
# The padded length can be larger than the longest sequence: with maxlen set,
# every row is front-padded with zeros up to that length.
MAX_SEQUENCE_LENGTH = 6
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='pre')
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [36]:
# Add the zeros at the end instead of the front. This is post-padding.
# With maxlen=5 (the longest sentence length) only the shortest sentence needs padding.

data = pad_sequences(sequences, maxlen=5, padding='post')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [28]:
# Post-padding to MAX_SEQUENCE_LENGTH (6), one more than the longest sentence:
# every row gets at least one trailing zero.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(data)

[[ 1  3  4  2  5  0]
 [ 1  6  7  2  8  0]
 [ 1  9 10  0  0  0]]


In [37]:
# maxlen can also be shorter than the longest sequence. Longer sequences are
# then cut down; this is called truncating. By default words are removed from
# the front (truncating='pre'), while shorter sequences are still front-padded.
data = pad_sequences(sequences, maxlen=4, padding='pre', truncating='pre')
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [38]:
# Truncate from the end instead: the last words of each over-long sentence are
# dropped. Padding is unaffected and still goes at the front.
data = pad_sequences(sequences, maxlen=4, padding='pre', truncating='post')
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
