In [1]:
# NLP Generating Sequences and Padding
# 1. Convert input sentences into a sequence of tokens
# 2. Pad the token sequences to a uniform length before feeding them into the model
#-----------------------------------------------------------------------
# word_index = dictionary that maps each word in the corpus text to its integer token
# texts_to_sequences() = converts each of the input sentences into a sequence of tokens

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Training corpus: four short sentences with mixed casing and punctuation.
# NOTE(review): "dig" in the last sentence looks like a typo for "dog"; it is
# kept unchanged because the recorded outputs of this notebook contain a
# 'dig' token.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dig is amazing?",
]

# Words missing from the fitted vocabulary are encoded with the
# "<OOV>" (Out Of Vocabulary) placeholder token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)

# Build the vocabulary from the corpus
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of integer tokens
sequences = tokenizer.texts_to_sequences(sentences)

# Lookup table: word -> integer token
word_index = tokenizer.word_index

print("\nWord index = ", word_index)
print("\nSequences = ", sequences)


Word index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'dig': 10, 'is': 11, 'amazing': 12}

Sequences =  [[4, 3, 2, 5], [4, 3, 2, 7], [6, 3, 2, 5], [8, 6, 9, 2, 10, 11, 12]]


In [3]:
# PADDING
# pad_sequences brings every sequence to one common length.
#   padding    = side on which the zeros are added ('pre' by default)
#   truncating = side from which tokens are dropped when a sequence is too long
#   maxlen     = target length; when omitted, the longest sequence sets the length

# Defaults for padding/truncating: zeros go in front, extra tokens are cut from the front
print("Default pre padding")
padded = pad_sequences(sequences, maxlen=5)
print(padded)

# No maxlen: every row is extended at the end to the longest sequence
print("Post padding")
padded = pad_sequences(sequences, padding="post")
print(padded)

# Fixed length of 5: zeros appended at the end, extra tokens cut from the end
print("Post padding with maxlen and truncating")
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="post")
print(padded)

Default pre padding
[[ 0  4  3  2  5]
 [ 0  4  3  2  7]
 [ 0  6  3  2  5]
 [ 9  2 10 11 12]]
Post padding
[[ 4  3  2  5  0  0  0]
 [ 4  3  2  7  0  0  0]
 [ 6  3  2  5  0  0  0]
 [ 8  6  9  2 10 11 12]]
Post padding with maxlen and truncating
[[ 4  3  2  5  0]
 [ 4  3  2  7  0]
 [ 6  3  2  5  0]
 [ 8  6  9  2 10]]


In [4]:
# OUT-OF-VOCABULARY
# New text arrives after the tokenizer was fitted and word_index is NOT rebuilt,
# so any word missing from word_index is encoded as the <OOV> token.

# Sentences containing words the tokenizer was never fitted on
test_data = [
    "i really love my dog",
    "my dog loves my parrot",
]

# Encode the new sentences with the already-fitted tokenizer
test_seq = tokenizer.texts_to_sequences(test_data)

print("Word index = ", word_index)
print("Test sequence = ", test_seq)

# NOTE(review): "dequence" in the two labels below is a typo for "sequence";
# the strings are kept unchanged so they match the recorded cell output.

# Pre-padding (the default): zeros are placed in front of the tokens
padded = pad_sequences(test_seq, maxlen=10)
print("Padded test dequence = ", padded, sep="\n")

# Post-padding: zeros are placed after the tokens
padded = pad_sequences(test_seq, maxlen=10, padding="post")
print("Padded test dequence = ", padded, sep="\n")


Word index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'i': 4, 'dog': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'dig': 10, 'is': 11, 'amazing': 12}
Test sequence =  [[4, 1, 3, 2, 5], [2, 5, 1, 2, 1]]
Padded test dequence = 
[[0 0 0 0 0 4 1 3 2 5]
 [0 0 0 0 0 2 5 1 2 1]]
Padded test dequence = 
[[4 1 3 2 5 0 0 0 0 0]
 [2 5 1 2 1 0 0 0 0 0]]
