In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus used to fit the tokenizer. The first sentence carries a leading
# "<br>" HTML tag; once the tokenizer is fitted it shows up in the vocabulary
# as the plain word 'br'. "Today"/"today" appear in mixed case and one
# sentence ends in "?", yet the fitted vocabulary only contains lowercase
# 'today' and no punctuation.
sentences = [
    "<br>Today is a rainy day",
    "Today is a sunny day",
    "Is it sunny Today?",
    "I really enjoyed walking in the snow today",
]

In [3]:
# num_words is an upper bound on how many of the most frequent words the
# tokenizer will use when encoding text; it is NOT the number of distinct
# words in the corpus. This corpus produces only 16 word_index entries
# (including the OOV marker), so the cap of 100 has no effect here.
# oov_token is the placeholder for words that were never seen during fitting;
# it is assigned index 1 in word_index.
tokenizer = Tokenizer(num_words = 100, oov_token = "<OOV>")
# Build the vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)
# Mapping of word -> integer index; more frequent words get smaller indices.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'today': 2, 'is': 3, 'a': 4, 'day': 5, 'sunny': 6, 'br': 7, 'rainy': 8, 'it': 9, 'i': 10, 'really': 11, 'enjoyed': 12, 'walking': 13, 'in': 14, 'the': 15, 'snow': 16}


In [4]:
# Encode each training sentence as a list of integer word indices taken from
# the fitted tokenizer's vocabulary (one inner list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[7, 2, 3, 4, 8, 5], [2, 3, 4, 6, 5], [3, 9, 6, 2], [10, 11, 12, 13, 14, 15, 16, 2]]


In [5]:
# Sentences that include words the tokenizer was not fitted on.
test_sentences = [
    'Today is a snowy day',
    'Will it be rainy tomorrow?'
]
# Unseen words ("snowy", "will", "be", "tomorrow") are encoded as index 1,
# the <OOV> token, rather than being dropped from the sequence.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)

[[2, 3, 4, 1, 5], [1, 9, 1, 8, 1]]


In [6]:
# With default arguments, shorter sequences are padded with zeros at the
# front ("pre" padding) so that every row is as long as the longest sequence.
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  7  2  3  4  8  5]
 [ 0  0  0  2  3  4  6  5]
 [ 0  0  0  0  3  9  6  2]
 [10 11 12 13 14 15 16  2]]


In [7]:
# padding='post' places the zeros after the tokens instead of before them;
# rows are still padded to the length of the longest sequence.
padded_post = pad_sequences(sequences, padding='post')
print(padded_post)

[[ 7  2  3  4  8  5  0  0]
 [ 2  3  4  6  5  0  0  0]
 [ 3  9  6  2  0  0  0  0]
 [10 11 12 13 14 15 16  2]]


In [8]:
# Cap every row at 6 tokens. Shorter rows are zero-padded at the end; the one
# longer row is cut at the front (truncating='pre' is the default, spelled
# out here for clarity), so it keeps its last 6 tokens.
max_padded = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
    truncating='pre',
)
print(max_padded)

[[ 7  2  3  4  8  5]
 [ 2  3  4  6  5  0]
 [ 3  9  6  2  0  0]
 [12 13 14 15 16  2]]


In [9]:
# Cap every row at 6 tokens with both padding and truncation applied at the
# end: shorter rows get trailing zeros, and the one longer row is cut at the
# end, so it keeps its first 6 tokens.
max_padded_post = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
    truncating='post',
)
print(max_padded_post)

[[ 7  2  3  4  8  5]
 [ 2  3  4  6  5  0]
 [ 3  9  6  2  0  0]
 [10 11 12 13 14 15]]


AttributeError: 'list' object has no attribute 'startswith'