In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [12]:
# Smallest possible corpus: one sentence, so every word occurs exactly once.
example_sent0 = ['It is a sunny day']

# num_words caps the vocabulary size used when encoding text (see the Keras
# Tokenizer docs); 15 is more than this toy corpus needs.
tokenizer = Tokenizer(num_words=15)
tokenizer.fit_on_texts(example_sent0)

# word_index is the learned word -> integer id dictionary.
word_index = tokenizer.word_index
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5}


In [23]:
# Second example: three sentences that share most of their words.
example_sent1 = [
    'It is a sunny day',
    'It isn\'t a sunny day',
    'It is a cloudy day',
]

# A new tokenizer (no oov_token yet) replaces the one from the first example
# and learns its vocabulary from example_sent1 only.
tokenizer = Tokenizer(num_words=15)
tokenizer.fit_on_texts(example_sent1)

# Keep the word -> id mapping around; the next cell prints it again.
word_index = tokenizer.word_index
print(word_index)

{'it': 1, 'a': 2, 'day': 3, 'is': 4, 'sunny': 5, "isn't": 6, 'cloudy': 7}


Create sequences

In [25]:

# Sentences to encode with the tokenizer that is already in scope.
# The last one ends in 'dayddd', a word the tokenizer was not fitted on.
example_sent2 = [
    'It is a sunny day',
    'It isn\'t a sunny day',
    'It is a cloudy dayddd',
]

# Turn every sentence into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(example_sent2)

# Labelled view first, then the raw sentences next to their encodings.
print(f'Word index -->{word_index}', f'Sequence of words -->{sequences}', sep='\n')
print(example_sent2, sequences, sep='\n')

Word index -->{'it': 1, 'a': 2, 'day': 3, 'is': 4, 'sunny': 5, "isn't": 6, 'cloudy': 7}
Sequence of words -->[[1, 4, 2, 5, 3], [1, 6, 2, 5, 3], [1, 4, 2, 7]]
['It is a sunny day', "It isn't a sunny day", 'It is a cloudy dayddd']
[[1, 4, 2, 5, 3], [1, 6, 2, 5, 3], [1, 4, 2, 7]]


What about words that are not in the vocabulary? Use `oov_token` (an out-of-vocabulary token).

In [29]:
# Corpus the tokenizer is fitted on.
example_sent3 = [
    'It is a sunny day',
    'It isn\'t a sunny day',
    'It is a cloudy day',
]

# With oov_token set, "<oov>" gets its own entry in the word index.
tokenizer = Tokenizer(oov_token="<oov>", num_words=15)
tokenizer.fit_on_texts(example_sent3)
word_index = tokenizer.word_index
print(word_index)

# Added only after fitting, so 'an', 'unknown' and 'word' are not in the
# vocabulary and are encoded with the "<oov>" id.
example_sent3 += ["It is an unknown word."]

sequences = tokenizer.texts_to_sequences(example_sent3)

# Labelled view first, then the raw sentences next to their encodings.
print(f'Word index -->{word_index}', f'Sequence of words -->{sequences}', sep='\n')
print(example_sent3, sequences, sep='\n')


{'<oov>': 1, 'it': 2, 'a': 3, 'day': 4, 'is': 5, 'sunny': 6, "isn't": 7, 'cloudy': 8}
Word index -->{'<oov>': 1, 'it': 2, 'a': 3, 'day': 4, 'is': 5, 'sunny': 6, "isn't": 7, 'cloudy': 8}
Sequence of words -->[[2, 5, 3, 6, 4], [2, 7, 3, 6, 4], [2, 5, 3, 8, 4], [2, 5, 1, 1, 1]]
['It is a sunny day', "It isn't a sunny day", 'It is a cloudy day', 'It is an unknown word.']
[[2, 5, 3, 6, 4], [2, 7, 3, 6, 4], [2, 5, 3, 8, 4], [2, 5, 1, 1, 1]]


Padding sequences

In [35]:
# Fitting corpus with sentences of different lengths (2 to 7 words), so the
# encoded sequences are ragged and need padding.
example_sent5 = [
    'It is a sunny day',
    'It isn\'t a sunny day',
    'It is a cloudy day',
    'It rains.',
    'A sunny day isn\'t a cloudy day',
]

tokenizer = Tokenizer(oov_token="<oov>", num_words=100)
tokenizer.fit_on_texts(example_sent5)
word_index = tokenizer.word_index
print(word_index)

# Added after fitting: its unseen words are encoded with the "<oov>" id.
example_sent5 += ["It is an unknown word."]

sequences = tokenizer.texts_to_sequences(example_sent5)

# Default padding: zeros are added in front until every row is as long as
# the longest sequence.
padded_seqs = pad_sequences(sequences)

print(f'Word index -->{word_index}', f'Sequence of words -->{sequences}', sep='\n')
print(example_sent5, sequences, padded_seqs, sep='\n')


{'<oov>': 1, 'a': 2, 'day': 3, 'it': 4, 'sunny': 5, 'is': 6, "isn't": 7, 'cloudy': 8, 'rains': 9}
Word index -->{'<oov>': 1, 'a': 2, 'day': 3, 'it': 4, 'sunny': 5, 'is': 6, "isn't": 7, 'cloudy': 8, 'rains': 9}
Sequence of words -->[[4, 6, 2, 5, 3], [4, 7, 2, 5, 3], [4, 6, 2, 8, 3], [4, 9], [2, 5, 3, 7, 2, 8, 3], [4, 6, 1, 1, 1]]
['It is a sunny day', "It isn't a sunny day", 'It is a cloudy day', 'It rains.', "A sunny day isn't a cloudy day", 'It is an unknown word.']
[[4, 6, 2, 5, 3], [4, 7, 2, 5, 3], [4, 6, 2, 8, 3], [4, 9], [2, 5, 3, 7, 2, 8, 3], [4, 6, 1, 1, 1]]
[[0 0 4 6 2 5 3]
 [0 0 4 7 2 5 3]
 [0 0 4 6 2 8 3]
 [0 0 0 0 0 4 9]
 [2 5 3 7 2 8 3]
 [0 0 4 6 1 1 1]]


Customized padding

In [41]:
# Re-pad the sequences from the previous cell to a fixed width of 6.
padded_seqs = pad_sequences(
    sequences,
    maxlen=6,            # every row ends up with exactly 6 ids
    padding="post",      # zeros go after the word ids instead of before
    truncating="post",   # longer sequences lose ids from the end
)
print(padded_seqs)

[[4 6 2 5 3 0]
 [4 7 2 5 3 0]
 [4 6 2 8 3 0]
 [4 9 0 0 0 0]
 [2 5 3 7 2 8]
 [4 6 1 1 1 0]]
