In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy training corpus: six short astronomy-themed sentences.
sentences = [
    'Stars twinkle brightly in the night sky.',
    'Space travel fascinates many people.',
    'The moon orbits around Earth.',
    'Astronauts float in zero gravity.',
    'Telescopes reveal distant galaxies.',
    'Planets revolve around the sun.',
]

print(sentences)

['Stars twinkle brightly in the night sky.', 'Space travel fascinates many people.', 'The moon orbits around Earth.', 'Astronauts float in zero gravity.', 'Telescopes reveal distant galaxies.', 'Planets revolve around the sun.']


In [3]:
# Word-level tokenizer. '<OOV>' is the placeholder token used to encode words
# that were not seen when the tokenizer was fitted.
# NOTE(review): num_words=100 is the vocabulary cap passed to Keras; this
# notebook's corpus yields far fewer entries, so the cap is presumably never
# reached here -- confirm against the Keras Tokenizer docs.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

In [5]:
# Build the vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)
# word -> integer index mapping. As the printed result shows, words are
# lowercased with punctuation stripped, '<OOV>' takes index 1, and the words
# that occur more than once ('the', 'in', 'around') get the next-smallest
# indices.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'the': 2, 'in': 3, 'around': 4, 'stars': 5, 'twinkle': 6, 'brightly': 7, 'night': 8, 'sky': 9, 'space': 10, 'travel': 11, 'fascinates': 12, 'many': 13, 'people': 14, 'moon': 15, 'orbits': 16, 'earth': 17, 'astronauts': 18, 'float': 19, 'zero': 20, 'gravity': 21, 'telescopes': 22, 'reveal': 23, 'distant': 24, 'galaxies': 25, 'planets': 26, 'revolve': 27, 'sun': 28}


In [6]:
# Encode every training sentence as a list of word indices (one list per
# sentence; list lengths differ because the sentences differ in word count).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 6, 7, 3, 2, 8, 9], [10, 11, 12, 13, 14], [2, 15, 16, 4, 17], [18, 19, 3, 20, 21], [22, 23, 24, 25], [26, 27, 4, 2, 28]]


In [7]:
# Default padding: with no maxlen given, each row is brought up to the length
# of the longest sequence (7 tokens here) by inserting zeros at the front.
padded = pad_sequences(sequences)
print("Padded Sequences: ", padded)

Padded Sequences:  [[ 5  6  7  3  2  8  9]
 [ 0  0 10 11 12 13 14]
 [ 0  0  2 15 16  4 17]
 [ 0  0 18 19  3 20 21]
 [ 0  0  0 22 23 24 25]
 [ 0  0 26 27  4  2 28]]


In [9]:
# Fixed width of 15 columns: rows are still zero-padded at the front, now up
# to the requested length rather than the longest sequence.
padded = pad_sequences(sequences, maxlen=15)
print(padded)

[[ 0  0  0  0  0  0  0  0  5  6  7  3  2  8  9]
 [ 0  0  0  0  0  0  0  0  0  0 10 11 12 13 14]
 [ 0  0  0  0  0  0  0  0  0  0  2 15 16  4 17]
 [ 0  0  0  0  0  0  0  0  0  0 18 19  3 20 21]
 [ 0  0  0  0  0  0  0  0  0  0  0 22 23 24 25]
 [ 0  0  0  0  0  0  0  0  0  0 26 27  4  2 28]]


In [10]:
# Same width of 15, but padding='post' places the zeros after the tokens
# instead of before them.
padded = pad_sequences(sequences, maxlen=15, padding='post')
print(padded)

[[ 5  6  7  3  2  8  9  0  0  0  0  0  0  0  0]
 [10 11 12 13 14  0  0  0  0  0  0  0  0  0  0]
 [ 2 15 16  4 17  0  0  0  0  0  0  0  0  0  0]
 [18 19  3 20 21  0  0  0  0  0  0  0  0  0  0]
 [22 23 24 25  0  0  0  0  0  0  0  0  0  0  0]
 [26 27  4  2 28  0  0  0  0  0  0  0  0  0  0]]


In [13]:
# maxlen shorter than every sequence: each row is truncated rather than
# padded. With the default settings the leading tokens are cut, so only the
# last three indices of each sentence remain.
padded = pad_sequences(sequences, maxlen=3)
print(padded)

[[ 2  8  9]
 [12 13 14]
 [16  4 17]
 [ 3 20 21]
 [23 24 25]
 [ 4  2 28]]


In [14]:
# Held-out sentences made mostly of words that do not occur in the training
# sentences; used to show how unseen words are encoded.
test_data = [
    'Comets leave shimmering trails in the cosmos.',
    'Nebulae are stellar nurseries in space.',
]

print(test_data)

['Comets leave shimmering trails in the cosmos.', 'Nebulae are stellar nurseries in space.']


In [15]:
# Report which integer the out-of-vocabulary placeholder was assigned in the
# fitted word index.
print(
    "<OOV> has the number",
    word_index['<OOV>'],
    "in the word index.",
)

<OOV> has the number 1 in the word index.


In [19]:
# Encode the held-out sentences with the already-fitted tokenizer. Words that
# were not in the training sentences come out as 1 (the '<OOV>' index); known
# words ('in', 'the', 'space') keep the indices they were given during fitting.
test_seq = tokenizer.texts_to_sequences(test_data)
print("Test sequences: ", test_seq)

Test sequences:  [[1, 1, 1, 1, 3, 2, 1], [1, 1, 1, 1, 3, 10]]


In [20]:
# Pad the encoded test sentences to a common width of 10; zeros are added at
# the front, as in the earlier default-padding examples.
padded = pad_sequences(test_seq, maxlen=10)
print(padded)

[[ 0  0  0  1  1  1  1  3  2  1]
 [ 0  0  0  0  1  1  1  1  3 10]]
