In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus the tokenizer vocabulary is built from.
sentences = [
    "She is beautiful!",
    "I think I should talk to her.",
    "I have fallen in love with her.",
    "But her beauty is not the only reason I love her.",
    "She is intelligent and kind, and she loves nature and that aligns with me."
]

# Tokenizer settings, named so the intent of each value is visible.
VOCAB_SIZE = 100       # passed as num_words; this corpus has far fewer distinct words
OOV_TOKEN = "<OOV>"    # placeholder token for words not seen during fitting

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(sentences)

# word -> integer id mapping learned from the corpus; the OOV token gets id 1
# and, in the printed mapping, more frequent words get smaller ids.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'her': 3, 'she': 4, 'is': 5, 'and': 6, 'love': 7, 'with': 8, 'beautiful': 9, 'think': 10, 'should': 11, 'talk': 12, 'to': 13, 'have': 14, 'fallen': 15, 'in': 16, 'but': 17, 'beauty': 18, 'not': 19, 'the': 20, 'only': 21, 'reason': 22, 'intelligent': 23, 'kind': 24, 'loves': 25, 'nature': 26, 'that': 27, 'aligns': 28, 'me': 29}


In [12]:
# Encode each sentence as a list of integer ids, one id per word, using the
# mapping learned by fit_on_texts. The resulting lists have different lengths.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[4, 5, 9], [2, 10, 2, 11, 12, 13, 3], [2, 14, 15, 16, 7, 8, 3], [17, 3, 18, 5, 19, 20, 21, 22, 2, 7, 3], [4, 5, 23, 6, 24, 6, 4, 25, 26, 6, 27, 28, 8, 29]]


In [13]:
# By default pad_sequences pads every sequence to the length of the longest
# one; passing maxlen fixes the length explicitly instead (10 here).
# The defaults are spelled out because they matter for the result below:
#   padding="pre"     -> shorter sequences get zeros added at the FRONT
#   truncating="pre"  -> longer sequences lose tokens from the FRONT
# so the last two sentences (11 and 14 tokens) keep only their final 10 ids.
MAX_LEN = 10
padded = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding="pre",
    truncating="pre",
)
print(padded)

[[ 0  0  0  0  0  0  0  4  5  9]
 [ 0  0  0  2 10  2 11 12 13  3]
 [ 0  0  0  2 14 15 16  7  8  3]
 [ 3 18  5 19 20 21 22  2  7  3]
 [24  6  4 25 26  6 27 28  8 29]]


In [14]:
# Sentences the tokenizer was not fitted on; most of their words are not in
# the vocabulary and are therefore encoded as the OOV id (1).
test_data = [
    "Paris is a nice city.",
    "But there are no mountains.",
    "I want to go to aurora borealis."
]

test_seq = tokenizer.texts_to_sequences(test_data)

# NOTE: this rebinds `padded`, replacing the matrix built from `sequences`.
# No maxlen is given, so the width is the length of the longest test sentence
# (7 here) rather than the 10 used earlier.
padded = pad_sequences(test_seq, maxlen=None, padding="pre")
print(padded)

[[ 0  0  1  5  1  1  1]
 [ 0  0 17  1  1  1  1]
 [ 2  1 13  1 13  1  1]]


In [17]:
# Index 0 is the padding value inserted by pad_sequences; it is not a key of
# word_index (ids start at 1). Decoding the padded matrix directly makes every
# padding position render as "<OOV>", indistinguishable from a genuinely
# unknown word, so strip the padding before mapping ids back to words.
unpadded = [[int(idx) for idx in row if idx != 0] for row in padded]
tokenizer.sequences_to_texts(unpadded)

['<OOV> is <OOV> <OOV> <OOV>',
 'but <OOV> <OOV> <OOV> <OOV>',
 'i <OOV> to <OOV> to <OOV> <OOV>']