In [1]:
# Keras Tokenizer: learn a word -> integer-index vocabulary from raw text.

from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'Jane really really likes me',
    'Jane likes movies',
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Show the learned vocabulary. In the recorded output the words are
# lower-cased and numbering starts at 1 (no word is mapped to 0).
tokenizer.word_index

{'jane': 1, 'really': 2, 'likes': 3, 'me': 4, 'movies': 5}

In [2]:
# texts_to_sequences: replace each word of every sentence with its index
# from the fitted vocabulary, giving one list of ints per sentence.

word_encoding = tokenizer.texts_to_sequences(sentences)
word_encoding

[[1, 2, 2, 3, 4], [1, 3, 5]]

In [3]:
# What happens when a sentence contains words that are not in the vocabulary?

new_sentences = ['Jane likes me and Tom']
new_word_encoding = tokenizer.texts_to_sequences(new_sentences)
new_word_encoding

# 'and' and 'Tom' were never seen during fitting; as the output shows, they
# are dropped from the sequence rather than encoded.

[[1, 3, 4]]

In [4]:
# Handling OOV (out-of-vocabulary) words with a placeholder token.

# NOTE: this rebinds `tokenizer`; `word_encoding` from the earlier cell was
# produced with the previous (no-OOV) vocabulary and is not recomputed here.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

new_word_encoding = tokenizer.texts_to_sequences(new_sentences)
word_index = tokenizer.word_index

print(word_index, new_word_encoding, sep='\n')

# '<OOV>' takes index 1 (every real word moves up by one), and the unseen
# words 'and' / 'Tom' are now encoded as 1 instead of being dropped.

{'<OOV>': 1, 'jane': 2, 'really': 3, 'likes': 4, 'me': 5, 'movies': 6}
[[2, 4, 5, 1, 1]]


In [5]:
# num_words: cap how many vocabulary indices are used when encoding.

tokenizer = Tokenizer(num_words=3, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

new_word_encoding = tokenizer.texts_to_sequences(new_sentences)
word_index = tokenizer.word_index

print(word_index, new_word_encoding, sep='\n')

# word_index still lists the full vocabulary, but in the encoded sentence only
# 'jane' (2) keeps its own index: 'likes' and 'me' are known words yet are
# encoded as 1, exactly like the unseen 'and' / 'Tom'.

{'<OOV>': 1, 'jane': 2, 'really': 3, 'likes': 4, 'me': 5, 'movies': 6}
[[2, 1, 1, 1, 1]]


In [6]:
# Padding: bring every sequence to the length of the longest one by filling with 0.

from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding='pre' is the default, spelled out here: the zeros go at the front.
padded = pad_sequences(word_encoding, padding='pre')
padded

array([[1, 2, 2, 3, 4],
       [0, 0, 1, 3, 5]])

In [7]:
# padding='post': fill with 0 at the end of each short sequence instead of the front.

padded = pad_sequences(word_encoding, padding='post')
padded

array([[1, 2, 2, 3, 4],
       [1, 3, 5, 0, 0]])

In [8]:
# maxlen: force every sequence to length 4; longer sequences are truncated.

# truncating='pre' is the default, spelled out here: items are dropped from the front.
padded = pad_sequences(word_encoding, maxlen=4, padding='post', truncating='pre')
padded

array([[2, 2, 3, 4],
       [1, 3, 5, 0]])

In [9]:
# maxlen with truncating='post': over-long sequences lose items from the end.

padded = pad_sequences(
    word_encoding,
    maxlen=4,
    padding='post',
    truncating='post',
)
padded

array([[1, 2, 2, 3],
       [1, 3, 5, 0]])