In [89]:
from tensorflow.keras.preprocessing.text import Tokenizer
# using padding 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [96]:
sentences = [
    'i love my dog',
    'I, love my cat'
]

In [97]:
# declare the tokenizer
# reference: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
tokenizer = Tokenizer(num_words=100)

In [98]:
def fit_tokenizer(tokenizer, sentences,on_text=True):
    # encode the words using the tokenizer
    if on_text:
        tokenizer.fit_on_texts(sentences)
    else:
        return tokenizer.texts_to_sequences(sentences)

In [99]:
def get_word_index(tokenizer):
    # get the indices of the words 
    return tokenizer.word_index

In [100]:
fit_tokenizer(tokenizer, sentences)
print(get_word_index(tokenizer))

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [101]:
sentences.append('You love my dog!')

In [102]:
fit_tokenizer(tokenizer, sentences)
print(get_word_index(tokenizer))

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [103]:
sentences.append('Do you think my dog is amazing?')

In [104]:
sequences = fit_tokenizer(tokenizer, sentences, on_text=False)

In [105]:
print(get_word_index(tokenizer))

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [106]:
print(sequences)

[[3, 1, 2, 4], [3, 1, 2, 5], [6, 1, 2, 4], [6, 2, 4]]


In [107]:
## Demonstrate example where words are not indexed 
def demonstrate_missing_words(tokenizer):
    test_data = ["i really love my dog",  # really is not in the word index
                 "my dog loves my manatee" # loves and manatee are not in either :( 
                ] 
    return fit_tokenizer(tokenizer, test_data, on_text=False)

In [108]:
sequences_wrong = demonstrate_missing_words(tokenizer)

In [109]:
print(get_word_index(tokenizer))
print(sequences_wrong)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}
[[3, 1, 2, 4], [2, 4, 2]]


In [110]:
# Lets try an example where we don't ignore missing words 
# using oov token
'''
oov_token, if given, it will be added to word_index and used to
    replace out-of-vocabulary words during text_to_sequence calls
'''
tokenizer_oov = Tokenizer(num_words=100, oov_token="<OOV>")

In [111]:
# encode the words
fit_tokenizer(tokenizer_oov, sentences)
print(get_word_index(tokenizer_oov))
# generate the sequences 
sequences = fit_tokenizer(tokenizer_oov, sentences, on_text=False)
print(sequences)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [112]:
test_seq = demonstrate_missing_words(tokenizer_oov)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [121]:
sentences

['i love my dog',
 'I, love my cat',
 'You love my dog!',
 'Do you think my dog is amazing?']

In [114]:
# Let's use padding :) 
# We will do the same as before but adding padding 

# encode the words
fit_tokenizer(tokenizer, sentences)

# generate the sequences
seq_no_padding = fit_tokenizer(tokenizer, sentences, on_text=False)

# adding the padding 
padded_seq = pad_sequences(seq_no_padding)

In [115]:
print(get_word_index(tokenizer))
print(seq_no_padding)
print(padded_seq)

{'my': 1, 'love': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[3, 2, 1, 4], [3, 2, 1, 5], [6, 2, 1, 4], [7, 6, 8, 1, 4, 9, 10]]
[[ 0  0  0  3  2  1  4]
 [ 0  0  0  3  2  1  5]
 [ 0  0  0  6  2  1  4]
 [ 7  6  8  1  4  9 10]]


In [128]:
# Now let's try to use it with different parameters 
tokenizer_oov = Tokenizer(num_words=100, oov_token="<OOV>")

# encode the words
fit_tokenizer(tokenizer_oov, sentences)
print("\n Word index: {}".format(get_word_index(tokenizer_oov)))

# generate the sequences 
sequences = fit_tokenizer(tokenizer_oov, sentences, on_text=False)
print("\n Sequences {}".format(sequences))

# adding the padding 
padded_seq = pad_sequences(seq_no_padding, maxlen=5)
print("\n Padded Sequences\n{}".format(padded_seq))


 Word index: {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

 Sequences [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

 Padded Sequences
[[ 0  3  2  1  4]
 [ 0  3  2  1  5]
 [ 0  6  2  1  4]
 [ 8  1  4  9 10]]


In [131]:
# let's test it 
test_seq = demonstrate_missing_words(tokenizer_oov)
print("\n Test Sequences\n{}".format(test_seq))

# adding the padding 
padded_seq = pad_sequences(test_seq, maxlen=10)
print(padded_seq)


 Test Sequences
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


In [132]:
# Only with 2
# adding the padding 
padded_seq = pad_sequences(test_seq, maxlen=2)
print(padded_seq)

[[2 4]
 [2 1]]
