In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
# Toy training corpus: four short sentences used to fit the tokenizers.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

**Tokenization**

In [15]:
tokenizer=Tokenizer(num_words=100)
# num_words => maximum number of words to keep (chosen by word frequency, per the Keras docs).
# So instead of manually assigning an index to every word, the Tokenizer does that for us.

In [16]:
tokenizer.fit_on_texts(sentences)
# Go through all the sentences and build the tokenizer's word -> index vocabulary from them.

In [21]:
# word_index maps each word seen during fitting to its integer index.
# As the printed result shows, words are lowercased, punctuation is removed
# ('dog!' -> 'dog') and the indices start at 1.
word_index=tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


**Turning sentences into data**

In [7]:
# Encode each sentence as a list of word indices taken from the fitted vocabulary.
sequences= tokenizer.texts_to_sequences(sentences)
sequences

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]

**Test data**

In [11]:
# Test sentences containing words ('really', 'loves', 'manatee') that do not
# appear in the training sentences.
# Kept in a list rather than a set: a set has no guaranteed iteration order, so
# the encoded sequences could come out in a different order from run to run.
test_data = [
    'i really love my dog',
    'my dog loves my manatee',
]

In [12]:
# Encode the test sentences with the tokenizer that was fitted on `sentences`.
# Words it never saw during fitting ('really', 'loves', 'manatee') are silently
# dropped, so the encoded sequences come out shorter than the sentences.
test_seq= tokenizer.texts_to_sequences(test_data)
test_seq

[[4, 2, 1, 3], [1, 3, 1]]

**Solving the problem: unseen (out-of-vocabulary) words**

In [17]:
# Pass oov_token to the Tokenizer so that words missing from the vocabulary are
# encoded with a placeholder index instead of being dropped.
# NOTE(review): the token is spelled '<00V>' with zeros; '<OOV>' (letters) was
# probably intended. Left unchanged because it is a key in the printed word_index.
tokenizer2=Tokenizer(num_words=100, oov_token='<00V>')

In [18]:
# Build the vocabulary of the OOV-aware tokenizer from the same training sentences.
tokenizer2.fit_on_texts(sentences)

In [19]:
# Re-encode the test sentences with the OOV-aware tokenizer: each unseen word now
# shows up as index 1 instead of being dropped, so no positions are lost.
test_seq2= tokenizer2.texts_to_sequences(test_data)
test_seq2

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

In [20]:
# Inspect the new vocabulary: the OOV token sits at index 1, ahead of the real words.
# NOTE: this rebinds `word_index`, which previously held tokenizer.word_index.
word_index=tokenizer2.word_index
print(word_index)

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [22]:
# We still lose some meaning (every unknown word becomes the same OOV token),
# but each encoded sentence is now at least the same length as the original sentence.

In a **neural network**, how can sentences of different lengths be handled? A more advanced option is a **RaggedTensor**, but here we use a simpler solution called **padding**.

In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Encode the training sentences with the OOV-aware tokenizer; the resulting lists
# have different lengths and are the input to the padding examples.
sequences2= tokenizer2.texts_to_sequences(sentences)
sequences2

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [27]:
padded= pad_sequences(sequences2)
padded
# pad_sequences makes every row the same length (here, that of the longest sequence)
# by adding zeros in front of the shorter sequences.

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  0,  0,  6,  3,  2,  4],
       [ 8,  6,  9,  2,  4, 10, 11]])

In [30]:
# padding='post' adds the zeros after each sequence instead of before it.
padded2= pad_sequences(sequences2, padding='post')
padded2

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  0,  0,  0],
       [ 8,  6,  9,  2,  4, 10, 11]])

In [31]:
# maxlen sets the row length explicitly instead of using the longest sequence.
# Sequences longer than maxlen get cut: as the output shows, the default drops
# values from the front (the last row keeps only its final 5 indices).
padded3= pad_sequences(sequences2, padding='post', maxlen=5)
padded3

array([[ 5,  3,  2,  4,  0],
       [ 5,  3,  2,  7,  0],
       [ 6,  3,  2,  4,  0],
       [ 9,  2,  4, 10, 11]])

In [33]:
# Same fixed length of 5 and trailing zeros, but truncating='post' cuts
# over-long sequences at the end, so their first 5 indices are kept.
padded4 = pad_sequences(
    sequences2,
    maxlen=5,
    padding='post',
    truncating='post',
)
padded4

array([[5, 3, 2, 4, 0],
       [5, 3, 2, 7, 0],
       [6, 3, 2, 4, 0],
       [8, 6, 9, 2, 4]])