In [None]:
# Tokenization using Tensorflow
# Refer to the link below for details about what a Tokenizer is
# https://github.com/hari0624/PySpark-with-Examples/blob/master/Tokenizer_R_Tk.ipynb

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Training corpus: a plain Python list with one sentence (string) per entry.
# Despite its name, `df` is a list of strings, not a DataFrame.
df = ["Tensorflow is awesome", "Keras is also awesome", "Python is too awesome"]

In [10]:
# Create an instance of the Tokenizer object.
# num_words caps the vocabulary size; per the Keras docs only the
# (num_words - 1) most frequent words are used when texts are converted
# to sequences.
tokenizer = Tokenizer(num_words=50)

# Build the vocabulary from the sentences in df.
tokenizer.fit_on_texts(df)

# word_index is a dict mapping each word to its integer token.
token = tokenizer.word_index

# The result is displayed as key/value pairs:
# key -> word, value -> token for that word.
print(token)

{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}


In [None]:
# Now words in the text are represented by plain numbers (tokens).

In [12]:
# Encode each training sentence as the list of tokens of its words,
# keeping the words in their original order.
sequence = tokenizer.texts_to_sequences(df)

# Show the vocabulary and the encoded sentences, one per line.
print(token, sequence, sep="\n")

{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}
[[3, 1, 2], [4, 1, 5, 2], [6, 1, 7, 2]]


In [16]:
# Validation sentences containing words that do not appear in df.
val_data = ["I like to learn Tensorflow", "Everyone likes to learn Keras"]

In [17]:
# Encode the validation sentences with the tokenizer that was fitted on df.
# NOTE(review): judging by the printed output, words that were not seen
# during fitting are dropped from the resulting sequences.
val_sequence = tokenizer.texts_to_sequences(val_data)
print(val_sequence)

[[3], [4]]


In [None]:
# In the step above, words the tokenizer did not see during fitting are dropped from the output.
# To keep a placeholder for them, pass the 'oov_token' argument when creating the Tokenizer.

In [21]:
# Second tokenizer, this time with an out-of-vocabulary (OOV) placeholder.
# As the printed word index shows, the '<oov>' entry is given index 1 and
# the words learned from df are numbered from 2 onwards.
tokenizer1 = Tokenizer(num_words=50, oov_token="<oov>")
tokenizer1.fit_on_texts(df)
token1 = tokenizer1.word_index
print(token1)

{'<oov>': 1, 'is': 2, 'awesome': 3, 'tensorflow': 4, 'keras': 5, 'also': 6, 'python': 7, 'too': 8}


In [22]:
# Encode the same validation data with the OOV-aware tokenizer: each unseen
# word is now represented by the '<oov>' token (1) instead of being dropped.
val_seq1 = tokenizer1.texts_to_sequences(val_data)
print(val_seq1)

[[1, 1, 1, 1, 4], [1, 1, 1, 1, 5]]


In [23]:
# The sentences in the input data df have different lengths (3 or 4 words each).
# Use padding to bring every encoded sequence to the same length.

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Pad with the default settings: as the printed result shows, shorter
# sequences get zeros added at the front until all rows have equal length.
pad_df = pad_sequences(sequence)

# Print every heading followed by its value on the next line.
for heading, value in (
    ("Token", token),
    ("\nSequence", sequence),
    ("\nSequence after Padding", pad_df),
):
    print(heading)
    print(value)

Token
{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}

Sequence
[[3, 1, 2], [4, 1, 5, 2], [6, 1, 7, 2]]

Sequence after Padding
[[0 3 1 2]
 [4 1 5 2]
 [6 1 7 2]]


In [30]:
# Place the padding zeros at the end of each sequence (padding="post");
# the default puts them at the beginning.
pad_df = pad_sequences(sequence, padding="post")

# Heading and value are separated by a newline, matching one print per item.
print("Token", token, sep="\n")
print("\nSequence", sequence, sep="\n")
print("\nSequence after Padding", pad_df, sep="\n")

Token
{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}

Sequence
[[3, 1, 2], [4, 1, 5, 2], [6, 1, 7, 2]]

Sequence after Padding
[[3 1 2 0]
 [4 1 5 2]
 [6 1 7 2]]


In [31]:
# Fix the length of every padded sequence with maxlen. Sequences longer than
# maxlen are truncated; as the printed result shows, the default truncation
# removes values from the beginning ([4, 1, 5, 2] becomes [1 5 2]).
pad_df = pad_sequences(sequence, maxlen=3, padding="post")

# Headings mapped to the values printed beneath them, in display order.
report = {
    "Token": token,
    "\nSequence": sequence,
    "\nSequence after Padding": pad_df,
}
for heading, value in report.items():
    print(heading, value, sep="\n")

Token
{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}

Sequence
[[3, 1, 2], [4, 1, 5, 2], [6, 1, 7, 2]]

Sequence after Padding
[[3 1 2]
 [1 5 2]
 [1 7 2]]


In [33]:
# With only maxlen set, the longer sequences lose their leading values.
# Setting truncating="post" removes values from the end instead.
pad_df = pad_sequences(sequence, maxlen=3, padding="post", truncating="post")

# Print each heading with its value on the following line.
headings = ["Token", "\nSequence", "\nSequence after Padding"]
values = [token, sequence, pad_df]
for heading, value in zip(headings, values):
    print(heading)
    print(value)

Token
{'is': 1, 'awesome': 2, 'tensorflow': 3, 'keras': 4, 'also': 5, 'python': 6, 'too': 7}

Sequence
[[3, 1, 2], [4, 1, 5, 2], [6, 1, 7, 2]]

Sequence after Padding
[[3 1 2]
 [4 1 5]
 [6 1 7]]
