<a href="https://colab.research.google.com/github/kfl15/NLP/blob/main/NLP_model_data_pre_processing_steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Small toy corpus used to fit the tokenizer.
train_sentences = [
    "It is a sunny day",
    "old is gold",
    "Something should be tested without saying HELLO WORLD!!",
    "coudy morning in the street",
]

## Train Tokenizer

In [3]:
# Create the tokenizer; num_words bounds how many of the most frequent
# words are used when texts are later converted to sequences.
tokenizer = Tokenizer(
    num_words=100,
)

# Learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Keep a handle on the learned word -> integer-id dictionary.
word_index = tokenizer.word_index


In [4]:
# Show the word -> integer-id mapping taken from the fitted tokenizer.
print(word_index)

{'is': 1, 'it': 2, 'a': 3, 'sunny': 4, 'day': 5, 'old': 6, 'gold': 7, 'something': 8, 'should': 9, 'be': 10, 'tested': 11, 'without': 12, 'saying': 13, 'hello': 14, 'world': 15, 'coudy': 16, 'morning': 17, 'in': 18, 'the': 19, 'street': 20}


## Create sequences

In [5]:
# Encode every training sentence as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [6]:
# Show the vocabulary next to the encoded sentences so ids can be matched to words.
print(
    f"Word index -->{word_index}",
    f"Sequences of words -->{sequences}",
    sep="\n",
)

Word index -->{'is': 1, 'it': 2, 'a': 3, 'sunny': 4, 'day': 5, 'old': 6, 'gold': 7, 'something': 8, 'should': 9, 'be': 10, 'tested': 11, 'without': 12, 'saying': 13, 'hello': 14, 'world': 15, 'coudy': 16, 'morning': 17, 'in': 18, 'the': 19, 'street': 20}
Sequences of words -->[[2, 1, 3, 4, 5], [6, 1, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]


In [7]:
# Compare the first sentence with its encoded form, one per line.
print(train_sentences[0], sequences[0], sep="\n")

It is a sunny day
[2, 1, 3, 4, 5]


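## Tokenizing new data using the same tokenizer

Words that were not seen when the tokenizer was fitted have no entry in the word index, so they are silently dropped from the resulting sequences unless the tokenizer was created with an `oov_token`.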

In [8]:
# Sentences the tokenizer has not been fitted on; some of their words are
# absent from the training sentences.
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
]

In [9]:
# Encode the unseen sentences with the already-fitted tokenizer (no refit).
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [10]:
# Print the raw sentences followed by their encoded form, one per line.
print(new_sentences, new_sequences, sep="\n")

['Will it be raining today?', 'It is a pleasant day.']
[[2, 10], [2, 1, 3, 5]]


In [11]:
# Rebuild the tokenizer, this time with "<oov>" as the out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token="<oov>",
    num_words=100,
)

# Learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Keep a handle on the learned word -> integer-id dictionary.
word_index = tokenizer.word_index

In [12]:
# Re-encode the unseen sentences with the current tokenizer.
new_sequences = tokenizer.texts_to_sequences(new_sentences)

# Each label ends with a newline and print() separates its arguments with a
# single space, so every value is shown on its own line after one space.
print("WORD INDEX:\n", word_index, "\n\n")
print("SENTENCE:\n", new_sentences, "\n\n")
print("NEW SEQUENCES:\n", new_sequences)

WORD INDEX:
 {'<oov>': 1, 'is': 2, 'it': 3, 'a': 4, 'sunny': 5, 'day': 6, 'old': 7, 'gold': 8, 'something': 9, 'should': 10, 'be': 11, 'tested': 12, 'without': 13, 'saying': 14, 'hello': 15, 'world': 16, 'coudy': 17, 'morning': 18, 'in': 19, 'the': 20, 'street': 21} 


SENTENCE:
 ['Will it be raining today?', 'It is a pleasant day.'] 


NEW SEQUENCES:
 [[1, 3, 11, 1, 1], [3, 2, 4, 1, 6]]


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Second toy corpus; the sentences have different lengths, which makes them
# a useful input for the padding examples.
train_sentences = [
    "It will rain",
    "The weather is cloudy!",
    "Will it be raining today?",
    "It is a super hot day!",
]

In [15]:
# Fresh tokenizer with "<oov>" as the out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token="<oov>",
    num_words=100,
)

# Learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Keep a handle on the learned word -> integer-id dictionary.
word_index = tokenizer.word_index

In [16]:
# Encode every training sentence as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [17]:
# Pad with pad_sequences' default settings. In the output printed below the
# rows are left-padded with zeros up to the length of the longest sequence.
padded_seqs = pad_sequences(sequences)

In [18]:
# Summarise the pipeline: vocabulary, raw text, integer sequences, padded array.
# Each label ends with a newline and print() separates its arguments with a
# single space, so every value starts its line with one leading space.
print("Word index:\n", word_index)
print("\n\ntrain sentences\n", train_sentences)
print("\n\nsequences\n", sequences)
print("\n\npadded sequences\n", padded_seqs)

Word index:
 {'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}


train sentences
 ['It will rain', 'The weather is cloudy!', 'Will it be raining today?', 'It is a super hot day!']


sequences
 [[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]


padded sequences
 [[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]


In [19]:
# Force every sequence to exactly 5 ids: shorter sequences get zeros appended
# at the end, longer ones lose their trailing ids.
padded_seqs = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="post",
)

In [21]:
# Show the padded result produced by the most recent pad_sequences call.
print(padded_seqs)

[[ 2  3  5  0  0]
 [ 6  7  4  8  0]
 [ 3  2  9 10 11]
 [ 2  4 12 13 14]]
