In [30]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer

In [31]:
sent=[ 'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [32]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [33]:
## Define the vocabulary size
voc_size=10000

In [36]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent)
word_index = tokenizer.word_index
reverse_word_index = {value: key for key, value in word_index.items()}

In [37]:
### One Hot Representation
one_hot_repr = [tokenizer.texts_to_sequences([sentence])[0] for sentence in sent]
one_hot_repr

[[1, 4, 2, 8],
 [1, 4, 2, 9],
 [1, 10, 2, 11],
 [5, 6, 7, 3, 12],
 [5, 6, 7, 3, 13],
 [14, 1, 15, 2, 16],
 [17, 18, 19, 3]]

In [38]:
#word embedding 

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [39]:
# Step 1: Calculate the number of words in each sentence
word_counts = [len(sentence.split()) for sentence in sent]

# Step 2: Find the maximum and minimum number of words
max_words = max(word_counts)
min_words = min(word_counts)

# Step 3: Find the sentence(s) with the maximum and minimum number of words
max_sentences = [sentence for sentence in sent if len(sentence.split()) == max_words]
min_sentences = [sentence for sentence in sent if len(sentence.split()) == min_words]

# Output the results
print("Maximum number of words in a sentence:", max_words)
print("Sentence(s) with the maximum number of words:", max_sentences)
print("Minimum number of words in a sentence:", min_words)
print("Sentence(s) with the minimum number of words:", min_sentences)

Maximum number of words in a sentence: 5
Sentence(s) with the maximum number of words: ['I am a good boy', 'I am a good developer', 'understand the meaning of words']
Minimum number of words in a sentence: 4
Sentence(s) with the minimum number of words: ['the glass of milk', 'the glass of juice', 'the cup of tea', 'your videos are good']


In [40]:
# padding is done to make the length of the sentences same 
sent_length=8
embedded_docs=pad_sequences(one_hot_repr,padding='pre',maxlen=sent_length)
embedded_docs

array([[ 0,  0,  0,  0,  1,  4,  2,  8],
       [ 0,  0,  0,  0,  1,  4,  2,  9],
       [ 0,  0,  0,  0,  1, 10,  2, 11],
       [ 0,  0,  0,  5,  6,  7,  3, 12],
       [ 0,  0,  0,  5,  6,  7,  3, 13],
       [ 0,  0,  0, 14,  1, 15,  2, 16],
       [ 0,  0,  0,  0, 17, 18, 19,  3]])

In [41]:
# feature representation
# using low value as the data is very small higher value tends to overfit
dim=10

In [42]:
model=Sequential()
model.add(Embedding(voc_size,dim,input_length=sent_length))
model.compile('adam','mse')

In [43]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.predict(embedded_docs)

array([[[ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582],
        [ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582],
        [ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582],
        [ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582],
        [-0.03771061,  0.04773815, -0.03868502, -0.03920982,
         -0.00615809, -0.02046227, -0.04131862,  0.01830781,
         -0.02059578,  0.0081237 ],
        [ 0.01453868, -0.04350867,  0.01670432, -0.01295882,
          0.03588894,  0.02593361,  0.03441106, -0.01329154,
          0.04564977, -0.02829648],
        [-0.00061341,  0.03806886, -0.0399695 , -0.0

In [45]:
embedded_docs[0]

array([0, 0, 0, 0, 1, 4, 2, 8])

In [46]:
model.predict(embedded_docs[0])



array([[[ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582]],

       [[ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582]],

       [[ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582]],

       [[ 0.00240875, -0.04836702, -0.01521522, -0.02489507,
          0.03593007,  0.04675932,  0.04894478,  0.02997616,
          0.0292495 , -0.03552582]],

       [[-0.03771061,  0.04773815, -0.03868502, -0.03920982,
         -0.00615809, -0.02046227, -0.04131862,  0.01830781,
         -0.02059578,  0.0081237 ]],

       [[ 0.01453868, -0.04350867,  0.01670432, -0.01295882,
          0.03588894,  0.02593361,  0.03441106, -0.01329154,
          0.04564977, -0.02829648]],

       [[-0.00061341,  0.03806886, -0.03

    Notes

Input Shape:

Your input to the model is embedded_docs[0], which has the shape (8,). This means it's a 1D array with 8 elements (each element is an integer representing a word index).

Embedding Layer:

The Embedding layer converts each word index into a dense vector of size dim=10.

Since the input has 8 word indices, the output will have 8 vectors, each of size 10.

Output Shape:

The output shape is (8, 10), where:

8 corresponds to the number of words in the input sequence.

10 corresponds to the dimensionality of the word embeddings (dim=10).

Batch Dimension:

When you use model.predict(), Keras adds an extra dimension for the batch size. Even if you pass a single sequence, it treats it as a batch of size 1.

This results in a 3D array of shape (1, 8, 10):

1: Batch size (1 sequence).

8: Sequence length (8 words).

10: Embedding dimension (10 features per word).

In [47]:
decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in embedded_docs[0]])
print(decoded_review)

? ? ? ? the glass of milk
