In [20]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot

In [21]:

# Toy corpus: seven short sentences that will be integer-encoded below
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

# Size of the index space the words are mapped into
vocab_size = 10000

# Despite its name, `one_hot` returns one integer index per word (see the
# output below), not a one-hot vector. The same word always gets the same
# index within a run (e.g. 'the' in the first three sentences).
# NOTE(review): per the Keras docs the mapping is hash-based, so distinct
# words may collide and indices may differ between kernel sessions -- confirm.
one_hot_rep = [one_hot(sentence, n=vocab_size) for sentence in sent]
one_hot_rep

[[6910, 6898, 6426, 6741],
 [6910, 6898, 6426, 199],
 [6910, 506, 6426, 3282],
 [3052, 5528, 3769, 9942, 903],
 [3052, 5528, 3769, 9942, 3734],
 [8116, 6910, 9682, 6426, 9353],
 [749, 214, 1312, 9942]]

In [22]:
#word embedding representation  

# The Embedding layer is used to convert integer-encoded words into dense vector representations of fixed size
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
# pad_sequences brings all input sequences to the same length:
# it pads shorter sequences with zeros (or a specified value) and truncates longer ones, so every sentence ends up with the same number of tokens
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# Target length: every row of the padded matrix has exactly this many entries
sent_length = 8

# Bring all encoded sentences to the common length `sent_length`.
# - padding='pre' puts the filler zeros in front of the word indices
#   (padding='post' would append them at the end instead)
# - maxlen caps the length of each sequence
embedded_docs = pad_sequences(
    one_hot_rep,
    maxlen=sent_length,
    padding='pre',
)
embedded_docs

array([[   0,    0,    0,    0, 6910, 6898, 6426, 6741],
       [   0,    0,    0,    0, 6910, 6898, 6426,  199],
       [   0,    0,    0,    0, 6910,  506, 6426, 3282],
       [   0,    0,    0, 3052, 5528, 3769, 9942,  903],
       [   0,    0,    0, 3052, 5528, 3769, 9942, 3734],
       [   0,    0,    0, 8116, 6910, 9682, 6426, 9353],
       [   0,    0,    0,    0,  749,  214, 1312, 9942]])

##### How `model.add(Embedding(vocab_size, dim))` Works During Training
1. vocab_size:

    - Defines the number of unique words in the vocabulary.
    - Each word is assigned an integer index between 0 and vocab_size - 1.
    - The Embedding layer creates an embedding matrix of shape (vocab_size, dim).

2. dim:
    - Specifies the size of the dense vector for each word.
    - Each word index is mapped to a vector of size dim (e.g., [0.1, -0.2, 0.3, ...]).

3. During training (model.fit()):
    - The embedding matrix is initialized randomly.
    - When the model is trained using model.fit(), the embedding matrix is updated to learn meaningful word representations based on the task (e.g., sentiment analysis, text classification).

In [None]:
# Embedding dimension: length of the dense vector produced for each word index
dim = 10

# Sequential is a linear stack of layers; here it holds a single Embedding
# layer, which turns each integer index of an input sequence into a dense
# vector of fixed size.
model = Sequential()

# Embedding arguments used here:
#   input_dim  -- size of the vocabulary (number of distinct word indices)
#   output_dim -- size of the dense vector for each index
# Only these two arguments are passed; no input length is specified.
model.add(Embedding(input_dim=vocab_size, output_dim=dim))

# Compile the model with the Adam optimizer and mean-squared-error loss
model.compile(optimizer='adam', loss='mse')

In [29]:
# Run every padded sentence through the Embedding layer.
# The result is a 3D array of shape
# (number of samples, sequence length, embedding dimension);
# each innermost vector has 10 entries (one per embedding dimension).
model.predict(x=embedded_docs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


array([[[ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [-0.00630487, -0.0073431 , -0.04714298, -0.01194265,
         -0.03146086,  0.0248618 , -0.02722542, -0.03945141,
          0.04451882, -0.0379189 ],
        [-0.04538488, -0.02755177, -0.01612412,  0.00679729,
         -0.04300221,  0.00736079, -0.02943142, -0.04961586,
          0.03260313, -0.03274776],
        [ 0.03460444, -0.04997913,  0.0356061 ,  0.0

##### The Embedding layer converts all integer tokens in the input sequence, including the padding token (0), into dense vectors. This is because the Embedding layer maps every integer index (from 0 to vocab_size - 1) to a corresponding dense vector in the embedding matrix. That is why the leading padding positions in the output above all show the same vector.

In [None]:
# Show the integer-encoded, zero-padded form of the first sentence
print(embedded_docs[0])

# Embed only that first sentence. The slice [:1] (rather than plain [0])
# keeps the leading batch axis, so the model receives a batch of one
# sequence and returns a 3D array:
# - axis 0: batch size (1 here)
# - axis 1: sequence length (sent_length)
# - axis 2: embedding size (dim)
model.predict(embedded_docs[:1])

[   0    0    0    0 6910 6898 6426 6741]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


array([[[ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [ 0.04989907, -0.03139565,  0.04958474, -0.00162212,
          0.0269662 , -0.02546694, -0.02383177,  0.00211507,
          0.00839097, -0.01155689],
        [-0.00630487, -0.0073431 , -0.04714298, -0.01194265,
         -0.03146086,  0.0248618 , -0.02722542, -0.03945141,
          0.04451882, -0.0379189 ],
        [-0.04538488, -0.02755177, -0.01612412,  0.00679729,
         -0.04300221,  0.00736079, -0.02943142, -0.04961586,
          0.03260313, -0.03274776],
        [ 0.03460444, -0.04997913,  0.0356061 ,  0.0