In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
### sentences
# Small toy corpus used throughout the notebook; one sentence per entry.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are bad",
]

In [3]:
# Echo the corpus to verify it was defined as intended.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are bad']

In [4]:
## Define the vocabulary size
# Number of index slots available for words; it is passed to one_hot() and
# used as the Embedding layer's input_dim further down.
voc_size = 10_000

In [5]:
# Show the corpus once more right before it gets encoded.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are bad']

In [6]:
### One Hot Representation
# Encode every sentence as a list of integer word indices (one index per word,
# the same word always getting the same index within a run).
# NOTE(review): one_hot() derives indices by hashing, so the concrete numbers
# may differ between kernel sessions -- confirm before relying on exact values.
one_hot_repr = [one_hot(sentence, voc_size) for sentence in sent]
one_hot_repr

[[1028, 3637, 3031, 539],
 [1028, 3637, 3031, 8506],
 [1028, 2587, 3031, 4972],
 [7936, 3406, 2349, 595, 6975],
 [7936, 3406, 2349, 595, 5941],
 [6034, 1028, 7883, 3031, 2352],
 [4614, 592, 7144, 4012]]

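In a vocabulary of 10,000 slots, every word is mapped to an integer index; in the output above, 'the' is at index 1028 and 'milk' is at index 539. The output of one_hot_repr therefore shows, for each word, the position where the '1' would be placed in that word's one-hot vector, while all other positions of that vector would be '0'. Note that these indices are produced by hashing the word (so the exact values depend on the hash function and may change between sessions) and carry no meaning of their own: words with similar meanings are not placed near each other, and two different words can even collide on the same index. For example, 'good' is at 595 and 'bad' is at 4012, but that numeric distance says nothing about how related the two words are. Capturing similarity between words is the job of the embedding layer introduced below.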

In [7]:
## word Embedding Representation

from tensorflow.keras.layers import Embedding
#from tensorflow.keras.processing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [8]:
import numpy as np

In [9]:
# Pad every encoded sentence to one common length so they stack into a 2-D array.
sent_length = 8  # arbitrary choice; the longest sentence here has 5 words
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',  # zeros are inserted in front of the shorter sentences
)
print(embedded_docs)

[[   0    0    0    0 1028 3637 3031  539]
 [   0    0    0    0 1028 3637 3031 8506]
 [   0    0    0    0 1028 2587 3031 4972]
 [   0    0    0 7936 3406 2349  595 6975]
 [   0    0    0 7936 3406 2349  595 5941]
 [   0    0    0 6034 1028 7883 3031 2352]
 [   0    0    0    0 4614  592 7144 4012]]


In [10]:
## feature representation
# Size of the vector produced for each word index (the Embedding output_dim).
dim = 10

In [11]:
# Single-layer model: an Embedding lookup table with voc_size rows of dim floats.
# `input_length` is intentionally not passed: Keras 3 deprecates that argument
# and warns about it; the sequence length is supplied through build() instead.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim))
model.build(input_shape=(None, sent_length))  # (batch, sent_length) integer ids
# No training happens in the cells shown here, so the embeddings keep their
# random initial values; compile() only attaches the optimizer and loss.
model.compile(optimizer='adam', loss='mse')



In [12]:
# Print the layer / parameter overview of the built model.
model.summary()

In [13]:
# Look up the embedding vector for every index of every padded sentence.
model.predict(embedded_docs) # every word is represented by a 10-dimensional vector

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step


array([[[-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [ 0.03079828,  0.00735072,  0.02703509,  0.03173283,
          0.01457909, -0.00255675, -0.01154256, -0.02359885,
         -0.00338461,  0.03335166],
        [ 0.01581845,  0.04291118, -0.04334918,  0.00575993,
         -0.00673392,  0.02267339,  0.04234474, -0.02640917,
         -0.02734238, -0.03030062],
        [ 0.02703586, -0.04172341, -0.03625065,  0.0

In [14]:
# Padded index sequence of the first sentence (a 1-D array).
embedded_docs[0]

array([   0,    0,    0,    0, 1028, 3637, 3031,  539])

In [21]:
# predict() expects a 2-D (batch, sequence) array, so give the first padded
# sentence a leading batch axis. Deriving it from embedded_docs (rather than
# typing the indices by hand) keeps this cell correct if the indices change.
np.expand_dims(embedded_docs[0], axis=0)

array([[   0,    0,    0,    0, 1028, 3637, 3031,  539]])

In [22]:
# Embed only the first sentence. Slicing with [:1] keeps the 2-D batch shape
# and stays in sync with embedded_docs instead of repeating literal indices.
model.predict(embedded_docs[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step


array([[[-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [-0.00502623,  0.00100422, -0.04058547, -0.00859774,
         -0.02143321,  0.04685325, -0.04598395,  0.02912192,
         -0.0377673 ,  0.00062136],
        [ 0.03079828,  0.00735072,  0.02703509,  0.03173283,
          0.01457909, -0.00255675, -0.01154256, -0.02359885,
         -0.00338461,  0.03335166],
        [ 0.01581845,  0.04291118, -0.04334918,  0.00575993,
         -0.00673392,  0.02267339,  0.04234474, -0.02640917,
         -0.02734238, -0.03030062],
        [ 0.02703586, -0.04172341, -0.03625065,  0.0