## Word Embedding

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [3]:
sentences = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# 1. convert sentences to vectors to one hot representation
# 2. create a vocabulary of 10000 words
# 3. each word is represented by a vector of size 10000
vocab_size = 10000

In [5]:
one_hot_rep = [one_hot(sentence, vocab_size) for sentence in sentences]
one_hot_rep

[[575, 499, 8261, 4405],
 [575, 499, 8261, 1147],
 [575, 4765, 8261, 5692],
 [5191, 4416, 126, 7506, 6889],
 [3678, 575, 1912, 8261, 6920],
 [8730, 6999, 4488, 7506]]

### Word Embedding Representation

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [9]:
sentence_length = 8
embedded_docs = pad_sequences(one_hot_rep, padding='pre', maxlen=sentence_length) # necessary to make all sentences of same length for LSTM RNN also
embedded_docs

array([[   0,    0,    0,    0,  575,  499, 8261, 4405],
       [   0,    0,    0,    0,  575,  499, 8261, 1147],
       [   0,    0,    0,    0,  575, 4765, 8261, 5692],
       [   0,    0,    0, 5191, 4416,  126, 7506, 6889],
       [   0,    0,    0, 3678,  575, 1912, 8261, 6920],
       [   0,    0,    0,    0, 8730, 6999, 4488, 7506]])

In [10]:
dim = 10 # dimension of embedding vector or features

In [11]:
model = Sequential()
model.add(Embedding(vocab_size, dim, input_length=sentence_length))
model.compile('adam', 'mse')

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.predict(embedded_docs)



array([[[ 0.04007648, -0.02595162, -0.02337232, -0.03713144,
         -0.04732944, -0.03716077,  0.00224041, -0.0323565 ,
         -0.04275134,  0.03464489],
        [ 0.04007648, -0.02595162, -0.02337232, -0.03713144,
         -0.04732944, -0.03716077,  0.00224041, -0.0323565 ,
         -0.04275134,  0.03464489],
        [ 0.04007648, -0.02595162, -0.02337232, -0.03713144,
         -0.04732944, -0.03716077,  0.00224041, -0.0323565 ,
         -0.04275134,  0.03464489],
        [ 0.04007648, -0.02595162, -0.02337232, -0.03713144,
         -0.04732944, -0.03716077,  0.00224041, -0.0323565 ,
         -0.04275134,  0.03464489],
        [ 0.03779537,  0.0462762 , -0.03629637, -0.00054989,
         -0.02204815,  0.00195328, -0.01342442,  0.03885679,
          0.00618809, -0.04226302],
        [-0.04335821,  0.00442786,  0.02711283,  0.0280765 ,
         -0.00611641,  0.01187855, -0.00318776, -0.00550003,
          0.03759656,  0.0317344 ],
        [-0.03596169,  0.04720404, -0.04544955,  0.0

In [14]:
embedded_docs[0]

array([   0,    0,    0,    0,  575,  499, 8261, 4405])

In [16]:
model.predict(embedded_docs)[0] # this is the embedding vector for the first sentence



array([[ 0.04007648, -0.02595162, -0.02337232, -0.03713144, -0.04732944,
        -0.03716077,  0.00224041, -0.0323565 , -0.04275134,  0.03464489],
       [ 0.04007648, -0.02595162, -0.02337232, -0.03713144, -0.04732944,
        -0.03716077,  0.00224041, -0.0323565 , -0.04275134,  0.03464489],
       [ 0.04007648, -0.02595162, -0.02337232, -0.03713144, -0.04732944,
        -0.03716077,  0.00224041, -0.0323565 , -0.04275134,  0.03464489],
       [ 0.04007648, -0.02595162, -0.02337232, -0.03713144, -0.04732944,
        -0.03716077,  0.00224041, -0.0323565 , -0.04275134,  0.03464489],
       [ 0.03779537,  0.0462762 , -0.03629637, -0.00054989, -0.02204815,
         0.00195328, -0.01342442,  0.03885679,  0.00618809, -0.04226302],
       [-0.04335821,  0.00442786,  0.02711283,  0.0280765 , -0.00611641,
         0.01187855, -0.00318776, -0.00550003,  0.03759656,  0.0317344 ],
       [-0.03596169,  0.04720404, -0.04544955,  0.04319513,  0.03074609,
        -0.02938291,  0.04612049, -0.00847537