## Word Embedding using Keras and TensorFlow

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
import numpy as np

In [3]:
# Toy corpus: the seven short sentences used as input in this notebook.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [8]:
# Echo the corpus, one sentence per line, each followed by a dashed rule.
print("Following are the sentences : ")
for sentence in sent:
    print(sentence, "------------------------------", sep="\n")

Following are the sentences : 
the glass of milk
------------------------------
the glass of juice
------------------------------
the cup of tea
------------------------------
I am a good boy
------------------------------
I am a good developer
------------------------------
understand the meaning of words
------------------------------
your videos are good
------------------------------


In [9]:
# Vocabulary size, i.e. the size of the dictionary.
# It is the index range handed to one_hot and the number of rows
# requested from the Embedding layer further down.
vocab_size = 10_000

## One hot representation 

In [12]:
# Encode each sentence as a list of integer word indices, one list per sentence.
# NOTE(review): Keras documents one_hot as hash-based, so two different words
# may share an index and the values may differ between runs - confirm before
# relying on the exact numbers printed below.
onehot_repr = [one_hot(sentence, vocab_size) for sentence in sent]
for encoded_sentence in onehot_repr:
    print(encoded_sentence)

[3786, 1811, 1453, 9988]
[3786, 1811, 1453, 4966]
[3786, 8320, 1453, 3484]
[11, 1835, 4973, 6745, 3305]
[11, 1835, 4973, 6745, 1858]
[9334, 3786, 9787, 1453, 4544]
[1778, 3680, 9099, 6745]


# Embedding Matrix

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

### What is Pad Sequences?
Before building the embedding matrix, all sentences must have the same length.<br>
To achieve that, we pad the encoded sentences to a common length.

In [16]:
# Bring every encoded sentence to a common length of sent_length entries.
# With padding='pre' the zeros are added at the front of each sequence.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 3786 1811 1453 9988]
 [   0    0    0    0 3786 1811 1453 4966]
 [   0    0    0    0 3786 8320 1453 3484]
 [   0    0    0   11 1835 4973 6745 3305]
 [   0    0    0   11 1835 4973 6745 1858]
 [   0    0    0 9334 3786 9787 1453 4544]
 [   0    0    0    0 1778 3680 9099 6745]]


In [17]:
# Number of values per embedding vector; same value as the output
# dimension given to the Embedding layer in the model cell below.
dim = 10

In [19]:
# Single-layer model: an Embedding lookup that maps each of the vocab_size
# word indices to a vector of `dim` values, for inputs of sent_length tokens.
model = Sequential()
# Use the `dim` constant rather than repeating the literal 10, so the
# embedding size is configured in exactly one place.
model.add(Embedding(vocab_size, dim, input_length=sent_length))
# Optimizer and loss are passed by keyword for readability; no fit call
# appears in the cells shown, so they are not exercised here.
model.compile(optimizer='adam', loss='mse')

In [20]:
# Print the layer table: output shape and parameter count for each layer.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Run all padded sentences through the model to get one embedding vector per
# token position. No training step precedes this in the cells shown, so the
# vectors presumably reflect the layer's initial weights.
result = model.predict(embedded_docs)

In [28]:
# First Sentence
# Reuse the predictions already stored in `result` instead of running a
# second, identical model.predict call over all the documents.
x = result[0]

In [29]:
# Display the embedding vectors of the first sentence, one row per token
# position. The leading rows are identical because the first sentence starts
# with four padding zeros, which all map to the same embedding vector.
x

array([[-0.02405747,  0.03893941,  0.03394809, -0.04265183, -0.01425393,
         0.0082153 ,  0.04209557, -0.01613782, -0.03318463,  0.04494962],
       [-0.02405747,  0.03893941,  0.03394809, -0.04265183, -0.01425393,
         0.0082153 ,  0.04209557, -0.01613782, -0.03318463,  0.04494962],
       [-0.02405747,  0.03893941,  0.03394809, -0.04265183, -0.01425393,
         0.0082153 ,  0.04209557, -0.01613782, -0.03318463,  0.04494962],
       [-0.02405747,  0.03893941,  0.03394809, -0.04265183, -0.01425393,
         0.0082153 ,  0.04209557, -0.01613782, -0.03318463,  0.04494962],
       [-0.03117318,  0.03489931, -0.02428929, -0.03903877,  0.00063873,
        -0.04135499, -0.03878986,  0.00739026, -0.00257676,  0.04706749],
       [-0.01632384, -0.00690304, -0.03601353, -0.04386529, -0.02254364,
         0.0232651 ,  0.04190027,  0.02322766, -0.02862033,  0.03426293],
       [-0.04880688, -0.04546933,  0.01762163, -0.00642767, -0.02184128,
         0.03560868, -0.01611491, -0.00015531