Installing libraries

In [4]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Importing libraries

In [5]:
from tensorflow.keras.preprocessing.text import one_hot

Example Sentences

In [6]:
# Toy corpus: five short English sentences used by every later cell.
sentences = [
    'Joe waited for the train',
    'The train was late',
    'Mary and Samantha took the bus',
    'Misha walked and looked around',
    'Joe stood up and spoke to the crowd',
]

Defining the vocabulary size

In [9]:
# Size of the index space that words are mapped into; it is also passed to
# the Embedding layer as the number of rows in its lookup table.
vocabsize = 10_000

One hot encoding (Keras `one_hot` maps each word to an integer index, not to a 0/1 vector)



In [13]:
# Encode each sentence as a list of integer word indices (one list per
# sentence); the same word receives the same index wherever it appears.
onehot = [one_hot(sentence, n=vocabsize) for sentence in sentences]
print(onehot)

[[1203, 1093, 2621, 5260, 9038], [5260, 9038, 4444, 2745], [1601, 9065, 5730, 6932, 5260, 7586], [5238, 4512, 9065, 2754, 9367], [1203, 9503, 6980, 9065, 7028, 177, 5260, 2263]]


Word Embedding Representation

Importing keras libraries

In [14]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [15]:
import numpy as np 

Setting parameters - pre-padding every encoded sentence with 0s so that all sequences have the same length

In [18]:
# Left-pad ('pre') each encoded sentence with zeros up to a fixed length so
# that all rows line up into one rectangular integer array.
sentencelength = 10
emb = pad_sequences(onehot, maxlen=sentencelength, padding='pre')
print(emb)

[[   0    0    0    0    0 1203 1093 2621 5260 9038]
 [   0    0    0    0    0    0 5260 9038 4444 2745]
 [   0    0    0    0 1601 9065 5730 6932 5260 7586]
 [   0    0    0    0    0 5238 4512 9065 2754 9367]
 [   0    0 1203 9503 6980 9065 7028  177 5260 2263]]


Set the embedding dimension (the length of the vector learned for each word)

In [24]:
dim=15

Defining the model

In [25]:
model = Sequential()
model.add(Embedding(vocabsize, 10, input_length=sentencelength))
model.compile('adam', 'mse')

Model summary

In [26]:
# Print the layer table: output shape and parameter count of the Embedding layer.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 10)            100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


Checking the model's output (the model has not been trained, so these are the embedding layer's initial vectors)

In [27]:
# Run all padded sentences through the embedding layer: every word index is
# replaced by its vector. No fit() call is visible above, so presumably these
# are the layer's initial (untrained) weights.
# Identical rows in the output correspond to the repeated padding index 0.
model.predict(emb)

array([[[-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
         -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
         -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
          1.05447397e-02],
        [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
         -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
         -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
          1.05447397e-02],
        [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
         -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
         -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
          1.05447397e-02],
        [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
         -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
         -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
          1.05447397e-02],
        [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
         -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
         -3.55906859e-02,  1.83330663e-02,  4.673911

Understanding the embeddings on the first sentence

In [28]:
# Padded index sequence of the first sentence (leading zeros are padding).
emb[0]

array([   0,    0,    0,    0,    0, 1203, 1093, 2621, 5260, 9038],
      dtype=int32)

Each word index (including the padding 0s) gets converted into a vector of 10 dimensions!

In [29]:
# The model was built for input of shape (batch, sentencelength), but emb[0]
# is a 1-D array. Slice with emb[:1] to keep the batch axis (one sentence),
# then take [0] of the result to drop it again and get one row per word.
model.predict(emb[:1])[0]



array([[-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
        -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
        -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
         1.05447397e-02],
       [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
        -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
        -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
         1.05447397e-02],
       [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
        -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
        -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
         1.05447397e-02],
       [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
        -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
        -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
         1.05447397e-02],
       [-3.43587883e-02,  2.85050906e-02, -3.17335129e-02,
        -9.91593674e-03,  2.95152515e-03, -4.33785580e-02,
        -3.55906859e-02,  1.83330663e-02,  4.67391126e-02,
         1.