In [14]:
import torch

In [2]:
import torch.nn as nn


# How to use torch.Embedding

In [3]:
# Embedding layer with 5 vocab size and 40 vector embeddings.
embedding = nn.Embedding(5, 40)
# passing the input
embed = embedding(torch.LongTensor([1]))
# printing the embedding
print(embed)

tensor([[-0.2223, -2.3308, -0.6108,  0.7259, -0.2903, -0.8751,  1.7559,  1.5828,
          0.2943, -0.1676, -0.9944,  1.5499, -1.8404, -1.1475, -1.2277, -1.2573,
          1.3946,  0.3825, -0.4465, -0.4496,  0.2936,  0.9580,  0.0199,  0.0070,
          0.5718, -1.0833,  0.7940, -1.3834, -0.7509,  2.3869, -0.5971, -0.9588,
          1.1152,  0.0605,  0.5032, -0.2663, -1.9301, -1.5963, -0.3757, -1.5852]],
       grad_fn=<EmbeddingBackward0>)


# Sample code

In [4]:
# creating the dictionary
word_to_ix = {'geeks': 0, 'for': 1, 'code': 2}
# creating embedding layer -3words in vocab, 5-dimennsional embedding
embedding = nn.Embedding(3, 5)

In [5]:
# converting the words to indices
lookup_tensor = torch.LongTensor([word_to_ix['geeks']])
print(f'index of geeks : {lookup_tensor}')
# accessing the embedding of the word 'geeks'
embed = embedding(lookup_tensor)
# printing the embedding
print(f'embeddin of geeks : {embed}')

index of geeks : tensor([0])
embeddin of geeks : tensor([[ 1.1538,  0.2501, -0.3989, -0.1271,  0.3281]],
       grad_fn=<EmbeddingBackward0>)


In [6]:
from collections import Counter

# this is going to be the dummy sentence :
sentences = 'this is the second example showing for the article at gfg. and doing this is actually really fun'
words = sentences.split(' ')

# creating the dictionary
vocab = Counter(words)
print(vocab)

Counter({'this': 2, 'is': 2, 'the': 2, 'second': 1, 'example': 1, 'showing': 1, 'for': 1, 'article': 1, 'at': 1, 'gfg.': 1, 'and': 1, 'doing': 1, 'actually': 1, 'really': 1, 'fun': 1})


In [7]:
print(vocab.get('this'))

2


In [8]:
vocab = sorted(vocab, key=vocab.get, reverse=True)
vocab_size = len(vocab)
print(vocab)

['this', 'is', 'the', 'second', 'example', 'showing', 'for', 'article', 'at', 'gfg.', 'and', 'doing', 'actually', 'really', 'fun']


In [9]:
# create a word to index dictionary from our Vocab dictionary
word2idx = {word: ind for ind, word in enumerate(vocab)}
word2idx

{'this': 0,
 'is': 1,
 'the': 2,
 'second': 3,
 'example': 4,
 'showing': 5,
 'for': 6,
 'article': 7,
 'at': 8,
 'gfg.': 9,
 'and': 10,
 'doing': 11,
 'actually': 12,
 'really': 13,
 'fun': 14}

In [10]:
encoded_sentences = [word2idx[word] for word in words]
print(encoded_sentences)
print(len(encoded_sentences))

[0, 1, 2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 0, 1, 12, 13, 14]
18


In [11]:
# assign a value to your embedding_dim
e_dim = 5

In [12]:
# itnitialize an embedding layer from torch
emb = nn.Embedding(vocab_size, e_dim, padding_idx=4)
word_vectors = emb(torch.LongTensor(encoded_sentences))
print(word_vectors)

tensor([[ 1.3914,  0.2771, -0.3728, -1.2869,  0.8437],
        [-0.5249, -0.2965,  0.6732,  0.6804,  1.6346],
        [ 0.1174,  1.5205,  1.1420,  0.3429,  1.0084],
        [ 2.3359, -0.1517, -0.0295,  0.5667, -0.1168],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.7529, -0.0876,  1.1717,  0.4178, -0.9127],
        [ 0.4264,  0.4380, -1.1234, -1.6403, -1.0220],
        [ 0.1174,  1.5205,  1.1420,  0.3429,  1.0084],
        [-1.6007,  0.1483, -1.4101,  0.3001, -0.7306],
        [-0.6380, -0.1692, -0.6443, -0.1396, -0.2226],
        [-0.8895,  0.9895,  1.8986, -0.2961, -0.4691],
        [-0.1516,  0.3009, -0.3534, -0.3958, -0.5701],
        [ 2.2056, -0.5948, -0.0100,  1.7196, -1.0177],
        [ 1.3914,  0.2771, -0.3728, -1.2869,  0.8437],
        [-0.5249, -0.2965,  0.6732,  0.6804,  1.6346],
        [ 0.6834,  0.9631, -0.5920, -0.5491, -1.0756],
        [ 1.3487, -0.6275, -1.9060, -0.0271, -0.6886],
        [ 0.2421, -0.8397, -1.2472, -0.1437,  0.0667]],
       gr

In [13]:
print('The word embeddings before training:')
for word, index in word2idx.items():
    print(f'word: {word}, \tindex: {index}, \tembedding: {word_vectors[index].data.numpy()}')

The word embeddings before training:
word: this, 	index: 0, 	embedding: [ 1.3913515   0.27706462 -0.37281248 -1.2869236   0.8436893 ]
word: is, 	index: 1, 	embedding: [-0.52494156 -0.2965001   0.67322344  0.6804152   1.6346322 ]
word: the, 	index: 2, 	embedding: [0.11736465 1.5205255  1.1420084  0.34287617 1.0083663 ]
word: second, 	index: 3, 	embedding: [ 2.335879   -0.15165833 -0.02950949  0.5667186  -0.11676569]
word: example, 	index: 4, 	embedding: [0. 0. 0. 0. 0.]
word: showing, 	index: 5, 	embedding: [ 0.75292563 -0.08755481  1.1717112   0.4177512  -0.9127103 ]
word: for, 	index: 6, 	embedding: [ 0.42641672  0.4379722  -1.1233633  -1.6403062  -1.022011  ]
word: article, 	index: 7, 	embedding: [0.11736465 1.5205255  1.1420084  0.34287617 1.0083663 ]
word: at, 	index: 8, 	embedding: [-1.6007452   0.14827983 -1.410069    0.30005357 -0.7306243 ]
word: gfg., 	index: 9, 	embedding: [-0.63801306 -0.16921443 -0.644254   -0.1396184  -0.22259837]
word: and, 	index: 10, 	embedding: [-0.8895

`padding_idx` padding idx should denote 0.