In [None]:
"""
http://ronny.rest/blog/post_2017_08_04_glove/
"""

In [23]:
import torch
import torch.nn as nn
import numpy as np
from io import open

Create the vocabulary

In [1]:
glove_path = "glove.6B/glove.6B.50d.txt"
EMBEDDING_DIM = 50

In [6]:
my_custom_vocab = ["xxpad", "xxunk", "xxkno","the", "friend", "year", "when"]

raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".lower().split()

vocabulary = my_custom_vocab + raw_text
vocab_size = len(set(vocabulary))
print("There are {} unique words".format(vocab_size))

There are 52 unique words


In [10]:
word2id = {w:i for i,w in enumerate(set(vocabulary))}
id2word = {v:k for k,v in word2id.items()}

print(word2id)
print(id2word)

assert len(id2word) == vocab_size

{'process': 0, 'xxkno': 1, 'pattern': 2, 'they': 3, 'of': 4, 'conjure': 5, 'our': 6, 'to': 7, 'things': 8, 'create': 9, 'process.': 10, 'data.': 11, 'is': 12, 'computer': 13, 'beings': 14, 'abstract': 15, 'computational': 16, 'people': 17, 'a': 18, 'processes': 19, 'evolve,': 20, 'processes.': 21, 'in': 22, 'called': 23, 'other': 24, 'programs': 25, 'inhabit': 26, 'year': 27, 'xxunk': 28, 'directed': 29, 'that': 30, 'manipulate': 31, 'effect,': 32, 'as': 33, 'spirits': 34, 'idea': 35, 'by': 36, 'program.': 37, 'rules': 38, 'when': 39, 'are': 40, 'we': 41, 'spells.': 42, 'xxpad': 43, 'the': 44, 'with': 45, 'evolution': 46, 'direct': 47, 'study': 48, 'about': 49, 'computers.': 50, 'friend': 51}
{0: 'process', 1: 'xxkno', 2: 'pattern', 3: 'they', 4: 'of', 5: 'conjure', 6: 'our', 7: 'to', 8: 'things', 9: 'create', 10: 'process.', 11: 'data.', 12: 'is', 13: 'computer', 14: 'beings', 15: 'abstract', 16: 'computational', 17: 'people', 18: 'a', 19: 'processes', 20: 'evolve,', 21: 'processes.',

Initialize Embeddings to random values using numpy (this also can be done using Tensorflow)
using a variant of Xavier intialization.

In [13]:
sd = 1/np.sqrt(EMBEDDING_DIM)   # standad deviation to use
weights = np.random.normal(0, scale=sd, size=[vocab_size, EMBEDDING_DIM])
weights = weights.astype(np.float32)

print(weights.shape)
#print(weights[:2])

(52, 50)


Override the given word vectors from the GloVe text files

In [31]:
words_found = 0
with open(glove_path, encoding='utf-8', mode="r") as gloveFile:
    for line in gloveFile:
        #separate the values from the word
        line = line.split()
        word = line[0]

        # if word is in our vocab, then update the corresponding weights
        id = word2id.get(word, None)
        if id is not None:
            #print("Found custom word {}".format(word))
            weights[id] = np.array(line[1:], dtype=np.float32)
            words_found+=1

print("We found {} words in the glove embeddings".format(words_found))

We found 41 words in the glove embeddings


Updating embeddings in PyTorch model

In [26]:
embeds = nn.Embedding(vocab_size, EMBEDDING_DIM)
embeds.weight.data.copy_(torch.Tensor(weights))

print(embeds.weight.data.shape)

torch.Size([52, 50])


In [29]:
w_id = torch.LongTensor([word2id["xxkno"]])
print(embeds(w_id))




tensor([[ 0.1223, -0.0309, -0.2696,  0.0464, -0.1948, -0.2081, -0.0425, -0.1134,
         -0.0264, -0.2491, -0.0319,  0.1615, -0.0085, -0.2602,  0.2204, -0.0219,
         -0.0119,  0.2183, -0.0733,  0.0914, -0.1902,  0.0796, -0.0485, -0.0789,
         -0.0177, -0.1553,  0.1865,  0.0318, -0.1955, -0.0404, -0.1280, -0.1081,
          0.1468, -0.0935, -0.1383, -0.1361,  0.1183,  0.2113, -0.1948, -0.0160,
         -0.3406, -0.0755, -0.0789,  0.0301, -0.0257,  0.0624,  0.0186, -0.1054,
          0.0281, -0.1098]], grad_fn=<EmbeddingBackward>)
