In [None]:
"""
https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
bcolz: columnar and compressed data container


"""

In [1]:
import torch
import torch.nn as nn
import bcolz
import numpy as np
import pickle


In [2]:
glove_path = "glove.6B/glove.6B.50d.txt"
glove_path_out = "glove.6B/glove.6B.50d.dat"
EMBEDDING_DIM = 50

In [22]:
words = []
idx = 0
word2idx = {}

# File to save the embeddings
vectors = bcolz.carray(np.zeros(1), rootdir=glove_path_out, mode='w')

In [24]:
with open(glove_path, encoding='utf-8', mode="r") as f:
    for l in f:
        line = l.split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

vocab_size = len(words)
vectors = bcolz.carray(vectors[1:].reshape((vocab_size, EMBEDDING_DIM)), rootdir=glove_path_out, mode='w')
vectors.flush()
pickle.dump(words, open("6B.50_words.pkl","wb"))
pickle.dump(word2idx, open("6B.50_idx.pk", "wb"))

We can now create a dictionary that given a word returns its vectors

In [3]:
vectors = bcolz.open(glove_path_out)[:]
words = pickle.load(open("6B.50_words.pkl", "rb"))
word2idx = pickle.load(open("6B.50_idx.pk", "rb"))

glove = {w: vectors[word2idx[w]] for w in words}

In [4]:
print(glove["the"])

[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


Using this dictionary for creating the Pytorch embeddings

In [31]:
my_custom_vocab = ["xxpad", "xxunk", "xxkno","the", "friend", "year", "when"]

raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".replace(".","").replace(",","").lower().split()

vocabulary = my_custom_vocab + raw_text
vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
print("There are {} unique words".format(vocab_size))


word2id = {w:i for i,w in enumerate(set(vocabulary))}
id2word = {v:k for k,v in word2id.items()}

There are 50 unique words


In [32]:
weights_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
words_found = 0
print(weights_matrix.shape)

(50, 50)


In [33]:
for i, word in enumerate(vocabulary):
    print("{};{}".format(i, word))
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
        
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))
        

0;spirits
1;pattern
2;study
3;idea
4;with
5;process
6;xxpad
7;the
8;manipulate
9;beings
10;friend
11;data
12;conjure
13;abstract
14;evolve
15;effect
16;computer
17;a
18;our
19;when
20;we
21;about
22;people
23;by
24;that
25;evolution
26;direct
27;year
28;create
29;xxkno
30;computers
31;directed
32;program
33;they
34;other
35;xxunk
36;programs
37;computational
38;to
39;called
40;as
41;of
42;is
43;spells
44;inhabit
45;are
46;processes
47;things
48;rules
49;in


Create the embedding layer. This could be inside the NN class

In [45]:
def create_emb_layer(weights_matrix, trainable=True):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    #emb_layer.load_state_dict({"weight": weights_matrix})
    emb_layer.weight.data.copy_(torch.Tensor(weights_matrix))
    if not trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

embedding, v_size, emb_dim = create_emb_layer(weights_matrix, trainable=True)

In [46]:
w_id = torch.LongTensor([word2id["xxkno"]])
print(w_id)
print(embedding(w_id))

tensor([29])
tensor([[ 1.1993,  0.0340, -0.2373, -0.3520, -0.0186,  1.3704,  0.0654,  0.4761,
         -0.7745, -0.1020,  0.6647, -0.1310, -0.2796,  0.3668, -0.5716,  1.1656,
          0.6293,  0.3306, -0.6955, -0.1583,  1.3894, -0.3072,  1.3048, -0.1306,
         -0.9608,  0.4253, -0.6285, -0.6249,  1.4926,  0.4582, -0.3989, -0.3842,
         -0.9446,  0.1732, -0.1850,  0.3945,  0.7606,  0.1778, -0.2165,  0.9184,
         -0.5465,  0.0604, -0.2687, -0.0682, -0.0783, -0.0518,  0.0919,  0.2362,
         -0.6619,  0.9910]], grad_fn=<EmbeddingBackward>)
