In [1]:
import os
import numpy as np
import string

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def seed_everything(seed):
    """
    Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's legacy global
    generator and PyTorch, then prints the seed that was applied.

    Args:
        seed (int): seed value handed to every generator
    """
    # Local import: ``random`` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    print(f"seed: {seed}")


seed_everything(42)

seed: 42


In [3]:
def load_glove_from_file(glove_filepath):
    """
    Load the GloVe embeddings from a space-separated text file.

    Every non-blank line is expected to look like ``word num1 num2 ...``.
    Blank lines are skipped, so the index stored for a word is always the
    row of its vector in the returned array.

    Args:
        glove_filepath (str): path to the glove embeddings file
    Returns:
        word_to_index (dict), embeddings (numpy.ndarray)
    Raises:
        ValueError: if the file contains no embedding lines
    """

    word_to_index = {}
    embeddings = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(glove_filepath, "r", encoding="utf-8") as fp:
        for line in fp:
            stripped = line.rstrip()  # drop the newline / trailing blanks
            if not stripped:
                continue
            fields = stripped.split(" ")  # each line: word num1 num2 ...
            # word = fields[0]; its row is the number of vectors stored so far
            word_to_index[fields[0]] = len(embeddings)
            embeddings.append(np.array([float(val) for val in fields[1:]]))

    if not embeddings:
        raise ValueError(f"no embeddings found in {glove_filepath!r}")
    return word_to_index, np.stack(embeddings)

In [4]:
def make_embedding_matrix(glove_filepath, words):
    """
    Build the embedding matrix for a given list of words.

    Row ``i`` of the result holds the GloVe vector of ``words[i]``. A word
    that is absent from the GloVe file gets a vector drawn with
    ``torch.nn.init.xavier_uniform_`` instead.

    Args:
        glove_filepath (str): file path to the glove embeddings
        words (list): list of words in the dataset
    Returns:
        final_embeddings (numpy.ndarray), shape (len(words), embedding size)
    """
    glove_rows, glove_vectors = load_glove_from_file(glove_filepath)
    vector_length = glove_vectors.shape[1]

    matrix = np.zeros((len(words), vector_length))

    for row, token in enumerate(words):
        glove_row = glove_rows.get(token)
        if glove_row is None:
            # Unknown word: random Xavier-uniform vector of the same length
            fallback = torch.ones(1, vector_length)
            torch.nn.init.xavier_uniform_(fallback)
            matrix[row, :] = fallback
        else:
            matrix[row, :] = glove_vectors[glove_row]

    return matrix

# Function load_glove_from_file(glove_filepath)

In [5]:
### Path of the GloVe text file read by the cells below (relative to the working directory)
glove_filepath = "glove.6B.100d.txt"

In [6]:
##### For demonstration purposes, we'll only use the first 3 lines of glove.6B.100d.txt.
##### The loop repeats the body of load_glove_from_file and prints every intermediate value.
word_to_index = {}
embeddings = []
### Read as UTF-8 explicitly so the result does not depend on the platform's default encoding
with open(glove_filepath, "r", encoding="utf-8") as fp:
    for index, line in enumerate(fp):
        ### stop once 3 lines have been processed
        if index>=3:
            break
        print(f"Index = {index}")
        line = line.split(" ")

        ### the first element in the list (i.e., the first word in the original line) is the word
        word = line[0]
        word_to_index[word] = index
        print(f"Word: {word}")
        print(f"Updated word_to_index:{word_to_index}")

        ### the remaining elements in the list are the components of the word vector
        ### transferring str to float
        embedding_i = np.array([float(val) for val in line[1:]])
        embeddings.append(embedding_i)

        print(f"Appended embedding vector: the vector for {word}")
        print(embedding_i)
        print("-"*100)

Index = 0
Word: the
Updated word_to_index:{'the': 0}
Appended embedding vector: the vector for the
[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   

In [7]:
### Now embeddings is a list holding one 1-D numpy array per line read above
print(f"The length of embedding: {len(embeddings)}")

The length of embedding: 3


In [8]:
### Join a sequence of arrays along a new axis:
### the list of per-word vectors becomes one 2-D array with one row per word.
np.stack(embeddings).shape

(3, 100)

# Function make_embedding_matrix(glove_filepath, words)

In [9]:
### Obtain word_to_idx (word -> row number) and glove_embeddings (one row per word) from the whole file
word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)

In [10]:
### Vocabulary size = number of keys in word_to_idx; embedding dimension = number of columns of glove_embeddings
print(f"The number of words in glove_embeddings: {len(word_to_idx)}")
print(f"The dimention of the embeddings: {glove_embeddings.shape[1]}")

The number of words in glove_embeddings: 400001
The dimention of the embeddings: 100


In [11]:
### Number of columns of glove_embeddings, i.e. the length of every word vector; reused by the cells below
embedding_size = glove_embeddings.shape[1]

### Look up the embedding (in glove_embeddings) of a given word

In [12]:
### word_to_idx gives the row number of the word; that row of glove_embeddings is its vector
word = 'the'
print(f"The embedding of the word {word}")
glove_embeddings[word_to_idx[word]]

The embedding of the word the


array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

### If a word is not in GloVe:

In [13]:
### If a word is not in GloVe, the word_to_idx lookup raises a KeyError
word = 'dfewqw'
try:
    glove_embeddings[word_to_idx[word]]
except KeyError as e:
    ### Catch only the expected KeyError so that any other failure is not hidden
    print(e)

'dfewqw'


### One common method for handling this is to use an initialization method from the PyTorch library, such as the Xavier Uniform method: [torch.nn.init.xavier_uniform_](https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_)

In [14]:
### (number of words, embedding dimension)
glove_embeddings.shape

(400001, 100)

In [15]:
### Allocate a (1, embedding_size) tensor and overwrite it in place with Xavier-uniform samples;
### the initial ones are only a placeholder.
embedding_i = torch.ones(1, embedding_size)
torch.nn.init.xavier_uniform_(embedding_i)

tensor([[ 0.1863,  0.2023, -0.0571,  0.2239, -0.0534,  0.0492, -0.1187,  0.1431,
          0.2149, -0.1788,  0.2119,  0.0456,  0.1801,  0.0330,  0.1175, -0.0344,
          0.1879,  0.0360, -0.1138,  0.0621, -0.1123, -0.0286, -0.0990,  0.1617,
         -0.1924, -0.1124, -0.0688, -0.1466,  0.0230, -0.2407,  0.2201, -0.2070,
          0.1882,  0.0406, -0.0791,  0.1506,  0.0380,  0.1969,  0.0266, -0.0769,
          0.0655, -0.0661,  0.1026,  0.2176,  0.1409, -0.1066,  0.1407,  0.0436,
          0.1238, -0.1486, -0.2413, -0.0942, -0.1869,  0.2000,  0.0702,  0.1010,
          0.0771, -0.0042,  0.1907, -0.1732,  0.0153, -0.1664,  0.0752, -0.0839,
          0.0747, -0.0508,  0.2022, -0.1445, -0.1454, -0.1454,  0.2192,  0.0812,
          0.2345, -0.2011, -0.2418, -0.1907, -0.1640,  0.0987,  0.0873,  0.2025,
         -0.1259, -0.1662,  0.1293, -0.0985,  0.1479, -0.0578,  0.1394, -0.1894,
         -0.1230,  0.0743,  0.0515, -0.0621,  0.1453,  0.1657, -0.1767, -0.1301,
          0.2232, -0.0822, -

In [16]:
words = ['this', 'is', 'a', 'beautiful', 'day']

### One row per word and one column per embedding dimension: shape = [5, 100]
final_embeddings = np.zeros((len(words), embedding_size))

for i, word in enumerate(words):
    ### Word missing from word_to_idx: draw its vector from a Xavier uniform distribution
    if word not in word_to_idx:
        embedding_i = torch.ones(1, embedding_size)
        torch.nn.init.xavier_uniform_(embedding_i)
        final_embeddings[i, :] = embedding_i
        continue
    ### Word known to GloVe: copy its row from glove_embeddings
    final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]

In [17]:
### Sanity check: the row stored in final_embeddings for words[i] is printed next to
### the row of glove_embeddings for the same word, so the two can be compared.
i = 1
print(f"The word indeced {i} in words: {words[i]}")
print(f"The embedding of {words[i]} in final_embeddings")
print(final_embeddings[i])
print("-"*100)
print("")
print(f"Matching the vector for {words[i]} in glove_embeddings")
print(glove_embeddings[word_to_idx[words[i]]])

The word indeced 1 in words: is
The embedding of is in final_embeddings
[-0.54264    0.41476    1.0322    -0.40244    0.46691    0.21816
 -0.074864   0.47332    0.080996  -0.22079   -0.12808   -0.1144
  0.50891    0.11568    0.028211  -0.3628     0.43823    0.047511
  0.20282    0.49857   -0.10068    0.13269    0.16972    0.11653
  0.31355    0.25713    0.092783  -0.56826   -0.52975   -0.051456
 -0.67326    0.92533    0.2693     0.22734    0.66365    0.26221
  0.19719    0.2609     0.18774   -0.3454    -0.42635    0.13975
  0.56338   -0.56907    0.12398   -0.12894    0.72484   -0.26105
 -0.26314   -0.43605    0.078908  -0.84146    0.51595    1.3997
 -0.7646    -3.1453    -0.29202   -0.31247    1.5129     0.52435
  0.21456    0.42452   -0.088411  -0.17805    1.1876     0.10579
  0.76571    0.21914    0.35824   -0.11636    0.093261  -0.62483
 -0.21898    0.21796    0.74056   -0.43735    0.14343    0.14719
 -1.1605    -0.050508   0.12677   -0.014395  -0.98676   -0.091297
 -1.2054    -0.11

# Create input and the embeddings

In [18]:
def get_glove_embedding(load_file=True, save_file=False, vocabulary=None,
                        glove_filepath="glove.6B.100d.txt",
                        embeddings_filepath='embeddings_glove100.npy'):
    """
    Return the embedding matrix, either from a saved ``.npy`` file or built from GloVe.

    Args:
        load_file (bool): if True, load the matrix from `embeddings_filepath`;
            the GloVe file and the vocabulary are not touched in that case
        save_file (bool): if True (and `load_file` is False), save the newly
            built matrix to `embeddings_filepath`
        vocabulary (list): words to build the matrix for; when None, the
            notebook-level `words` list is used (the original behaviour)
        glove_filepath (str): path to the glove embeddings file
        embeddings_filepath (str): path of the saved ``.npy`` matrix
    Returns:
        embeddings (numpy.ndarray)
    """
    if load_file:
        return np.load(embeddings_filepath)

    if vocabulary is None:
        # Backward compatible default: fall back to the notebook-global list.
        vocabulary = words
    embeddings = make_embedding_matrix(glove_filepath=glove_filepath,
                                       words=vocabulary)
    if save_file:
        np.save(embeddings_filepath, embeddings)
    return embeddings

In [19]:
### Default arguments (load_file=True): the matrix is read from 'embeddings_glove100.npy'
embeddings_glove100 = get_glove_embedding()
embeddings_glove100.shape

(3566, 100)

In [20]:
### Re-seed so that the random token ids below are reproducible
seed_everything(42)
batch_size     = 2
length_of_text = 7
### A batch of token ids: integers drawn from [0, 20), shape = [batch_size, length_of_text]
my_input = torch.randint(0,20,[batch_size, length_of_text])
my_input

seed: 42


tensor([[ 2,  7, 16, 14,  6, 15,  0],
        [ 4, 10, 13, 18, 14, 10, 14]])

# Define the nn.Embedding Layer

In [21]:
# the number of embeddings (the number of vocabulary items)
n_tokens_in_vocabulary = embeddings_glove100.shape[0]
# size of the embeddings (embedding dimension)
dimension_embedding    = embeddings_glove100.shape[1]
# Index of the padding token. With padding_idx set, a randomly initialized nn.Embedding
# starts with an all-zero vector at that index (see the printed weights below), and
# that vector does not receive gradient updates.
# NOTE(review): the cells below pass the literal 0 rather than this variable.
padding_idx            = 0

### Use randomly initialized embeddings (no pre-trained embeddings)

In [22]:
### Without pre-trained weights, num_embeddings and embedding_dim can be chosen freely;
### here they are set to the shape of embeddings_glove100 so the layers can be compared.
emb_no_pretrained_1 = nn.Embedding(num_embeddings = n_tokens_in_vocabulary,
                                 embedding_dim  = dimension_embedding,
                                 padding_idx    = 0)   

In [23]:
### Inspect the weights: row 0 (padding_idx) is all zeros, the remaining rows are randomly initialized
print("No pretrained, parameters are matched to those in embeddings_glove100")
print(emb_no_pretrained_1.weight)
print("-"*60)
print("Shape:")
print(emb_no_pretrained_1.weight.shape)

No pretrained, parameters are matched to those in embeddings_glove100
Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5992,  0.4771,  0.7262,  ..., -0.5923,  0.1543,  0.4408],
        [ 0.3125, -0.0335, -0.3980,  ..., -0.7712,  0.1799, -2.1268],
        ...,
        [-0.6912,  0.2276,  0.1961,  ...,  2.0628, -0.5818,  0.3142],
        [ 1.0843,  1.1831,  0.9872,  ...,  1.5555,  0.1373, -0.8254],
        [ 0.5877,  0.7533, -0.9052,  ..., -0.2541, -1.3026,  0.3046]],
       requires_grad=True)
------------------------------------------------------------
Shape:
torch.Size([3566, 100])


In [24]:
### A smaller layer with sizes unrelated to embeddings_glove100: 20 embeddings of dimension 5
emb_no_pretrained_2 = nn.Embedding(num_embeddings = 20,
                                   embedding_dim  = 5,
                                   padding_idx    = 0)   

In [25]:
### Inspect the weights of the small layer: again row 0 (padding_idx) is all zeros
print("No pretrained, parameters are NOT matched to those in embeddings_glove100")
print(emb_no_pretrained_2.weight)
print("-"*60)
print("Shape:")
print(emb_no_pretrained_2.weight.shape)

No pretrained, parameters are NOT matched to those in embeddings_glove100
Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.1411,  0.6992,  0.1621,  2.1184, -0.6479],
        [ 0.2340, -0.5506, -0.0992,  0.6625,  0.0203],
        [-1.9562, -0.6154,  1.5287, -0.7422,  0.1131],
        [-0.0563, -0.3043, -0.8458,  0.2183, -2.1090],
        [ 0.4854,  1.3382, -0.0947,  0.9836, -0.3024],
        [ 0.9934, -0.6586,  2.0798, -0.5245, -0.5633],
        [-0.1805,  2.8780, -1.1575, -0.2114,  0.0456],
        [ 0.3959,  0.7365,  0.9696,  0.1698,  0.8996],
        [ 1.8139,  1.0857, -0.7120, -0.4977, -1.3081],
        [-1.7504,  1.6461, -0.4334,  0.7102, -1.3929],
        [ 0.8666,  0.9977,  0.4087, -0.8471, -0.8180],
        [-0.3010, -1.1261, -1.3120,  0.5677,  0.0400],
        [-0.5751, -1.2112, -2.3304,  0.5915, -0.4762],
        [ 1.0297, -0.7221, -0.2239,  0.6002,  0.1559],
        [ 0.2766, -0.5412, -0.2827, -1.1301,  0.0931],
        [ 1.1534,  0.772

### Use pre-trained  GloVe100 embeddings
### One can pass the `_weight` argument to the nn.Embedding constructor to set a pretrained word embedding matrix (such as GloVe) as the initial weights.

In [26]:
### When _weight is given, num_embeddings and embedding_dim need to match its shape.
### Otherwise, it will report an error (here 5 and 20 do not match embeddings_glove100).
try:
    nn.Embedding(embedding_dim  = 20,
                 num_embeddings = 5,
                 padding_idx    = 0,
                 _weight        = embeddings_glove100)
except Exception as e:
    print(e)

Shape of weight does not match num_embeddings and embedding_dim


In [27]:
### The value passed as _weight needs to be a torch tensor.
### Otherwise (here: a numpy array, even with matching sizes), it will report an error
try:
    nn.Embedding(embedding_dim  = dimension_embedding,
                 num_embeddings = n_tokens_in_vocabulary,
                 padding_idx    = 0,
                 _weight        = embeddings_glove100)
except Exception as e:
    print(e)

'numpy.ndarray' object has no attribute 'detach'


In [28]:
### num_embeddings and embedding_dim need to match the shape of embeddings_glove100,
### and the numpy array is converted to a float tensor before it is passed as _weight.
### NOTE(review): nn.Embedding.from_pretrained(..., freeze=False, padding_idx=0) looks like
### the public alternative to the underscore-prefixed _weight argument - confirm before switching.
emb_pretrained = nn.Embedding(embedding_dim  = dimension_embedding,
                              num_embeddings = n_tokens_in_vocabulary,
                              padding_idx    = 0,
                              _weight        = torch.from_numpy(embeddings_glove100).float())

In [29]:
### Inspect the weights: unlike the randomly initialized layers, row 0 is not all zeros here
### even though padding_idx=0 was passed.
print("Pretrained, parameters are matched to those in embeddings_glove100")
print(emb_pretrained.weight)
print("-"*60)
print("Shape:")
print(emb_pretrained.weight.shape)

Pretrained, parameters are matched to those in embeddings_glove100
Parameter containing:
tensor([[-0.0382, -0.0121, -0.0462,  ..., -0.0651, -0.0608,  0.1727],
        [ 0.2213,  0.0832, -0.0284,  ..., -0.2280,  0.1983, -0.0641],
        [-0.0497,  0.0369,  0.2134,  ..., -0.0461,  0.0416, -0.0685],
        ...,
        [-0.1232,  0.1483, -0.1853,  ...,  0.0279, -0.0490,  0.0812],
        [ 0.1091,  0.1739,  0.1431,  ...,  0.0281, -0.2263, -0.0592],
        [-0.4105,  0.0906,  0.8362,  ...,  0.1629,  0.9483,  0.2537]],
       requires_grad=True)
------------------------------------------------------------
Shape:
torch.Size([3566, 100])
