# Word Embeddings: Encoding Linguistic (Lexical) Meaning (Semantics)
### Contents
Word Embeddings<br>
Understanding the Samples<br>
Exploring the Training Set<br>
From Images to Numpy Arrays<br>
From Numpy Arrays to Tensors<br>


In [1]:
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# If you want to get the same result each time you create an embedding then set
# the manual seed to the same value every time.
torch.manual_seed(42)

word_to_token = {'hello': 0, 'world': 1}
embeddings = nn.Embedding(2, 5)
print(embeddings)

lookup = torch.tensor(word_to_token['world'], dtype=torch.int64)
world_embeddings = embeddings(lookup)
print(world_embeddings)

Embedding(2, 5)
tensor([-0.1863,  2.2082, -0.6380,  0.4617,  0.2674],
       grad_fn=<EmbeddingBackward0>)


### N-Gram Language Modeling

In [3]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
EPOCHS = 1000
LR = 0.001

In [4]:
def clean_text(raw_text: str) -> list:
    remove_breaks = raw_text.replace('<br />', ' ')
    lower = remove_breaks.lower()
    valid_characters = [c for c in lower if c not in punctuation]
    cleaned = ''.join(valid_characters)
    return cleaned


def create_n_grams(word_list: list) -> list:
    # we should tokenize the input, but we will ignore that for now
    # build a list of tuples.
    # Each tuple is ([ word_i-CONTEXT_SIZE, ..., word_i-1 ], target word)
    n_grams = [
        (
            [word_list[i - j - 1] for j in range(CONTEXT_SIZE)],
            word_list[i]
        )
        for i in range(CONTEXT_SIZE, len(word_list))
    ]
    return n_grams


In [5]:
# We will use Shakespeare Sonnet 2
training_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

training_sentence = clean_text(training_sentence)
training_words = training_sentence.split()
n_grams = create_n_grams(training_words)

# Turning our list of words into a set has the effect of eliminating duplicates.
# This is a useful technique for getting a distinct list of words.
vocab = set(training_words)

# Now we can enumerate over the set and create a word to index mapping.
words_to_tokens = {word: i for i, word in enumerate(vocab)}

def get_word_from_token(token):
    word = [w for w in words_to_tokens if words_to_tokens[w] == token]
    return word[0]


In [6]:
# Print the first 3, just so you can see what they look like.
print('Clean sentence:', training_sentence, '\n')
print('n_grams:', n_grams[:3], '\n')
print('Vocabulary:', vocab, '\n')
print('Tokens:', words_to_tokens)

Clean sentence: when forty winters shall besiege thy brow
and dig deep trenches in thy beautys field
thy youths proud livery so gazed on now
will be a totterd weed of small worth held
then being asked where all thy beauty lies
where all the treasure of thy lusty days
to say within thine own deep sunken eyes
were an alleating shame and thriftless praise
how much more praise deservd thy beautys use
if thou couldst answer this fair child of mine
shall sum my count and make my old excuse
proving his beauty by succession thine
this were to be new made when thou art old
and see thy blood warm when thou feelst it cold 

n_grams: [(['forty', 'when'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')] 

Vocabulary: {'child', 'made', 'thou', 'old', 'when', 'by', 'shall', 'so', 'lies', 'asked', 'it', 'then', 'brow', 'beautys', 'proud', 'use', 'sum', 'answer', 'on', 'held', 'new', 'proving', 'thy', 'being', 'field', 'say', 'beauty', 'how', 'much', 'couldst', 'thriftles

In [7]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()

        torch.manual_seed(42)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [8]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in n_grams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([words_to_tokens[w] for w in context], dtype=torch.int64)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([words_to_tokens[target]], dtype=torch.int64))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()

    losses.append(total_loss)

# The loss should decrease with every iteration (epoch) over the training data.
# When you have a large number of iterations over a small training set you are basically
# memorizing your training set.
# Print the first, last and every 100 in between.
for epoch in range(0, EPOCHS, 100):
    print(epoch, ':', losses[epoch])
print(EPOCHS-1, ':', losses[EPOCHS-1])

0 : 502.12924790382385
100 : 264.70236775279045
200 : 75.19377852231264
300 : 27.632991425693035
400 : 16.336002435535192
500 : 12.04359271377325
600 : 9.90938277123496
700 : 8.667444862425327
800 : 7.868045814335346
900 : 7.315754613606259
999 : 6.9177902459632605


In [9]:
# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[words_to_tokens['beauty']])

tensor([-1.5965,  0.5401, -0.1546,  0.8099, -1.8845, -0.1641,  0.7404,  0.0434,
         1.4659,  0.1944], grad_fn=<SelectBackward0>)


In [10]:
# The tag for this context is 'winters'.
# Make sure all text here is lower case since the original
# text was converted to lower case.
context = ['forty', 'when'] 

context_indexes = torch.tensor([words_to_tokens[w] for w in context], dtype=torch.int64)
log_probs = model(context_idxs)

In [11]:

print(log_probs)
print(log_probs.shape)

# The tensor of log probabilities is a two dimensional tensor (matrix)
# because the model is expecting a batch of contextes.
max_prob_index = torch.argmax(log_probs, dim=1)
print(max_prob_index)

print(log_probs[0, max_prob_index])
predicted_word = get_word_from_token(max_prob_index)
print('Predicted word:', predicted_word)

top_predictions = torch.topk(log_probs, 5, dim=1, largest=True, sorted=True)
print(top_predictions.indices[0])

i = 0
for index in top_predictions.indices[0]:
    i += 1
    predicted_word = get_word_from_token(index)
    print(i, predicted_word)

tensor([[ -9.8200,  -6.7775, -10.3853, -10.1752, -10.5929, -12.4177,  -9.9557,
          -9.9247, -13.1367,  -7.9381, -12.0332,  -7.0013,  -9.7796,  -6.3950,
         -12.7589, -12.0230,  -8.0474, -11.0113, -10.7316,  -8.6280, -10.3440,
         -11.2762,  -7.6896,  -9.5560, -11.6197,  -7.9628,  -8.5421, -10.6977,
         -12.1094,  -8.4204,  -8.4596, -10.6391, -11.2010,  -7.0185,  -9.0738,
          -8.9613,  -7.1692,  -8.7874, -11.0737, -10.4557, -12.5097, -11.1001,
         -10.6614, -10.6350, -10.4490, -12.7536, -11.6180,  -6.9300, -11.5422,
          -8.5230,  -7.3443, -12.0330,  -9.1452,  -8.7909, -11.1094, -10.8565,
          -9.2764,  -9.1708,  -7.5166,  -8.5492,  -8.6800, -10.8657,  -6.8645,
         -12.3441,  -7.2509, -10.0973,  -8.1541, -12.3961, -11.4086,  -5.5982,
         -11.4124, -10.7880,  -6.9215,  -8.1470,  -7.9986,  -8.7243,  -0.0227,
         -11.5981,  -8.5974,  -7.1635,  -7.1275,  -9.6583,  -8.7424, -12.2647,
          -7.7008,  -7.9010]], grad_fn=<LogSoftmaxBa

In [12]:
print(model.embeddings.weight[words_to_tokens['cold']])
print(model.embeddings.weight[words_to_tokens['winters']])

tensor([ 0.8244,  0.7983,  1.8890,  0.5935,  0.0697, -1.6034, -0.4298,  0.5762,
         0.3444, -3.1016], grad_fn=<SelectBackward0>)
tensor([-0.0782, -1.1051,  0.2376, -0.8343,  1.4269, -0.3624, -1.0107,  2.0188,
        -1.7139,  2.2302], grad_fn=<SelectBackward0>)


tensor([ 0.5634, -0.5786, -1.0838, -0.3889,  0.8126,  1.4981,  0.0439,  1.4443,
         0.2320,  0.5065], grad_fn=<SelectBackward0>)
         <br>
tensor([ 0.3349, -0.4794,  1.1488,  1.8238, -0.0490,  1.8349,  0.1909, -1.1668,
        -0.5927,  0.0974], grad_fn=<SelectBackward0>)