# Word Embeddings: Encoding Linguistic (Lexical) Meaning (Semantics)
### Contents
Word Embeddings<br>
Understanding the Samples<br>
Exploring the Training Set<br>
From Images to Numpy Arrays<br>
From Numpy Arrays to Tensors<br>


In [None]:
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# If you want to get the same result each time you create an embedding then set
# the manual seed to the same value every time.
torch.manual_seed(42)

# Map each word to an integer token. The token is used below as the index
# into the embedding table.
word_to_token = {'hello': 0, 'world': 1}
# An embedding table with 2 entries (one per word), each a vector of size 5.
embeddings = nn.Embedding(2, 5)
print(embeddings)

# Look up the vector for 'world' by passing its token as an int64 tensor.
lookup = torch.tensor(word_to_token['world'], dtype=torch.int64)
world_embeddings = embeddings(lookup)
print(world_embeddings)

### N-Gram Language Modeling

In [None]:
# Number of preceding words used as the context for predicting the next word.
CONTEXT_SIZE = 2
# Size of the embedding vector learned for each word.
EMBEDDING_DIM = 10
# Number of passes over the full list of training n-grams.
EPOCHS = 1000
# Learning rate handed to the SGD optimizer.
LR = 0.001

In [None]:
def clean_text(raw_text: str) -> str:
    """Normalize raw text so it can be split into words.

    HTML line breaks (``<br />``) are replaced with a space, the text is
    lower-cased, and every character in ``string.punctuation`` is removed.
    Punctuation is deleted rather than replaced, so ``"beauty's"`` becomes
    ``"beautys"`` and ``"all-eating"`` becomes ``"alleating"``.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text as a single string (whitespace is left untouched).
    """
    without_breaks = raw_text.replace('<br />', ' ')
    # A translation table that maps every punctuation character to None
    # deletes them all in one pass over the string.
    strip_punctuation = str.maketrans('', '', punctuation)
    return without_breaks.lower().translate(strip_punctuation)


def create_n_grams(word_list: list, context_size: int) -> list:
    """Pair every word with the words that immediately precede it.

    The words are used as-is; no tokenization happens here.

    Args:
        word_list: The words of the text, in reading order.
        context_size: How many preceding words make up each context.

    Returns:
        A list of ``(context, target)`` tuples, one for every position ``i``
        from ``context_size`` to the end of ``word_list``. ``target`` is
        ``word_list[i]`` and ``context`` lists the preceding words
        nearest-first: ``[word_i-1, word_i-2, ..., word_i-context_size]``.
    """
    pairs = []
    for target_pos in range(context_size, len(word_list)):
        preceding = []
        # Walk backwards from the word just before the target.
        for offset in range(1, context_size + 1):
            preceding.append(word_list[target_pos - offset])
        pairs.append((preceding, word_list[target_pos]))
    return pairs


def tokenize_n_grams(n_grams: list, words_to_tokens: dict) -> list:
    """Replace every word in the n-grams with its integer token.

    Args:
        n_grams: ``(context, target)`` tuples where ``context`` is a list of
            words and ``target`` is a single word.
        words_to_tokens: Mapping from word to token.

    Returns:
        A list of ``(context_tokens, target_token)`` tuples in the same order
        as ``n_grams``.

    Raises:
        KeyError: If a word is missing from ``words_to_tokens``.
    """
    tokenized = []
    for context, target in n_grams:
        context_tokens = []
        for word in context:
            context_tokens.append(words_to_tokens[word])
        tokenized.append((context_tokens, words_to_tokens[target]))
    return tokenized


def get_word_from_token(token: int, words_to_tokens: dict) -> str:
    """Reverse lookup: find the word that maps to ``token``.

    Args:
        token: The token to look for.
        words_to_tokens: Mapping from word to token.

    Returns:
        The first word, in the mapping's iteration order, whose token equals
        ``token``.

    Raises:
        IndexError: If no word maps to ``token``.
    """
    for candidate, candidate_token in words_to_tokens.items():
        if candidate_token == token:
            return candidate
    raise IndexError('list index out of range')


In [None]:
# We will use Shakespeare Sonnet 2
training_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

# Lower-case the text and strip punctuation, then split on whitespace to get
# the list of words in reading order.
training_sentence = clean_text(training_sentence)
training_words = training_sentence.split()

# Turning our list of words into a set has the effect of eliminating duplicates.
# This is a useful technique for getting a distinct collection of words.
vocab = set(training_words)

# Now we can create a word to token mapping. A set of strings has no stable
# iteration order (string hashing is randomized per interpreter run), so we
# enumerate the words in sorted order. Each word's position in the sorted
# vocabulary becomes its token, which keeps the mapping identical on every run.
words_to_tokens = {word: i for i, word in enumerate(sorted(vocab))}

# Create n_grams (context and target) using the context size.
n_grams = create_n_grams(training_words, CONTEXT_SIZE)

# Tokenize the context and targets in the n_grams.
n_grams_tokenized = tokenize_n_grams(n_grams, words_to_tokens)


In [None]:
# Print the cleaned text, the first 3 n_grams, the vocabulary and the
# word-to-token mapping, just so you can see what they look like.
print('Clean sentence:', training_sentence, '\n')
print('n_grams:', n_grams[:3], '\n')
print('Vocabulary:', vocab, '\n')
print('Tokens:', words_to_tokens)

In [None]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model built on learned word embeddings.

    The tokens of a context are embedded, the embeddings are concatenated
    into one vector, and two linear layers (with a ReLU in between) turn that
    vector into log probabilities over the vocabulary for the next word.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of distinct tokens; also the output size.
            embedding_dim: Size of each word's embedding vector.
            context_size: Number of tokens in each context.
            hidden_dim: Width of the hidden layer (defaults to 128).
        """
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log probabilities of the next word for each context.

        Args:
            inputs: Integer token tensor holding either a single context of
                shape ``(context_size,)`` or a batch of contexts of shape
                ``(batch, context_size)``.

        Returns:
            A ``(batch, vocab_size)`` tensor of log probabilities; ``batch``
            is 1 when a single context is passed.
        """
        # A single un-batched context is treated as a batch of one; otherwise
        # the leading dimension of the input is the batch size.
        batch_size = inputs.shape[0] if inputs.dim() > 1 else 1
        # Concatenate the embeddings of each context into one row per context.
        embeds = self.embeddings(inputs).view((batch_size, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [None]:
# Seed before building the model so its randomly initialized weights, and
# therefore the training run, start from the same values every time.
torch.manual_seed(42)
# Total loss of each epoch, in order.
losses = []
# Negative log likelihood loss; the model's forward pass returns log
# probabilities (log_softmax), which is the input this loss works on.
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in n_grams_tokenized:

        # Step 1. Prepare the inputs to be passed to the model (the context
        # words are already integer tokens, so just wrap them in a tensor)
        context_idxs = torch.tensor(context, dtype=torch.int64)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        # The target is wrapped in a one-element list so that the target
        # tensor has one entry per row of log_probs, which has a single row
        # for this single context.
        loss = loss_function(log_probs, torch.tensor([target], dtype=torch.int64))

        # Step 5. Do the backward pass to compute the gradients, then let the
        # optimizer update the model parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()

    losses.append(total_loss)

# The loss should generally decrease from one iteration (epoch) over the
# training data to the next.
# When you have a large number of iterations over a small training set you are basically
# memorizing your training set.
# Print the first, last and every 100 in between.
for epoch in range(0, EPOCHS, 100):
    print(epoch, ':', losses[epoch])
print(EPOCHS-1, ':', losses[EPOCHS-1])

In [None]:
# To get the embedding of a particular word, e.g. "beauty", look up the word's
# token and use it to index the model's embedding weights.
print(model.embeddings.weight[words_to_tokens['beauty']])

In [None]:
# The target (next word) for this context is 'old', from the line
# "... when thou art old".
# The context words are listed most-recent-first ('art' before 'thou')
# because create_n_grams builds the training contexts in that order.
# Make sure all text here is lower case since the original
# text was converted to lower case.
context = ['art', 'thou'] 

# Convert the context words to tokens and run them through the trained model.
context_indecies = torch.tensor([words_to_tokens[w] for w in context], dtype=torch.int64)
log_probs = model(context_indecies)

In [None]:
print(log_probs)
print(log_probs.shape)

# The tensor of log probabilities is a two dimensional tensor (matrix):
# the model returns one row of log probabilities per context, and here
# there is a single context, so there is a single row.
max_prob_index = torch.argmax(log_probs, dim=1)
print(max_prob_index)

# The highest log probability and the word it belongs to.
print(log_probs[0, max_prob_index])
predicted_word = get_word_from_token(max_prob_index, words_to_tokens)
print('Predicted word:', predicted_word)

# The five most likely next words, most likely first.
top_predictions = torch.topk(log_probs, 5, dim=1, largest=True, sorted=True)
print(top_predictions.indices[0])

# Print each of the top predictions with its rank, starting at 1.
for i, index in enumerate(top_predictions.indices[0], start=1):
    predicted_word = get_word_from_token(index, words_to_tokens)
    print(i, predicted_word)

In [None]:
# Print the learned embedding vectors of two words from the sonnet.
print(model.embeddings.weight[words_to_tokens['cold']])
print(model.embeddings.weight[words_to_tokens['winters']])

tensor([ 0.8539,  0.5130,  0.5397,  0.5655,  0.5058,  0.2225, -0.6855,  0.5636,
        -1.5072, -1.6107], grad_fn=<SelectBackward0>)
<br/>
tensor([-0.2279,  0.8686, -1.4612, -0.9889, -0.2377,  1.8803,  0.3661, -0.4606,
         0.3843, -0.9012], grad_fn=<SelectBackward0>)