# Linguistic Meaning and Embeddings
### Contents
Creating the Vocabulary<br>
Creating the N-Grams<br>
Tokenizing the N-Grams<br>
From Tokens to Embeddings<br>
Creating the Model<br>
Training for Meaning<br>
Testing the Model<br>


In [1]:
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Creating the Vocabulary

In [2]:
def clean_text(raw_text: str) -> str:
    """Normalize raw text so it can be split into words.

    HTML line breaks (``<br />``) are replaced by a space, the text is
    lower-cased, and every character in ``string.punctuation`` is removed.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text as a single string (not a list). Call ``.split()``
        on the result to obtain the individual words.
    """
    without_breaks = raw_text.replace('<br />', ' ')
    lowered = without_breaks.lower()
    # Delete all punctuation characters in a single pass.
    return lowered.translate(str.maketrans('', '', punctuation))

In [3]:
# A tiny training corpus: one sentence per line.
training_paragraph = """Football players are fast runners.
Baseball players are fast runners.
Soccer players are fast runners.
A football player is an athlete.
A baseball player is an athlete.
"""

# Lower-case the text and strip punctuation, then split on whitespace to get
# the ordered list of words (sentence boundaries are not preserved).
training_paragraph = clean_text(training_paragraph)
training_words = training_paragraph.split()

# Turning our list of words into a set has the effect of eliminating duplicates.
# This is a useful technique for getting a distinct list of words.
# NOTE: a set has no guaranteed iteration order, so anything derived from
# iterating over `vocab` may differ between interpreter sessions.
vocab = set(training_words)

In [4]:
# Show the cleaned text, the word list, and the vocabulary built from it.
# The trailing '\n' argument leaves a blank line after each item.
print('Clean sentence:', training_paragraph, '\n')
print('Word List:', training_words, '\n')
print('Vocabulary:', vocab, '\n')

Clean sentence: football players are fast runners
baseball players are fast runners
soccer players are fast runners
a football player is an athlete
a baseball player is an athlete
 

Word List: ['football', 'players', 'are', 'fast', 'runners', 'baseball', 'players', 'are', 'fast', 'runners', 'soccer', 'players', 'are', 'fast', 'runners', 'a', 'football', 'player', 'is', 'an', 'athlete', 'a', 'baseball', 'player', 'is', 'an', 'athlete'] 

Vocabulary: {'a', 'soccer', 'baseball', 'fast', 'player', 'athlete', 'an', 'are', 'runners', 'is', 'football', 'players'} 



### Creating the N-Grams

In [5]:
def create_n_grams(word_list: list, context_size: int) -> list:
    """Build (context, target) training pairs from a sequence of words.

    Every word that has at least ``context_size`` predecessors becomes a
    target; its context is the list of those predecessors in their original
    order.

    Args:
        word_list: The words, in reading order.
        context_size: How many preceding words make up each context.

    Returns:
        A list of ``(context_words, target_word)`` tuples, where
        ``context_words`` is a list of ``context_size`` words.
    """
    pairs = []
    for target_pos in range(context_size, len(word_list)):
        # The context is the window of words that ends just before the target.
        window = [
            word_list[pos]
            for pos in range(target_pos - context_size, target_pos)
        ]
        pairs.append((window, word_list[target_pos]))
    return pairs


def get_word_from_token(token: int, words_to_tokens: dict) -> str:
    """Return the word that ``words_to_tokens`` maps to ``token``.

    Args:
        token: The token to look up.
        words_to_tokens: Mapping from word to token.

    Returns:
        The first word, in the mapping's iteration order, whose token
        equals ``token``.

    Raises:
        IndexError: If no word in the mapping has the given token.
    """
    # Return on the first match instead of collecting every match first.
    for word, word_token in words_to_tokens.items():
        if word_token == token:
            return word
    raise IndexError(f'token {token!r} is not in the word-to-token mapping')


In [6]:
# Number of preceding words used as the context for each target word.
CONTEXT_SIZE = 3

# Create the n_grams and target using the context size.
# Each entry is a (context_words, target_word) tuple.
n_grams = create_n_grams(training_words, CONTEXT_SIZE)

In [7]:
# Print the first 3 n-grams to get a feel for their shape:
# each one is a (list of context words, target word) tuple.
print('n_grams:', n_grams[:3], '\n')

n_grams: [(['football', 'players', 'are'], 'fast'), (['players', 'are', 'fast'], 'runners'), (['are', 'fast', 'runners'], 'baseball')] 



### Tokenizing the N-Grams

In [8]:
# Enumerate over the vocabulary and create a word to token mapping.
# The position of each word while iterating over the set becomes the token
# for that word.
# NOTE: a set has no guaranteed iteration order, so the token assigned to a
# given word may differ between interpreter sessions.
words_to_tokens = {word: i for i, word in enumerate(vocab)}
print('Tokens:', words_to_tokens)

Tokens: {'a': 0, 'soccer': 1, 'baseball': 2, 'fast': 3, 'player': 4, 'athlete': 5, 'an': 6, 'are': 7, 'runners': 8, 'is': 9, 'football': 10, 'players': 11}


In [9]:
def tokenize_n_grams(n_grams: list, words_to_tokens: dict) -> list:
    """Replace every word in a list of n-grams with its token.

    Args:
        n_grams: ``(context_words, target_word)`` pairs.
        words_to_tokens: Mapping from word to token.

    Returns:
        A list of ``(context_tokens, target_token)`` tuples in the same
        order as ``n_grams``; ``context_tokens`` is a list.

    Raises:
        KeyError: If a word is missing from ``words_to_tokens``.
    """
    tokenized = []
    for context_words, target_word in n_grams:
        context_tokens = [words_to_tokens[word] for word in context_words]
        tokenized.append((context_tokens, words_to_tokens[target_word]))
    return tokenized

In [10]:
# Tokenize the context and targets in the n_grams.
# The result has the same structure as n_grams, with tokens in place of words.
n_grams_tokenized = tokenize_n_grams(n_grams, words_to_tokens)
print('n_grams_tokenized:', n_grams_tokenized[:3])

n_grams_tokenized: [([10, 11, 7], 3), ([11, 7, 3], 8), ([7, 3, 8], 2)]


### From Tokens to Embeddings

In [11]:
# If you want to get the same result each time you create an embedding then set
# the manual seed to the same value every time.
torch.manual_seed(42)

# An embedding table with one 10-dimensional vector per vocabulary word.
embeddings = nn.Embedding(len(vocab), 10)
print('Embeddings object:', embeddings)

# Look up the vector for 'athlete' by passing its token as an int64 tensor.
# No training has happened yet, so these are the initial values.
lookup = torch.tensor(words_to_tokens['athlete'], dtype=torch.int64)
sample_embedding = embeddings(lookup)
print('Embedding for the word athlete:', sample_embedding)

Embeddings object: Embedding(12, 10)
Embedding for the word athlete: tensor([ 0.0780,  0.5258, -0.4880,  1.1914, -0.8140, -0.7360, -1.4032,  0.0360,
        -0.0635,  0.6756], grad_fn=<EmbeddingBackward0>)


### Creating the Model

In [12]:
class NextWordModel(nn.Module):
    """Feed-forward model that predicts the next word from a fixed context.

    The context tokens are embedded, the embeddings are concatenated into
    one vector, and that vector is passed through
    Linear -> ReLU -> Linear -> LogSoftmax to give a log-probability for
    every word in the vocabulary.

    Args:
        vocab_size: Number of distinct tokens (size of the output layer).
        embedding_dim: Length of each token's embedding vector.
        hidden_dim: Width of the hidden layer.
        context_size: Number of context tokens per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, context_size):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The hidden layer sees all context embeddings concatenated together.
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()  # Rectified Linear Unit Function
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, log=False):
        """Compute log-probabilities of the next word.

        Args:
            inputs: Integer tensor of context tokens. Either a single
                context of shape ``(context_size,)`` or a batch of contexts
                of shape ``(batch, context_size)``.
            log: When true, print every intermediate tensor and its shape.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-probabilities;
            ``batch`` is 1 when a single context is given.
        """
        embeds = self.embeddings(inputs)

        # A single context is treated as a batch of one; otherwise the first
        # dimension of the input is the batch dimension.
        batch_size = inputs.shape[0] if inputs.dim() > 1 else 1

        # Concatenate the context embeddings of each example into one row.
        l1 = self.linear1(embeds.view(batch_size, -1))
        rect = self.relu(l1)
        l2 = self.linear2(rect)
        log_probs = self.log_softmax(l2)
        if log:
            print('Embeddings:', embeds, embeds.shape)
            print('\nLinear 1:', l1, l1.shape)
            print('\nReLU:', rect, rect.shape)
            print('\nLinear 2:', l2, l2.shape)
            print('\nLog Probs:', log_probs, log_probs.shape)
        return log_probs


In [13]:
# Run one context through a freshly created (untrained) model with log=True
# so every intermediate tensor of the forward pass is printed.
# Arguments: vocabulary size, embedding_dim=3, hidden_dim=25, context size.
model = NextWordModel(len(vocab), 3, 25, CONTEXT_SIZE)
# tokenize_n_grams takes a list of (context, target) pairs.
context = [(['player', 'is', 'an'], 'athlete')] 
context_tokens = tokenize_n_grams(context, words_to_tokens)
# Keep only the context tokens of the first (and only) pair.
context_tokens = torch.tensor(context_tokens[0][0], dtype=torch.int64)
prediction = model(context_tokens, log=True)

Embeddings: tensor([[-0.4253,  0.2625, -1.4391],
        [-0.2076, -1.1586, -0.9637],
        [-2.2933,  0.4976, -1.2956]], grad_fn=<EmbeddingBackward0>) torch.Size([3, 3])

Linear 1: tensor([[-0.2223,  0.0219,  0.2296, -0.3568,  1.0570,  1.1476,  0.5103,  0.9337,
         -0.8416, -0.3021,  1.0448,  1.7986,  0.9115, -1.3132, -0.6403,  1.0402,
          0.2021,  0.6796, -0.3059,  0.0673,  0.5142, -0.7614, -0.3377, -0.4474,
          0.2870]], grad_fn=<AddmmBackward0>) torch.Size([1, 25])

ReLU: tensor([[0.0000, 0.0219, 0.2296, 0.0000, 1.0570, 1.1476, 0.5103, 0.9337, 0.0000,
         0.0000, 1.0448, 1.7986, 0.9115, 0.0000, 0.0000, 1.0402, 0.2021, 0.6796,
         0.0000, 0.0673, 0.5142, 0.0000, 0.0000, 0.0000, 0.2870]],
       grad_fn=<ReluBackward0>) torch.Size([1, 25])

Linear 2: tensor([[ 0.2720,  0.3860,  0.1721,  0.3574,  0.2004,  0.0144, -0.5396, -0.7439,
          0.2958, -0.2228,  0.2883, -0.1979]], grad_fn=<AddmmBackward0>) torch.Size([1, 12])

Log Probs: tensor([[-2.2925, -2.1

### Training for Meaning

In [14]:
# Hyper-parameters for the training run.
EMBEDDING_DIM = 10  # length of each word's embedding vector
EPOCHS = 1000  # number of passes over the training n-grams
HIDDEN_DIM = 128  # width of the model's hidden layer
LR = 0.001  # learning rate given to the SGD optimizer

In [15]:
# Seed the random number generator so the model's initial weights are the same
# on every run.
torch.manual_seed(42)
losses = []
# Negative log-likelihood loss; the model already outputs log-probabilities.
loss_function = nn.NLLLoss()
model = NextWordModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in n_grams_tokenized:

        # Wrap the context in a tensor.
        context_tokens = torch.tensor(context, dtype=torch.int64)

        # PyTorch accumulates gradients, so before passing in a new
        # context (features) you need to zero out the gradients from the
        # previous context.
        model.zero_grad()

        # Forward pass - this returns a log probability for every word
        # in our vocabulary, as a tensor of shape (1, vocab_size).
        log_probs = model(context_tokens)

        # Compute the loss.
        # log_probs has a leading dimension of size 1, so the target is
        # wrapped in a one-element list to give one target per row.
        loss = loss_function(log_probs, torch.tensor([target], dtype=torch.int64))

        # Backward pass: compute the gradient of the loss with respect to
        # the model's parameters.
        loss.backward()

        # Update the model's parameters using the gradients just computed.
        optimizer.step()

        # Get the loss for this context.
        total_loss += loss.item()

    # Save the total loss for this epoch.
    losses.append(total_loss)

# The loss is expected to decrease over the iterations (epochs) over the
# training data.
# When you have a large number of iterations over a small training set you are basically
# memorizing your training set.
# Print the losses of every 100th epoch.
for epoch in range(0, EPOCHS, 100):
    print(epoch, ':', losses[epoch])
# This will print the very last epoch so we can see the
# final loss value.
print(EPOCHS-1, ':', losses[EPOCHS-1])

0 : 61.27088236808777
100 : 18.320321902632713
200 : 7.726442538201809
300 : 5.318262588232756
400 : 4.52228725515306
500 : 4.159920261241496
600 : 3.960211994126439
700 : 3.8365809861570597
800 : 3.7534883515909314
900 : 3.6943230908364058
999 : 3.650792679283768


### Testing the Model

In [16]:
# This is how to get the embedding of a word in our vocabulary
# after a model has been trained: index the model's embedding weight
# matrix with the word's token.
print(model.embeddings.weight[words_to_tokens['player']])

tensor([-1.6316,  1.0098, -0.8707, -0.6497, -1.3264,  2.1738, -1.3129, -0.5343,
        -0.9512, -0.6675], grad_fn=<SelectBackward0>)


In [17]:
# The context whose next word we want to predict.
# Make sure all text here is lower case since the original
# text was converted to lower case.
# Alternative context (its target is 'athlete'):
#context = ['player', 'is', 'an']
context = ['athlete', 'is', 'a']

context_tokens = torch.tensor([words_to_tokens[w] for w in context], dtype=torch.int64)
log_probs = model(context_tokens)

print(log_probs)
print(log_probs.shape)

# The tensor of log probabilities is a two dimensional tensor (matrix)
# with a single row here, because one context was passed in.
max_prob_index = torch.argmax(log_probs, dim=1)
print(max_prob_index)

print(log_probs[0, max_prob_index])
# .item() turns the one-element tensor into the plain int token that
# get_word_from_token expects.
predicted_word = get_word_from_token(max_prob_index.item(), words_to_tokens)
print('Predicted word:', predicted_word)

# The five most likely next words, from most to least probable.
top_predictions = torch.topk(log_probs, 5, dim=1, largest=True, sorted=True)
print(top_predictions.indices[0])

for rank, index in enumerate(top_predictions.indices[0], start=1):
    predicted_word = get_word_from_token(index.item(), words_to_tokens)
    print(rank, predicted_word)

tensor([[-3.5737, -3.7320, -0.7212, -4.7295, -2.4370, -2.6533, -3.5015, -5.9721,
         -5.0122, -4.7887, -1.4039, -6.2874]], grad_fn=<LogSoftmaxBackward0>)
torch.Size([1, 12])
tensor([2])
tensor([-0.7212], grad_fn=<IndexBackward0>)
Predicted word: baseball
tensor([ 2, 10,  4,  5,  6])
1 baseball
2 football
3 player
4 athlete
5 an


In [18]:
# Print the trained embeddings of three words so they can be compared.
print(model.embeddings.weight[words_to_tokens['player']])
print(model.embeddings.weight[words_to_tokens['players']])
print(model.embeddings.weight[words_to_tokens['athlete']])

tensor([-1.6316,  1.0098, -0.8707, -0.6497, -1.3264,  2.1738, -1.3129, -0.5343,
        -0.9512, -0.6675], grad_fn=<SelectBackward0>)
tensor([-0.6358, -0.2375, -0.7234,  0.5403, -0.5226, -0.6257,  0.3453, -0.6121,
         0.8003,  0.4612], grad_fn=<SelectBackward0>)
tensor([ 0.0708,  0.5251, -0.4969,  1.2200, -0.8312, -0.7442, -1.4389,  0.0616,
        -0.0735,  0.6839], grad_fn=<SelectBackward0>)


tensor([ 0.7278,  0.6263,  1.2314, -0.5148,  1.1015, -0.7679,  1.0683, -0.3027,
         0.5859,  0.3694], grad_fn=<SelectBackward0>)
<br/>
tensor([-1.1315,  1.3011, -0.1857,  0.5292,  0.0905,  0.4474,  0.6157, -0.7282,
        -2.4235, -0.7869], grad_fn=<SelectBackward0>)
<br/>
tensor([-0.0883,  1.8781, -1.2263,  1.4076,  1.4836,  0.8571,  2.2892,  0.5454,
         0.3926, -0.1998], grad_fn=<SelectBackward0>)
