# Linguistic Meaning and Embeddings
### Contents
Creating the Vocabulary<br>
Creating the N-Grams<br>
Tokenizing the N-Grams<br>
From Tokens to Embeddings<br>
Creating the Model<br>
Train for Meaning<br>


In [1]:
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Creating the Vocabulary

In [2]:
def clean_text(raw_text: str) -> list:
    remove_breaks = raw_text.replace('<br />', ' ')
    lower = remove_breaks.lower()
    valid_characters = [c for c in lower if c not in punctuation]
    cleaned = ''.join(valid_characters)
    return cleaned

In [3]:
training_paragraph = """Football players are fast runners.
Baseball players are fast runners.
Soccer players are fast runners.
A football player is an athlete.
A baseball player is an athlete.
"""

training_paragraph = clean_text(training_paragraph)
training_words = training_paragraph.split()

# Turning our list of words into a set has the effect of eliminating duplicates.
# This is a useful technique for getting a distinct list of words.
vocab = set(training_words)

In [4]:
print('Clean sentence:', training_paragraph, '\n')
print('Word List:', training_words, '\n')
print('Vocabulary:', vocab, '\n')

Clean sentence: football players are fast runners
baseball players are fast runners
soccer players are fast runners
a football player is an athlete
a baseball player is an athlete
 

Word List: ['football', 'players', 'are', 'fast', 'runners', 'baseball', 'players', 'are', 'fast', 'runners', 'soccer', 'players', 'are', 'fast', 'runners', 'a', 'football', 'player', 'is', 'an', 'athlete', 'a', 'baseball', 'player', 'is', 'an', 'athlete'] 

Vocabulary: {'is', 'fast', 'football', 'are', 'players', 'soccer', 'baseball', 'a', 'player', 'an', 'athlete', 'runners'} 



### Creating the N-Grams

In [5]:
def create_n_grams(word_list: list, context_size: int) -> list:
    n_grams = [
        (
            [word_list[i - j] for j in range(context_size, 0, -1)],
            word_list[i]
        )
        for i in range(context_size, len(word_list))
    ]
    return n_grams


def get_word_from_token(token: int, words_to_tokens: dict) -> str:
    word = [w for w in words_to_tokens if words_to_tokens[w] == token]
    return word[0]


In [6]:
CONTEXT_SIZE = 3

# Create the n_grams and target using the context size.
n_grams = create_n_grams(training_words, CONTEXT_SIZE)

In [7]:
# Print the first 3 n-grams to get a feel for their shape.
print('n_grams:', n_grams[:3], '\n')

n_grams: [(['football', 'players', 'are'], 'fast'), (['players', 'are', 'fast'], 'runners'), (['are', 'fast', 'runners'], 'baseball')] 



### Tokenizing the N-Grams

In [8]:
# Enumerate over the vocabulary and create a word to token mapping.
# The index for each word within th set will become the token for that word.
words_to_tokens = {word: i for i, word in enumerate(vocab)}
print('Tokens:', words_to_tokens)

Tokens: {'is': 0, 'fast': 1, 'football': 2, 'are': 3, 'players': 4, 'soccer': 5, 'baseball': 6, 'a': 7, 'player': 8, 'an': 9, 'athlete': 10, 'runners': 11}


In [9]:
def tokenize_n_grams(n_grams: list, words_to_tokens: dict) -> list:
    n_grams_tokenized = [
        (
            [words_to_tokens[w] for w in context], 
            words_to_tokens[target]
        )
        for context, target in n_grams
    ]
    return n_grams_tokenized

In [10]:
# Tokenize the context and targets in the n_grams.
n_grams_tokenized = tokenize_n_grams(n_grams, words_to_tokens)
print('n_grams_tokenized:', n_grams_tokenized[:3])

n_grams_tokenized: [([2, 4, 3], 1), ([4, 3, 1], 11), ([3, 1, 11], 6)]


### From Tokens to Embeddings

In [11]:
# If you want to get the same result each time you create an embedding then set
# the manual seed to the same value every time.
torch.manual_seed(42)

embeddings = nn.Embedding(len(vocab), 10)
print('Embeddings object:', embeddings)

lookup = torch.tensor(words_to_tokens['athlete'], dtype=torch.int64)
sample_embedding = embeddings(lookup)
print('Embedding for the word athlete:', sample_embedding)

Embeddings object: Embedding(12, 10)
Embedding for the word athlete: tensor([ 0.6408,  0.5832,  1.0669, -0.4502,  1.0311, -0.7048,  1.0131, -0.3308,
         0.5177,  0.3878], grad_fn=<EmbeddingBackward0>)


### Creating the Model

In [35]:
class NextWordModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, context_size):
        super(NextWordModel, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()  # Rectified Linear Unit Function
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, log=False):
        embeds = self.embeddings(inputs)
        #out = F.relu(self.linear1(embeds))
        #out = self.linear2(out)
        #log_probs = F.log_softmax(out, dim=1)
        #return log_probs

        l1 = self.linear1(embeds.view((1, -1)))
        rect = self.relu(l1)
        l2 = self.linear2(rect)
        log_probs = self.log_softmax(l2) # F.log_softmax(out, dim=1)
        if log:
            print('Embeddings:', embeds)
            print('\nLinear 1:', l1)
            print('\nReLU:', rect)
            print('\nLinear 2:', l2)
            print('\nLog Probs:', log_probs)
        return log_probs


In [37]:
model = NextWordModel(len(vocab), 3, 25, CONTEXT_SIZE)
context = [(['player', 'is', 'an'], 'athlete')] 
context_tokens = tokenize_n_grams(context, words_to_tokens)
context_tokens = torch.tensor(context_tokens[0][0], dtype=torch.int64)
prediction = model(context_tokens, log=True)

Embeddings: tensor([[ 1.5502, -1.0978, -0.8919],
        [-0.5756, -1.4387, -0.1491],
        [ 1.3085,  2.1217,  0.0478]], grad_fn=<EmbeddingBackward0>)

Linear 1: tensor([[ 0.2590, -1.4024, -0.3151, -0.1871,  0.5986,  0.4474,  0.7647,  0.2096,
         -0.5723, -0.3835,  0.4502,  0.6856, -0.2956, -0.2554, -0.3235,  0.8177,
          0.1214, -1.1402, -0.0268,  1.2861,  0.2946,  0.1654,  0.3595,  0.5408,
          1.4968]], grad_fn=<AddmmBackward0>)

ReLU: tensor([[0.2590, 0.0000, 0.0000, 0.0000, 0.5986, 0.4474, 0.7647, 0.2096, 0.0000,
         0.0000, 0.4502, 0.6856, 0.0000, 0.0000, 0.0000, 0.8177, 0.1214, 0.0000,
         0.0000, 1.2861, 0.2946, 0.1654, 0.3595, 0.5408, 1.4968]],
       grad_fn=<ReluBackward0>)

Linear 2: tensor([[-0.0700,  0.1695,  0.1949, -0.7051,  0.1194, -0.2859, -0.1849, -0.1043,
         -0.1327, -0.6623, -0.0593,  0.2066]], grad_fn=<AddmmBackward0>)

Log Probs: tensor([[-2.4679, -2.2284, -2.2030, -3.1030, -2.2785, -2.6838, -2.5828, -2.5022,
         -2.5305, -3

### Training for Meaning

In [12]:
EMBEDDING_DIM = 10
EPOCHS = 1000
HIDDEN_DIM = 128
LR = 0.001

In [14]:
torch.manual_seed(42)
losses = []
loss_function = nn.NLLLoss()
model = NextWordModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in n_grams_tokenized:

        # Wrap the context in a tensor)
        context_tokens = torch.tensor(context, dtype=torch.int64)

        # Pytorch accumulates gradients so before passing in a new
        # context (features) you need to zero out the gradients from the 
        # previous context.
        model.zero_grad()

        # Forward pass - this will get log probabilities for every word 
        # in our vocabulary which is now represented as embeddings.
        log_probs = model(context_tokens)

        # Compute the loss.
        # target has to be a list for some reason.
        loss = loss_function(log_probs, torch.tensor([target], dtype=torch.int64))

        # Backward pass to update the gradients.
        loss.backward()

        # What does this do?
        optimizer.step()

        # Get the loss for this context.
        total_loss += loss.item()

    # Save the total loss for this epoch.
    losses.append(total_loss)

# The loss should decrease with every iteration (epoch) over the training data.
# When you have a large number of iterations over a small training set you are basically
# memorizing your training set.
# Print the losses of every 100th epoch.
for epoch in range(0, EPOCHS, 100):
    print(epoch, ':', losses[epoch])
# This will print the very last epoch so we can see the
# final loss value.
print(EPOCHS-1, ':', losses[EPOCHS-1])

0 : 59.95197677612305
100 : 19.475807011127472
200 : 8.627903487533331
300 : 5.792961858212948
400 : 4.8123498018831015
500 : 4.362085836939514
600 : 4.114962687715888
700 : 3.9621226326562464
800 : 3.8599168458022177
900 : 3.787520877085626
999 : 3.7343910494819283


### Testing the Model

In [15]:
# This is how to get the embedding of a word in our vocabulary 
# after a model has been trained.
print(model.embeddings.weight[words_to_tokens['player']])

tensor([-0.6405, -0.1944, -0.6511,  0.5523, -0.5914, -0.5250,  0.2237, -0.6562,
         0.7460,  0.4346], grad_fn=<SelectBackward0>)


In [16]:
# The tag for this context is 'athlete'.
# Make sure all text here is lower case since the original
# text was converted to lower case.
context = ['player', 'is', 'an'] 

context_indecies = torch.tensor([words_to_tokens[w] for w in context], dtype=torch.int64)
log_probs = model(context_indecies)

In [17]:
print(log_probs)
print(log_probs.shape)

# The tensor of log probabilities is a two dimensional tensor (matrix)
# because the model is expecting a batch of contextes.
max_prob_index = torch.argmax(log_probs, dim=1)
print(max_prob_index)

print(log_probs[0, max_prob_index])
predicted_word = get_word_from_token(max_prob_index, words_to_tokens)
print('Predicted word:', predicted_word)

top_predictions = torch.topk(log_probs, 5, dim=1, largest=True, sorted=True)
print(top_predictions.indices[0])

i = 0
for index in top_predictions.indices[0]:
    i += 1
    predicted_word = get_word_from_token(index, words_to_tokens)
    print(i, predicted_word)

tensor([[-7.9228, -6.6111, -6.7434, -0.0103, -7.9844, -7.0675, -7.5933, -7.8334,
         -5.7246, -8.1556, -6.8679, -7.2781]], grad_fn=<LogSoftmaxBackward0>)
torch.Size([1, 12])
tensor([3])
tensor([-0.0103], grad_fn=<IndexBackward0>)
Predicted word: athlete
tensor([ 3,  8,  1,  2, 10])
1 athlete
2 is
3 baseball
4 an
5 players


In [18]:
print(model.embeddings.weight[words_to_tokens['player']])
print(model.embeddings.weight[words_to_tokens['players']])
print(model.embeddings.weight[words_to_tokens['athlete']])

tensor([-0.6405, -0.1944, -0.6511,  0.5523, -0.5914, -0.5250,  0.2237, -0.6562,
         0.7460,  0.4346], grad_fn=<SelectBackward0>)
tensor([ 0.6887,  0.5703,  1.1483, -0.5263,  1.0720, -0.7966,  1.0411, -0.3138,
         0.5755,  0.3967], grad_fn=<SelectBackward0>)
tensor([-0.2747,  0.9012, -1.4918, -0.9536, -0.1928,  1.7755,  0.2302, -0.4631,
         0.3223, -0.8082], grad_fn=<SelectBackward0>)


tensor([ 0.7278,  0.6263,  1.2314, -0.5148,  1.1015, -0.7679,  1.0683, -0.3027,
         0.5859,  0.3694], grad_fn=<SelectBackward0>)
<br/>
tensor([-1.1315,  1.3011, -0.1857,  0.5292,  0.0905,  0.4474,  0.6157, -0.7282,
        -2.4235, -0.7869], grad_fn=<SelectBackward0>)
<br/>
tensor([-0.0883,  1.8781, -1.2263,  1.4076,  1.4836,  0.8571,  2.2892,  0.5454,
         0.3926, -0.1998], grad_fn=<SelectBackward0>)
