In [1]:
"""
simple embedding example

torch.nn.Embedding(vocav_size, dimensions)
The resulting table is indexed using torch.LongTensor, because the indices are integers, not floats
"""

<torch._C.Generator at 0x7f4d3a3e53f0>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

Basic Embedding


In [37]:
word_to_ix = {"hello":0, "world":1}
print(word_to_ix)
vocab_size = len(word_to_ix)
dimensions = 5
embeds = torch.nn.Embedding(vocab_size, dimensions)
print("Initial weights")
print(embeds.weight)

word = "world"
lookup_tensor = torch.tensor([word_to_ix[word]], dtype=torch.long)
print("Lookup tensor for {}: {}".format(word, lookup_tensor))

hello_embed = embeds(lookup_tensor)
print("Word {} embeddings".format(word))
print(hello_embed)


{'hello': 0, 'world': 1}
Initial weights
Parameter containing:
tensor([[ 1.7995,  0.5176, -1.1685,  0.7591, -1.8689],
        [ 0.5573,  0.1582, -0.9813, -0.8757, -0.2630]], requires_grad=True)
Lookup tensor for world: tensor([1])
Word world embeddings
tensor([[ 0.5573,  0.1582, -0.9813, -0.8757, -0.2630]],
       grad_fn=<EmbeddingBackward>)


In [25]:
"""
N-gram Language Modeling

we will compute the loss function on some training examples
and update the parameters with backpropagation.
"""

'\n\n'

N-gram Training with loss function and backpropagation

In [259]:
CONTEXT_SIZE = 2
EMBEDDING_SIZE = 100
EPOCHS = 20

raw_test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

test_tokens = raw_test_sentence.lower().split()
print("Total tokens:", len(test_tokens))
print("Total unique tokens:", len(set(test_tokens)))

# Build the list of tuples. ([word_i-2, word_i-1], target_word)

trigrams = [([test_tokens[i], test_tokens[i+1]], test_tokens[i+2]) for i in range(len(test_tokens) - 2)]
print("Total trigrams:", len(trigrams))
print("First 5 trigrams")
print(trigrams[:5])

vocab = set(test_tokens)
vocab_size = len(vocab)
word_to_ix = {word:i for i, word in enumerate(vocab)}
print(word_to_ix)

Total tokens: 115
Total unique tokens: 90
Total trigrams: 113
First 5 trigrams
[(['when', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege'), (['shall', 'besiege'], 'thy'), (['besiege', 'thy'], 'brow,')]
{'brow,': 0, 'where': 1, 'more': 2, 'count,': 3, "deserv'd": 4, 'his': 5, 'sum': 6, 'blood': 7, 'praise': 8, 'it': 9, 'asked,': 10, 'now,': 11, 'days;': 12, 'cold.': 13, 'if': 14, 'succession': 15, 'make': 16, 'the': 17, 'an': 18, 'small': 19, 'my': 20, 'old,': 21, 'all-eating': 22, 'lusty': 23, 'held:': 24, 'thy': 25, 'eyes,': 26, 'much': 27, 'livery': 28, 'gazed': 29, 'weed': 30, 'of': 31, 'worth': 32, 'all': 33, 'proving': 34, 'winters': 35, "youth's": 36, 'thou': 37, "'this": 38, 'made': 39, 'proud': 40, 'field,': 41, 'child': 42, 'a': 43, "feel'st": 44, 'new': 45, 'mine': 46, 'on': 47, 'warm': 48, 'old': 49, 'own': 50, 'be': 51, 'trenches': 52, 'say,': 53, 'then': 54, 'thine!': 55, 'this': 56, 'use,': 57, 'were': 58, 'to': 59, 'deep': 60, 'wil

In [85]:
# Define the NN
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_size, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(context_size * embedding_size, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1,-1))
        #print(embeds.shape)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [118]:
# Compile the NN
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(vocab_size, EMBEDDING_SIZE, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training
for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in trigrams:

        # Step 1: Prepare the inputs to be passed to the model
        # Turn words into integer indices and wrap then in tensors
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2: Zero out the gradients from the old gradients before passing in a new instance
        model.zero_grad()

        # Step 3: Run a forward pass, getting the log probabilities over next words
        log_probs = model(context_idxs)

        # Step 4. Compute the loss function.
        # Wrap the target word in a tensor
        target_word = torch.tensor([word_to_ix[target]], dtype=torch.long)
        loss = loss_function(log_probs, target_word)

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the python number from a 1-element Tensor
        total_loss += loss.item()

    mean_loss = total_loss/len(trigrams)
    print("Epoch {}; Loss {:.4f}".format(epoch+1, mean_loss))

    losses.append(mean_loss)
print(losses)

tensor([84, 62])
tensor([62, 35])
tensor([35, 71])
tensor([71, 67])
tensor([67, 25])
tensor([25,  0])
tensor([ 0, 81])
tensor([81, 78])
tensor([78, 60])
tensor([60, 52])
tensor([52, 88])
tensor([88, 25])
tensor([25, 72])
tensor([72, 41])
tensor([41, 25])
tensor([25, 36])
tensor([36, 40])
tensor([40, 28])
tensor([28, 87])
tensor([87, 29])
tensor([29, 47])
tensor([47, 11])
tensor([11, 61])
tensor([61, 51])
tensor([51, 43])
tensor([43, 70])
tensor([70, 30])
tensor([30, 31])
tensor([31, 19])
tensor([19, 32])
tensor([32, 24])
tensor([24, 54])
tensor([54, 82])
tensor([82, 10])
tensor([10,  1])
tensor([ 1, 33])
tensor([33, 25])
tensor([25, 79])
tensor([79, 65])
tensor([65,  1])
tensor([ 1, 33])
tensor([33, 17])
tensor([17, 73])
tensor([73, 31])
tensor([31, 25])
tensor([25, 23])
tensor([23, 12])
tensor([12, 59])
tensor([59, 53])
tensor([53, 86])
tensor([86, 83])
tensor([83, 50])
tensor([50, 60])
tensor([60, 89])
tensor([89, 26])
tensor([26, 58])
tensor([58, 18])
tensor([18, 22])
tensor([22, 66

In [117]:
print("Vocabulary size:", vocab_size)
print("Embedding size:")
print(model.embeddings.weight.shape)
print("Embeddings:")
print(model.embeddings.weight)

word = "beauty"
word_tensor = torch.tensor([word_to_ix[word]], dtype=torch.long)
print("Lookup tensor for {}: {}".format(word, word_tensor))
word_embedding = model.embeddings(word_tensor)
print("Word {} embeddings".format(word))
print(word_embedding)
print(word_embedding.shape)

Vocabulary size: 90
Embedding size:
torch.Size([90, 20])
Embeddings:
Parameter containing:
tensor([[-0.0715,  0.6002,  0.4448,  ..., -0.1975, -0.6606,  0.2085],
        [ 1.1495,  0.5372,  0.0978,  ...,  0.4253, -0.6698,  0.5892],
        [ 0.5206, -0.0339, -0.7048,  ..., -1.9731, -1.1109, -1.0216],
        ...,
        [-1.1528, -0.6887,  0.3358,  ...,  0.6819,  0.9988, -1.0535],
        [-0.2648,  0.0995, -1.4730,  ..., -0.8223, -1.1571, -0.1485],
        [ 1.0819,  1.1170, -0.7826,  ..., -0.2714, -0.2224, -0.9763]],
       requires_grad=True)
Lookup tensor for beauty: tensor([79])
Word beauty embeddings
tensor([[ 0.1322, -0.3402, -0.2000,  0.6007,  0.1257,  0.6941,  0.7279,  1.8358,
         -0.0501,  1.9949,  0.0692, -0.8441, -1.4098, -0.2287, -0.9542,  0.1309,
          1.6214,  1.0110,  1.2602, -0.9956]], grad_fn=<EmbeddingBackward>)
torch.Size([1, 20])


In [109]:
"""
CBOW
tries to predict words given the context of a few words before and a few words after the target word.
CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings
of some more complicated model.
this is referred to as pretraining embeddings.

"""

In [250]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".lower().split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
print("Vocab size:",vocab_size)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {word_to_ix[k]:k for k in word_to_ix}

print(word_to_ix)
print(ix_to_word)

Vocab size: 46
{'about': 0, 'beings': 1, 'rules': 2, 'spells.': 3, 'direct': 4, 'inhabit': 5, 'conjure': 6, 'our': 7, 'directed': 8, 'people': 9, 'is': 10, 'they': 11, 'a': 12, 'process.': 13, 'data.': 14, 'process': 15, 'create': 16, 'computer': 17, 'processes.': 18, 'as': 19, 'with': 20, 'pattern': 21, 'things': 22, 'by': 23, 'we': 24, 'abstract': 25, 'processes': 26, 'of': 27, 'computational': 28, 'called': 29, 'effect,': 30, 'manipulate': 31, 'evolve,': 32, 'idea': 33, 'programs': 34, 'program.': 35, 'computers.': 36, 'evolution': 37, 'that': 38, 'in': 39, 'to': 40, 'the': 41, 'are': 42, 'study': 43, 'other': 44, 'spirits': 45}
{0: 'about', 1: 'beings', 2: 'rules', 3: 'spells.', 4: 'direct', 5: 'inhabit', 6: 'conjure', 7: 'our', 8: 'directed', 9: 'people', 10: 'is', 11: 'they', 12: 'a', 13: 'process.', 14: 'data.', 15: 'process', 16: 'create', 17: 'computer', 18: 'processes.', 19: 'as', 20: 'with', 21: 'pattern', 22: 'things', 23: 'by', 24: 'we', 25: 'abstract', 26: 'processes', 27

In [251]:
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['we', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [260]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, dimension_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, dimension_size)
        self.linear1 = nn.Linear(dimension_size, 128)
        self.activation_function1 = nn.ReLU()
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, EMBEDDING_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [253]:
def make_context_vector(context, word_to_ix):
    #print("context")
    #print(context)
    #print("W2i")
    #print(word_to_ix)
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)

tensor([24, 42, 40, 43])

In [261]:
# Training
for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in data:

        # Step 1: Prepare the inputs to be passed to the model
        # Turn words into integer indices and wrap then in tensors
        #context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        context_idxs = make_context_vector(context, word_to_ix)
        # Step 2: Zero out the gradients from the old gradients before passing in a new instance
        model.zero_grad()

        # Step 3: Run a forward pass, getting the log probabilities over next words
        log_probs = model(context_idxs)

        # Step 4. Compute the loss function.
        # Wrap the target word in a tensor
        target_word = torch.tensor([word_to_ix[target]], dtype=torch.long)
        loss = loss_function(log_probs, target_word)

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the python number from a 1-element Tensor
        total_loss += loss.item()

    mean_loss = total_loss/len(trigrams)
    print("Epoch {}; Loss {:.4f}".format(epoch+1, mean_loss))

    losses.append(mean_loss)
print(losses)

KeyError: 'we'

Test

In [255]:
def get_index_of_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i
    return index

def get_max_prob_result(input, ix_to_word):
    return ix_to_word[get_index_of_max(input)]

In [257]:
context = ["people", "create", "to", "direct"]
print(context)

context_vector = make_context_vector(context, word_to_ix)
print(context_vector)

['people', 'create', 'to', 'direct']
tensor([ 9, 16, 40,  4])


In [258]:
a = model(context_vector).data.numpy()
print("Raw text: {}\n".format(" ".join(raw_text)))
print("Context: {}\n".format(context))
print("Prediction: {}".format(get_max_prob_result(a[0], ix_to_word)))

Raw text: we are about to study the idea of a computational process. computational processes are abstract beings that inhabit computers. as they evolve, processes manipulate other abstract things called data. the evolution of a process is directed by a pattern of rules called a program. people create programs to direct processes. in effect, we conjure the spirits of the computer with our spells.

Context: ['people', 'create', 'to', 'direct']

Prediction: the
