In [9]:
from nltk.tokenize import WordPunctTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)

<torch._C.Generator at 0x22675dab5b0>

Raw text:

In [2]:
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

Build vocab:

In [3]:
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(raw_text.lower())
vocab = list(set(tokens))
vocab_size = len(vocab)

Build context and target pari:

In [6]:
from pprint import pprint

context_size = 2

data = []
for token in tokens[context_size:-context_size]:  # ignore boundary tokens
    token_index = tokens.index(token)
    context = tokens[token_index - context_size: token_index] + tokens[token_index + 1: token_index + context_size + 1]
    target = token
    data.append((context, target))

print("5 random samples")
pprint(data[:5])

5 random samples
[(['we', 'are', 'to', 'study'], 'about'),
 (['are', 'about', 'study', 'the'], 'to'),
 (['about', 'to', 'the', 'idea'], 'study'),
 (['to', 'study', 'idea', 'of'], 'the'),
 (['study', 'the', 'of', 'a'], 'idea')]


Helper function to create context vector:

In [8]:

ind2word = {i:w for i, w in enumerate(vocab)}
word2ind = {w:i for i, w in enumerate(vocab)}

def make_context_vector(context, word2ind):
    idxs = [word2ind[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)    

CBOW:

In [12]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.proj = nn.Linear(in_features=embedding_dim, out_features=128)
        self.output = nn.Linear(in_features=128, out_features=vocab_size)

    def forward(self, inputs):
        embeds = sum(self.embedding(inputs)).view(1, -1)
        proj = self.proj(embeds)
        out = self.output(proj)
        nll_prob = F.log_softmax(out, dim=-1)
        return nll_prob

Train model:

In [13]:
embedding_dim = 10

cbow = CBOW(vocab_size=vocab_size, embedding_dim=embedding_dim)
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

losses = []
loss = nn.NLLLoss()

In [15]:
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cbow.to(device)

for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word2ind).to(device)
        target_vector = torch.tensor([word2ind[target]], dtype=torch.long).to(device)

        nll_prob = cbow(context_vector)

        cbow.zero_grad()

        l = loss(nll_prob, target_vector)
        l.backward()
        optimizer.step()

        total_loss += l.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")
    losses.append(total_loss)

Epoch 1, Loss: 140.92402005195618
Epoch 2, Loss: 134.67801320552826
Epoch 3, Loss: 128.90018022060394
Epoch 4, Loss: 123.53096604347229
Epoch 5, Loss: 118.5208885371685
Epoch 6, Loss: 113.82911732792854
Epoch 7, Loss: 109.42193755507469
Epoch 8, Loss: 105.27136409282684
Epoch 9, Loss: 101.353968501091
Epoch 10, Loss: 97.64996945858002
Epoch 11, Loss: 94.14246702194214
Epoch 12, Loss: 90.81686639785767
Epoch 13, Loss: 87.660389482975
Epoch 14, Loss: 84.66169247031212
Epoch 15, Loss: 81.81060990691185
Epoch 16, Loss: 79.09790739417076
Epoch 17, Loss: 76.51513168215752
Epoch 18, Loss: 74.05449417233467
Epoch 19, Loss: 71.70878595113754
Epoch 20, Loss: 69.4712925851345
Epoch 21, Loss: 67.33575594425201
Epoch 22, Loss: 65.29632915556431
Epoch 23, Loss: 63.347537979483604
Epoch 24, Loss: 61.484277084469795
Epoch 25, Loss: 59.70174977183342
Epoch 26, Loss: 57.99547204375267
Epoch 27, Loss: 56.361248672008514
Epoch 28, Loss: 54.79515051841736
Epoch 29, Loss: 53.293517887592316
Epoch 30, Loss: 

Predict:

In [None]:
context = ['we','are','to', 'study']
context_vector = make_context_vector(context, word2ind).to(device)
a = cbow(context_vector).detach().cpu().squeeze()
print(f'Raw text: {raw_text}')
print(f'Test Context: {context}')
max_idx = torch.argmax(a, axis=-1).item()
print(f'\nPrediction: {ind2word[max_idx]}')

Raw text: We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.
Test Context: ['we', 'are', 'to', 'study']

Prediction: about


Visualize embedding results:

In [44]:
import numpy as np

In [45]:
embedder = cbow.embedding
sample_words = ['process', 'computer', 'data', 'program', 'people', 'we']
embedded_words = {word: embedder.weight[word2ind[word]].detach().cpu().numpy() for word in sample_words}

RuntimeError: Numpy is not available

In [42]:
embedded_words

{'process': tensor([ 1.0636, -0.6141,  0.4515,  1.0341, -1.0069, -2.1825,  0.3395, -0.4446,
          0.1524,  0.0487]),
 'computer': tensor([-0.8820,  0.0048, -1.2720, -1.0824,  0.7433,  0.3217,  0.3317, -1.5989,
          0.2196,  0.2818]),
 'data': tensor([-1.0211,  0.3959,  0.9917,  0.9879, -1.1777, -0.5731, -1.4529, -0.3053,
         -0.2835,  0.1194]),
 'program': tensor([ 0.8851, -0.4767, -0.1831, -0.1010,  0.1492,  2.4369,  1.3275, -0.2773,
          0.7176, -0.1568]),
 'people': tensor([-0.3989, -0.0460, -0.0509, -0.9866,  0.0163, -1.0345, -0.0870, -0.6354,
          0.8794,  0.1424]),
 'we': tensor([-0.6532, -1.6436, -0.0586,  0.6232,  0.2843, -0.6979,  1.6385,  0.3350,
          0.0041,  0.3089])}