In [1]:
# see http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7ff518029768>

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
train_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
test_text = "We can study the idea of a computational process.".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(train_text + test_text)
vocab_size = len(vocab)
print('vocab_size:', vocab_size)

w2i = {w: i for i, w in enumerate(vocab)}
i2w = {i: w for i, w in enumerate(vocab)}

vocab_size: 50


In [3]:
def create_cbow_dataset(text):
    # context window is two, so four words are context
    data = []
    for i in range(2, len(text) - 2):
        context = [text[i - 2], text[i - 1],
                   text[i + 1], text[i + 2]]
        target = text[i]
        data.append((context, target))
    return data

cbow_train = create_cbow_dataset(train_text)
cbow_test = create_cbow_dataset(test_text)

In [4]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        embedded = self.embeddings(inputs).view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out)
        return log_probs

In [5]:
embd_size = 128
hidden_size = 64
losses = []
loss_fn = nn.NLLLoss()
model = CBOW(vocab_size, embd_size, CONTEXT_SIZE, hidden_size)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = torch.Tensor([0])
    for ctx, target in cbow_train:
        ctx_idxs = [w2i[w] for w in ctx]
        ctx_var = Variable(torch.LongTensor(ctx_idxs))
        
        model.zero_grad()
        log_probs = model(ctx_var)
        
        loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
    losses.append(total_loss)
print(losses)

[
 228.0627
[torch.FloatTensor of size 1]
, 
 222.6559
[torch.FloatTensor of size 1]
, 
 217.4730
[torch.FloatTensor of size 1]
, 
 212.5053
[torch.FloatTensor of size 1]
, 
 207.6947
[torch.FloatTensor of size 1]
, 
 203.0198
[torch.FloatTensor of size 1]
, 
 198.4566
[torch.FloatTensor of size 1]
, 
 193.9582
[torch.FloatTensor of size 1]
, 
 189.5747
[torch.FloatTensor of size 1]
, 
 185.2126
[torch.FloatTensor of size 1]
]


In [12]:
correct_ct = 0
for ctx, target in cbow_test:
    ctx_idxs = [w2i[w] for w in ctx]
    ctx_var = Variable(torch.LongTensor(ctx_idxs))

    model.zero_grad()
    log_probs = model(ctx_var)
    _, predicted = torch.max(log_probs.data, 1)
    predicted_word = i2w[predicted[0]]
    print('predicted:', predicted_word)
    print('label    :', target, '\n')
    if predicted_word == target:
        correct_ct += 1

    loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))

    loss.backward()
    optimizer.step()

    total_loss += loss.data
print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(cbow_test)*100, correct_ct, len(cbow_test)))

predicted: they
label    : study 

predicted: the
label    : the 

predicted: idea
label    : idea 

predicted: of
label    : of 

predicted: a
label    : a 

Accuracy: 80.0% (4/5)
