In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

dtype = torch.FloatTensor

In [3]:
sentences = [ "i like dog", "i love coffee", "i hate milk"]

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict) # number of Vocabulary

# NNLM Parameter
emb_size = 2 # m in paper
n_step   = 2 # n-1 in paper
n_hidden = 2 # h in paper

def make_batch(sentences):
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

# Model
class LM(nn.Module):
    def __init__(self):
        super(LM, self).__init__()
        self.E  = nn.Embedding(n_class, emb_size)                                     # Embedding
        self.W1 = nn.Parameter(torch.randn(n_step * emb_size, n_hidden).type(dtype))  # Dense 1 weights
        self.B1 = nn.Parameter(torch.randn(n_hidden).type(dtype))                     # Dense 1 bias
        self.W2 = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))            # Dense 2 weights
        self.RW = nn.Parameter(torch.randn(n_step * emb_size, n_class).type(dtype))   # Dense 2 residual weights
        self.B2 = nn.Parameter(torch.randn(n_class).type(dtype))                      # Dense 2 bias

    def forward(self, X):
        X = self.E(X)                     # Embeding layer          [bs, n_step,  emb_size]
        X = X.view(-1, n_step * emb_size) # Embedings concatenation [bs, n_step * emb_size]
        tanh = torch.tanh(self.B1 + torch.mm(X, self.W1)) # Dense layer 1 [bs, hidden_size]
        output = self.B2 + torch.mm(X, self.RW) + torch.mm(tanh, self.W2) # Dense layer 2 with residual [bs, vocab_size]
        return output

model = LM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

# Training
for epoch in range(5000):

    optimizer.zero_grad()
    output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

# Test
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])

Epoch: 1000 cost = 0.077206
Epoch: 2000 cost = 0.014483
Epoch: 3000 cost = 0.005580
Epoch: 4000 cost = 0.002632
Epoch: 5000 cost = 0.001359
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']


In [4]:
input_batch

tensor([[2, 4],
        [2, 5],
        [2, 1]])

In [15]:
model(input_batch)

tensor([[-1.6254, -2.5839, -5.4058, -1.3580, -2.9153, -3.0252,  5.5526],
        [-0.7710, -1.8598, -5.1981,  6.9356, -4.8507, -1.7408, -0.1593],
        [ 6.0050, -7.2826, -7.4941, -1.7030, -1.9721, -3.7917, -0.5306]],
       grad_fn=<AddBackward0>)

In [20]:
target_batch

tensor([6, 3, 0])

In [21]:
number_dict

{0: 'milk', 1: 'hate', 2: 'i', 3: 'coffee', 4: 'like', 5: 'love', 6: 'dog'}

In [19]:
torch.sigmoid(model(input_batch))

tensor([[1.6446e-01, 7.0180e-02, 4.4704e-03, 2.0457e-01, 5.1403e-02, 4.6301e-02,
         9.9614e-01],
        [3.1626e-01, 1.3472e-01, 5.4965e-03, 9.9903e-01, 7.7625e-03, 1.4921e-01,
         4.6025e-01],
        [9.9754e-01, 6.8694e-04, 5.5604e-04, 1.5407e-01, 1.2217e-01, 2.2059e-02,
         3.7037e-01]], grad_fn=<SigmoidBackward>)

In [13]:
emb_layer = model.C
n_embs = emb_layer(input_batch)
n_embs

tensor([[[-1.0639,  1.4894],
         [-1.4890, -0.5873]],

        [[-1.0639,  1.4894],
         [ 1.2035,  2.2967]],

        [[-1.0639,  1.4894],
         [ 0.8084, -1.6483]]], grad_fn=<EmbeddingBackward>)

In [16]:
embs_concat = n_embs.view(-1, n_step * emb_size)
embs_concat

tensor([[-1.0639,  1.4894, -1.4890, -0.5873],
        [-1.0639,  1.4894,  1.2035,  2.2967],
        [-1.0639,  1.4894,  0.8084, -1.6483]], grad_fn=<ViewBackward>)

In [17]:
model.H

Parameter containing:
tensor([[ 0.1639, -0.6477],
        [-1.6906, -0.5368],
        [ 2.4811, -1.9047],
        [ 0.4444, -0.0475]], requires_grad=True)

In [18]:
tanh = torch.tanh(model.d + torch.mm(embs_concat, model.H))
tanh

tensor([[-1.0000,  0.9813],
        [ 0.8723, -0.9944],
        [-0.8834, -0.9636]], grad_fn=<TanhBackward>)