In [1]:
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim

In [2]:
vocab_size = 256  # ascii size
x_ = list(map(ord, "hello"))  # convert to list of ascii codes
y_ = list(map(ord, "hola"))   # convert to list of ascii codes
print("hello -> ", x_)
print("hola  -> ", y_)

hello ->  [104, 101, 108, 108, 111]
hola  ->  [104, 111, 108, 97]


In [3]:
x = Variable(th.LongTensor(x_))
y = Variable(th.LongTensor(y_))

In [4]:
print(x)

tensor([104, 101, 108, 108, 111])


In [5]:
'''
Model using GRU and conventional concatenating motion.
'''
class Seq2Seq_GRU(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Seq2Seq_GRU, self).__init__()

        self.n_layers = 1
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.encoder = nn.GRU(hidden_size, hidden_size)
        self.decoder = nn.GRU(hidden_size * 2, hidden_size)
        self.project = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs, targets):
        # Encoder inputs and states
        initial_state = self._init_state()
        embedding = self.embedding(inputs).unsqueeze(1)
        encoder_output, encoder_state = self.encoder(embedding, initial_state)
        outputs = []

        decoder_state = encoder_state
        for i in range(targets.size()[0]): 
            decoder_input = self.embedding(targets)[i].view(1,-1, self.hidden_size)
            decoder_input = th.cat((decoder_input, encoder_state), 2)
            decoder_output, decoder_state = self.decoder(decoder_input, decoder_state)
            projection = self.project(decoder_output)#.unsqueeze(0))
            outputs.append(projection)
            
            #_, top_i = prediction.data.topk(1)
            
        outputs = th.stack(outputs, 1).squeeze()

        return outputs
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.n_layers, batch_size, self.hidden_size).zero_()) 



In [6]:
model = Seq2Seq_GRU(vocab_size, 16)
pred = model(x, y)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr=1e-3)

In [8]:
y_.append(3)
y_label = Variable(th.LongTensor(y_[1:]))

In [9]:
print(y_label.shape)
print(y_label)

torch.Size([4])
tensor([111, 108,  97,   3])


In [10]:
log = []
for i in range(10000):
    prediction = model(x, y)
    loss = criterion(prediction, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_val = loss.data
    log.append(loss_val)
    if i % 100 == 0:
        print("%d loss: %s" % (i, loss_val.item()))
        _, top1 = prediction.data.topk(1, 1)
        for c in top1.squeeze().numpy().tolist():
            print(chr(c), end=" ")
        print()

0 loss: 5.611233234405518
E 7 V 4 
100 loss: 1.9432737827301025
h o l a 
200 loss: 0.5616824626922607
h o l a 
300 loss: 0.2673376798629761
h o l a 
400 loss: 0.16206204891204834
h o l a 
500 loss: 0.11266088485717773
h o l a 
600 loss: 0.08434617519378662
h o l a 
700 loss: 0.06614077091217041
h o l a 
800 loss: 0.05353283882141113
h o l a 
900 loss: 0.04404950141906738
h o l a 
1000 loss: 0.03680562973022461
h o l a 
1100 loss: 0.03102242946624756
h o l a 
1200 loss: 0.026388049125671387
h o l a 
1300 loss: 0.022706985473632812
h o l a 
1400 loss: 0.019757390022277832
h o l a 
1500 loss: 0.0173567533493042
h o l a 
1600 loss: 0.015371441841125488
h o l a 
1700 loss: 0.013707399368286133
h o l a 
1800 loss: 0.012296199798583984
h o l a 
1900 loss: 0.011086702346801758
h o l a 
2000 loss: 0.010039806365966797
h o l a 
2100 loss: 0.009126663208007812
h o l a 
2200 loss: 0.008322954177856445
h o l a 
2300 loss: 0.007612466812133789
h o l a 
2400 loss: 0.006980180740356445
h o l a 
2500 l

In [11]:
import matplotlib.pyplot as plt
plt.plot(log)
plt.xlim(0,150)
plt.ylim(0,15)
plt.ylabel('cross entropy loss')
plt.show()


<Figure size 640x480 with 1 Axes>