In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

- Additive Attention

In [6]:
# All tensors and modules in this notebook live on the CPU.
device = torch.device('cpu')

# Geometry of the toy batch.
SEQ_LEN    = 15
BATCH_SIZE = 5

# Vocabulary sizes: INPUT_DIM for the source tokens, OUTPUT_DIM for the target tokens.
INPUT_DIM  = 30
OUTPUT_DIM = 37

# Model hyperparameters; encoder and decoder share the same values.
# HID_DIM, *_LAYERS, *_HEADS and *_PF_DIM are not referenced by the cells
# shown in this part of the notebook.
HID_DIM = 256
ENC_EMB_DIM, DEC_EMB_DIM = 32, 32
ENC_HID_DIM, DEC_HID_DIM = 64, 64
ENC_LAYERS,  DEC_LAYERS  = 3, 3
ENC_HEADS,   DEC_HEADS   = 8, 8
ENC_PF_DIM,  DEC_PF_DIM  = 512, 512
ENC_DROPOUT, DEC_DROPOUT = 0.1, 0.1

In [3]:
SRC_PAD_IDX = TRG_PAD_IDX = 1
MIN_WORDS   = 5


def make_toy_batch(vocab_size, pad_idx, batch_size, seq_len, min_words):
    """Build a random, right-padded batch of token ids.

    Each row is laid out as: start token 0, `length` random tokens drawn from
    [2, vocab_size - 3], end token `vocab_size - 1`, then `pad_idx` up to
    `seq_len`.

    Args:
        vocab_size: vocabulary size; its last id is used as the end token.
        pad_idx: id written after the end token.
        batch_size: number of rows.
        seq_len: total row width, including start/end/pad tokens.
        min_words: smallest number of random tokens in a row.

    Returns:
        (tokens, lengths): a (batch_size, seq_len) LongTensor and the 1-D
        tensor holding the number of random tokens in each row.
    """
    # randint's upper bound is exclusive, so lengths fall in
    # [min_words, seq_len - 2]; seq_len - 2 is the longest content that still
    # leaves room for the start and end tokens.
    lengths = torch.randint(min_words, seq_len - 1, (batch_size,))
    # Make sure at least one row fills the whole width. The earlier check
    # tested for seq_len - 1, which randint can never return, so it always
    # overwrote the last row.
    if seq_len - 2 not in lengths:
        lengths[-1] = seq_len - 2

    tokens = torch.randint(2, vocab_size - 2, size=(batch_size, seq_len))
    tokens[:, 0] = 0
    for row, length in enumerate(lengths.tolist()):
        tokens[row, length + 1] = vocab_size - 1
        tokens[row, length + 2:] = pad_idx
    return tokens, lengths


x, src_seq_length = make_toy_batch(INPUT_DIM, SRC_PAD_IDX, BATCH_SIZE, SEQ_LEN, MIN_WORDS)
y, trg_seq_length = make_toy_batch(OUTPUT_DIM, TRG_PAD_IDX, BATCH_SIZE, SEQ_LEN, MIN_WORDS)

print(x, x.shape, end='\n\n')
print(y, y.shape)

tensor([[ 0, 25,  4, 25, 21, 25,  9,  6, 29,  1,  1,  1,  1,  1,  1],
        [ 0, 26,  7, 11,  7, 18, 11,  6, 12,  7, 29,  1,  1,  1,  1],
        [ 0,  8,  8, 25,  9, 24, 17, 14, 16, 24, 21, 10, 24, 29,  1],
        [ 0,  5, 25,  3, 17, 21, 17,  4, 25,  3, 15, 29,  1,  1,  1],
        [ 0, 10,  7, 17, 23,  7,  9, 13,  8,  8,  7,  9, 20, 16, 29]]) torch.Size([5, 15])

tensor([[ 0, 11, 17, 27,  3, 28, 28, 36,  1,  1,  1,  1,  1,  1,  1],
        [ 0, 13, 26, 30,  6, 17,  2, 11, 13, 19, 17, 32, 36,  1,  1],
        [ 0, 33, 24, 32,  7, 30, 36,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 0, 13, 19, 34, 21,  9, 27, 15, 36,  1,  1,  1,  1,  1,  1],
        [ 0, 28, 22, 15,  9, 20, 11, 27, 16,  7, 25,  7, 20, 22, 36]]) torch.Size([5, 15])


In [19]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Embeds a batch of token ids, runs them through a batch-first
    bidirectional GRU, and condenses the two final directional states into
    one vector of size `dec_hid_dim`.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        """
        Args:
            input_dim: number of rows in the embedding table.
            emb_dim: embedding width.
            enc_hid_dim: GRU hidden size per direction.
            dec_hid_dim: width of the returned summary vector.
            dropout: dropout probability applied to the embeddings.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim,
                          hidden_size=enc_hid_dim,
                          batch_first=True,
                          bidirectional=True)
        self.fc = nn.Linear(in_features=2 * enc_hid_dim,
                            out_features=dec_hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src`, a (batch, src_len) tensor of token ids.

        Returns:
            outputs: (batch, src_len, 2 * enc_hid_dim) per-step GRU outputs.
            hidden: (batch, dec_hid_dim) tanh-squashed projection of the
                concatenated final forward and backward states.
        """
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        outputs, final_states = self.rnn(token_vectors)
        # The last two entries are the final states of the two directions.
        second_last, last = final_states[-2], final_states[-1]
        summary = torch.cat([second_last, last], dim=1)
        return outputs, torch.tanh(self.fc(summary))

In [20]:
# Build the encoder from the shared configuration constants.
encoder = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=ENC_DROPOUT,
)

In [21]:
# Run the encoder on the toy source batch `x`; it returns the per-step
# outputs and the projected final hidden state.
encoder_outputs, hidden = encoder(x)

In [22]:
# Shapes of the encoder's two return values.
encoder_outputs.shape, hidden.shape

(torch.Size([5, 15, 128]), torch.Size([5, 64]))

In [29]:
# Scoring layers for additive attention.
# `attn` takes 2 * ENC_HID_DIM + DEC_HID_DIM input features and projects them
# to DEC_HID_DIM.
attn = nn.Linear(in_features=2 * ENC_HID_DIM + DEC_HID_DIM,
                 out_features=DEC_HID_DIM)
# `v` reduces a DEC_HID_DIM vector to a single bias-free scalar.
v = nn.Linear(in_features=DEC_HID_DIM, out_features=1, bias=False)

In [31]:
# The first two axes of the encoder outputs are the batch size and the source length.
batch_size, src_len = encoder_outputs.shape[:2]
batch_size, src_len

(5, 15)

In [34]:
# Tile the hidden state along a new length axis so there is one copy per
# source position: (batch, hid) -> (batch, src_len, hid).
# This rebinds `hidden`, so the cell is not idempotent: re-run the encoder
# cell before running it a second time.
hidden = hidden[:, None, :].repeat(1, src_len, 1)
hidden.shape

torch.Size([5, 15, 64])

In [36]:
# Encoder output shape, for comparison with the tiled hidden state before concatenation.
encoder_outputs.shape

torch.Size([5, 15, 128])

In [37]:
# Join the tiled hidden state and the encoder outputs along the feature axis.
attn_input = torch.cat([hidden, encoder_outputs], dim=-1)
attn_input.shape

torch.Size([5, 15, 192])

In [38]:
# Project the concatenated features and squash them with tanh.
projected = attn(attn_input)
energy = projected.tanh()
energy.shape

torch.Size([5, 15, 64])

In [39]:
# One unnormalised score per source position: `v` maps each energy vector to
# a scalar and the trailing singleton axis is dropped.
scores = v(energy)
attention = scores.squeeze(2)
attention

tensor([[-0.0539, -0.0065,  0.0062,  0.0372, -0.0093,  0.0023, -0.0123, -0.0547,
         -0.0239, -0.0520, -0.0222, -0.0278,  0.0077,  0.0225,  0.0639],
        [-0.1007, -0.0163,  0.0256, -0.0813,  0.0054, -0.0546, -0.0751, -0.0746,
         -0.0089,  0.0729,  0.0307, -0.0212, -0.0064,  0.0150,  0.0343],
        [-0.0608, -0.0555, -0.0218,  0.0031,  0.0006,  0.0082, -0.0632, -0.0951,
         -0.0731, -0.0216, -0.0165,  0.0403,  0.0202, -0.0054,  0.0102],
        [-0.0029,  0.1008,  0.0611,  0.0713,  0.0479,  0.0121,  0.0158,  0.0386,
          0.0447,  0.0643,  0.1064,  0.0596,  0.0050,  0.0535,  0.0291],
        [-0.0200,  0.0215,  0.0154, -0.0228, -0.0933, -0.0283,  0.0089, -0.0378,
         -0.0633, -0.0663, -0.0248,  0.0035,  0.0336, -0.0549, -0.0254]],
       grad_fn=<SqueezeBackward1>)

In [51]:
# Normalise the scores over the source positions so every row sums to 1.
# NOTE(review): padded positions are not masked out before the softmax.
annotation = torch.softmax(attention, dim=1)
annotation

tensor([[0.0637, 0.0667, 0.0676, 0.0697, 0.0666, 0.0673, 0.0664, 0.0636, 0.0656,
         0.0638, 0.0657, 0.0653, 0.0677, 0.0687, 0.0716],
        [0.0612, 0.0666, 0.0695, 0.0624, 0.0681, 0.0641, 0.0628, 0.0629, 0.0671,
         0.0729, 0.0698, 0.0663, 0.0673, 0.0688, 0.0701],
        [0.0641, 0.0644, 0.0666, 0.0683, 0.0681, 0.0687, 0.0639, 0.0619, 0.0633,
         0.0666, 0.0670, 0.0709, 0.0695, 0.0677, 0.0688],
        [0.0634, 0.0703, 0.0676, 0.0683, 0.0667, 0.0643, 0.0646, 0.0661, 0.0665,
         0.0678, 0.0707, 0.0675, 0.0639, 0.0671, 0.0654],
        [0.0669, 0.0697, 0.0693, 0.0667, 0.0621, 0.0663, 0.0688, 0.0657, 0.0640,
         0.0638, 0.0665, 0.0685, 0.0705, 0.0646, 0.0665]],
       grad_fn=<SoftmaxBackward>)

In [53]:
# Insert a singleton axis so the weights can be batch-multiplied with the
# encoder outputs: (batch, src_len) -> (batch, 1, src_len).
# This rebinds `annotation`, so re-running the cell adds another axis.
annotation = annotation[:, None, :]
annotation.shape

torch.Size([5, 1, 15])

In [54]:
# Attention-weighted sum of the encoder outputs.
# bmm shapes: (b, n, m) X (b, m, p) ==>> (b, n, p)
weighted = annotation.bmm(encoder_outputs)
weighted.shape

torch.Size([5, 1, 128])

In [57]:
# `embedded` was not defined by any earlier cell, so this step failed on a
# fresh kernel. Build it here by embedding the decoder's first input token
# (column 0 of `y`, the start token) as a length-1 sequence.
dec_embedding = nn.Embedding(OUTPUT_DIM, DEC_EMB_DIM)
dec_input = y[:, 0].unsqueeze(1)        # (batch, 1)
embedded = dec_embedding(dec_input)     # (batch, 1, DEC_EMB_DIM)

# Decoder RNN input: the token embedding joined with the attention context
# along the feature axis -> (batch, 1, DEC_EMB_DIM + 2 * ENC_HID_DIM).
rnn_input = torch.cat((embedded, weighted), dim=2)
rnn_input.shape

torch.Size([5, 1, 160])

- Additive Self-attention

In [58]:
# Show the source batch again; the self-attention cells below reuse it.
x

tensor([[ 0, 25,  4, 25, 21, 25,  9,  6, 29,  1,  1,  1,  1,  1,  1],
        [ 0, 26,  7, 11,  7, 18, 11,  6, 12,  7, 29,  1,  1,  1,  1],
        [ 0,  8,  8, 25,  9, 24, 17, 14, 16, 24, 21, 10, 24, 29,  1],
        [ 0,  5, 25,  3, 17, 21, 17,  4, 25,  3, 15, 29,  1,  1,  1],
        [ 0, 10,  7, 17, 23,  7,  9, 13,  8,  8,  7,  9, 20, 16, 29]])

In [64]:
# `device` and the model hyperparameters are already set in the configuration
# cell at the top of the notebook; repeating them here only risked the two
# copies drifting apart. The one new setting for this section is `r`, the
# number of attention "keywords" (different parts expected from the sentence).
r = 8

In [61]:
# Token-embedding table for the source vocabulary.
embed = nn.Embedding(
    num_embeddings=INPUT_DIM,    # vocab_size
    embedding_dim=ENC_EMB_DIM,   # embedding_size
)

In [63]:
# Single-layer, batch-first, bidirectional LSTM over the embedded tokens.
rnn = nn.LSTM(
    input_size=ENC_EMB_DIM,
    hidden_size=ENC_HID_DIM,
    num_layers=1,
    bidirectional=True,
    batch_first=True,
)

In [66]:
# Two bias-free projections for the self-attention scorer.
# First: num_directions * hidden_size -> attention_dimension.
attn = nn.Linear(in_features=ENC_HID_DIM * 2,
                 out_features=DEC_HID_DIM,
                 bias=False)
# Second: attention_dimension -> r keywords (different parts to be expected
# from the sentence).
attn2 = nn.Linear(in_features=DEC_HID_DIM,
                  out_features=r,
                  bias=False)

In [67]:
# Reusable activation modules; the softmax normalises over axis 2.
tanh, sigmoid = nn.Tanh(), nn.Sigmoid()
attn_dist = nn.Softmax(dim=2)

In [68]:
# Classifier head applied to the flattened sentence embedding.
FC_HID_DIM = 16   # hidden size of the fc head
FC_OUT_DIM = 2    # output size of the fc head

# The first layer used to be nn.Linear(r * ENC_HID_DIM, 2, 16): nn.Linear's
# third positional argument is `bias`, so that built a layer with 2 outputs
# that could not feed the 16-input layer below.
# NOTE(review): the LSTM is bidirectional, so a flattened (r, 2 * ENC_HID_DIM)
# embedding would need r * 2 * ENC_HID_DIM input features here. Confirm once
# the forward pass that produces the fc input is written.
fc = nn.Sequential(
    nn.Linear(r * ENC_HID_DIM, FC_HID_DIM),
    nn.ReLU(),
    nn.Linear(FC_HID_DIM, FC_OUT_DIM),
)

In [69]:
# Look up an embedding vector for every token id in the source batch.
embedded = embed(x)

In [71]:
# Shape of the embedded source batch.
embedded.shape

torch.Size([5, 15, 32])

In [73]:
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

In [74]:
# Zero initial hidden and cell states for the LSTM, shaped
# (num_layers * num_directions, batch, hidden_size).
# Plain tensors replace the deprecated `Variable` wrapper, and the batch size
# is read from `x` instead of the `batch_size` left over from the
# additive-attention section.
NUM_LAYERS = 1
NUM_DIRECTIONS = 2  # the LSTM is bidirectional
state_shape = (NUM_LAYERS * NUM_DIRECTIONS, x.size(0), ENC_HID_DIM)

hidden = torch.zeros(state_shape)
cell = torch.zeros(state_shape)

In [78]:
# Check the embedded batch shape again before packing it.
embedded.shape

torch.Size([5, 15, 32])

In [88]:
# Number of non-padding tokens in each row of the source batch.
src_lengths = (x != SRC_PAD_IDX).sum(dim=1)

# Pack the embedded tensor. Passing the `embed` module itself was what raised
# the TypeError. The rows of `x` are not ordered by decreasing length, so
# enforce_sorted=False lets PyTorch sort and unsort them internally.
packed = pack_padded_sequence(embedded,
                              src_lengths.tolist(),
                              batch_first=True,
                              enforce_sorted=False)
packed