## Preparing

In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Add the parent directory to the module search path for the project imports
# that follow. Guarded so that re-running this cell does not keep appending
# duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
from recibrew.nn.gru_bahdanau import Encoder

In [None]:
from recibrew.data_util import construct_torchtext_iterator

# Locations of the pre-processed train / dev / test splits.
train_csv, dev_csv, test_csv = (
    '../data/processed/train.csv',
    '../data/processed/dev.csv',
    '../data/processed/test.csv',
)

# Build the torchtext objects on CPU. The result is indexed by string key in
# later cells (e.g. 'train_iter', 'src_field').
constructed_ttext = construct_torchtext_iterator(
    train_csv,
    dev_csv,
    test_csv,
    device='cpu',
    fix_length=None,
)

In [5]:
# Iterator over training batches; each batch exposes `.src` and `.tgt`.
train_iter = constructed_ttext['train_iter']

In [6]:
# Source-side field; the size of its vocabulary is reused below wherever a
# vocabulary size is needed (embedding table, output projection).
# NOTE(review): this is the *source* vocabulary, yet it is later also used
# for target tokens and the decoder output size — confirm that source and
# target share one vocabulary.
src_field = constructed_ttext['src_field']
max_vocab = len(src_field.vocab)


In [7]:
# Pull a single batch to experiment with and split it into its two sides.
btch = next(iter(train_iter))
src = btch.src
tgt = btch.tgt

In [8]:
# Width of the attention projection layers built later in this section.
hidden_dim = 100

In [34]:
# NOTE(review): the saved output of this cell is a TypeError raised from a
# super() call. That is a commonly seen side effect of %autoreload
# re-defining a class and may disappear after a kernel restart — confirm.
# NOTE(review): later cells construct Encoder with a hidden size as the first
# argument and feed it already-embedded input, so this call may target an
# older signature — verify against the current Encoder.
enc_gru = Encoder(max_vocab, dropout=0.1)

TypeError: super(type, obj): obj must be an instance or subtype of type

In [12]:
enc_gru(src)[0].shape

torch.Size([12, 64, 100])

In [13]:
enc_gru(src)[1].shape

torch.Size([1, 64, 100])

## Bahdanau Attention

In [14]:
import torch

In [15]:
# Input hidden = [1, bs, hidden_size]
# Input enc_out = [seq_len, bs, hidden_size]

In [16]:
# Learnable pieces of the additive attention score, which is computed below
# as V(tanh(W1(query) + W2(values))).
W1 = torch.nn.Linear(hidden_dim, hidden_dim)  # projects the query
W2 = torch.nn.Linear(hidden_dim, hidden_dim)  # projects the values
V = torch.nn.Linear(hidden_dim, 1)  # maps each combined vector to one scalar score


In [21]:
# Run the encoder once and unpack both results. Indexing two separate calls
# (enc_gru(src)[1] and enc_gru(src)[0]) executed the forward pass twice, and
# with dropout enabled the two passes are not guaranteed to agree.
#   values : encoder outputs     — [seq_len, bs, hidden_size]
#   query  : final hidden state  — [1, bs, hidden_size]
values, query = enc_gru(src)

In [25]:
# Additive (Bahdanau-style) attention score. W1(query) is [1, bs, hidden]
# and W2(values) is [seq_len, bs, hidden], so the sum broadcasts the query
# over every time step; V then reduces the last dimension to size 1.
score = V(torch.tanh(
        W1(query) + W2(values)))


In [28]:
# score = [seqlen, bs, 1]

In [30]:
# Normalise the scores over the sequence dimension (dim 0) so that, for each
# batch element, the weights across time steps sum to one.
attention_weights = torch.softmax(score, dim=0)

In [None]:
# att weight score : [seqlen, bs, 1]

In [31]:
# Scale every encoder output by its attention weight; the [seq_len, bs, 1]
# weights broadcast across the hidden dimension of `values`.
context_vector = attention_weights * values

In [35]:
# Collapse the sequence dimension: the weighted encoder outputs are summed
# over time, leaving one context vector per batch element.
context_vector = context_vector.sum(dim=0)


In [37]:
context_vector.shape

torch.Size([64, 100])

In [None]:
# context_vector_shape : [bs, hidden_dim]


In [18]:
from recibrew.nn.gru_bahdanau import BahdanauAttention

In [19]:
# NOTE(review): the literal 100 presumably mirrors `hidden_dim` defined
# earlier — confirm and consider passing the variable instead.
b_att = BahdanauAttention(100)

In [24]:
b_att(query, values)

tensor([[ 0.1402, -0.0971, -0.0019,  ..., -0.1482,  0.0383,  0.1337],
        [ 0.0377, -0.0620, -0.1322,  ...,  0.0523,  0.0596,  0.0571],
        [-0.1380,  0.1144, -0.0734,  ...,  0.0583, -0.0331,  0.1092],
        ...,
        [ 0.0687, -0.0148,  0.0182,  ..., -0.0234,  0.0804,  0.1758],
        [ 0.0045, -0.0015, -0.1421,  ..., -0.0637,  0.1635, -0.0751],
        [ 0.1869,  0.0188, -0.1835,  ..., -0.1759,  0.0602,  0.1225]],
       grad_fn=<SumBackward1>)

## Decoder Time

In [112]:
import torch
from recibrew.nn.gru_bahdanau import Encoder

In [113]:
# NOTE(review): despite its name, `num_embeddings` is used below as the
# embedding *dimension* (it is passed as `embedding_dim`), not as the number
# of embeddings; the last section of the notebook calls the same quantity
# `num_embedding`.
num_embeddings=100
hidden_dim = 100

In [114]:
# Embedding table with one row per vocabulary entry. The same table is used
# for source tokens and for target tokens in the cells that follow.
embedding = torch.nn.Embedding(num_embeddings=max_vocab, embedding_dim=num_embeddings)

In [115]:
src_embedded = embedding(src)

In [116]:
encoder = Encoder(hidden_dim, dropout=0.1, enc_gru_layers=2)

In [117]:
enc_out, hidden = encoder(src_embedded)

In [118]:
from recibrew.nn.lstm_bahdanau import Decoder, BahdanauAttention

In [119]:
# NOTE(review): `num_embeddings` (100) is passed as the first argument, while
# the earlier `BahdanauAttention(100)` call sits next to code that uses
# `hidden_dim` for the attention layers. Both values are 100 in this
# notebook, so it runs either way — confirm which quantity is intended.
attention = BahdanauAttention(num_embeddings, enc_gru_layers=2, bidirectional=True)

In [120]:
# Join the slices along dim 0 of `hidden` side by side on the feature
# dimension (presumably [n_states, bs, hidden] -> [bs, n_states * hidden];
# confirm against the encoder). Iterating a tensor yields exactly those
# dim-0 slices, so tuple(hidden) replaces the explicit comprehension.
hidden = torch.cat(tuple(hidden), dim=1)

In [121]:
# Add a leading dimension of size 1 in front of the concatenated state.
# NOTE: this rebinds `hidden`, so re-running this cell on its own adds yet
# another leading dimension.
hidden = hidden.unsqueeze(0)

In [122]:
# hidden  : [1, bs, hidden_size * num_directions * gru_layers]
# enc_out : [seq_len, bs, hidden_size * num_directions]
# (num_directions is 2 for a bidirectional encoder, otherwise 1)

In [124]:
# One decoding step of input for every batch element: a [1, 64] int64 tensor
# filled with token id 2.
# NOTE(review): 64 presumably matches the batch size and 2 the
# start-of-sequence id — confirm both.
tgt_input = torch.full((1, 64), 2, dtype=torch.long)

In [125]:
tgt_input.shape

torch.Size([1, 64])

In [126]:
tgt_input_embedded = embedding(tgt_input)

In [127]:
# Attend over the encoder outputs using the concatenated hidden state built
# in the preceding cells. The original passed `concatenate_hidden`, a name
# that is not defined anywhere in the visible notebook (so the cell fails on
# a fresh kernel); `hidden` is the reshaped tensor those cells produce.
# The module is also invoked as a callable rather than through .forward(),
# so any registered hooks are run.
context_vector, attention_weights = attention(hidden, enc_out)

In [128]:
context_vector.shape

torch.Size([64, 200])

In [139]:
# Give the context vector a leading step dimension ([bs, H] -> [1, bs, H])
# and join it with the embedded target token along the feature dimension.
x = torch.cat((context_vector.unsqueeze(0), tgt_input_embedded), dim=2)

In [141]:
x.shape

torch.Size([1, 64, 300])

In [None]:
# x shape : [ seq_len, bs, num_embedding + hidden_encoder_unit (* 2 according to bidirectional)]

In [136]:
# Settings for the stand-alone decoder GRU built in the next cell.
bidirectional=True  # widens the GRU input: the context vector is hidden_dim * 2 when True
dropout=0.1
gru_layers=1

In [137]:
# Decoder GRU. Its input is the embedded token concatenated with the context
# vector, whose width doubles when `bidirectional` is set.
# nn.GRU applies dropout only between stacked layers, so with a single layer
# a non-zero value has no effect and PyTorch emits a warning; pass 0 then.
gru = torch.nn.GRU(
    num_embeddings + (hidden_dim * 2 if bidirectional else hidden_dim),
    hidden_dim,
    num_layers=gru_layers,
    dropout=dropout if gru_layers > 1 else 0.0,
)



In [142]:
output, state = gru(x)

In [149]:
# output : [seq_len, bs, hidden_dim]

In [146]:
linear = torch.nn.Linear(hidden_dim, max_vocab)

In [154]:
out_linear = linear(output)

In [160]:
# out_linear shape = [ seq_len, bs, vocab_size ]

In [159]:
# Flatten the leading dimensions so that every row holds the vocabulary
# logits of one (time step, batch element) pair.
# shape[-1] is used instead of shape[2] so the cell stays re-runnable: once
# `out_linear` has been flattened to 2-D, index 2 no longer exists.
out_linear = out_linear.view(-1, out_linear.shape[-1])

In [162]:
# out_linear shape : [seq_len * bs, vocab_size]

## Trying decoder

In [267]:
from recibrew.nn.gru_bahdanau import Decoder, Encoder

In [268]:
# Fetch one batch from the training iterator and keep both sides of it.
btch = next(iter(train_iter))
src = btch.src
tgt = btch.tgt

In [269]:
# Hyper-parameters for the encoder / decoder pair built in this section.
num_embedding = 100  # used below as the embedding dimension and as the Decoder's first argument
hidden_dim = 100
enc_gru_layers = 2
enc_bidirectional = True
dropout=0.1
vocab_size = max_vocab  # NOTE(review): size of the *source* vocabulary — confirm it also covers the target side

In [270]:
encoder = Encoder(hidden_dim, dropout=dropout, enc_gru_layers=enc_gru_layers, enc_bidirectional=enc_bidirectional)

In [271]:
# Decoder configured to match the encoder's layer count and directionality.
# NOTE(review): the `dropout` value defined above is passed to the Encoder
# but not here — confirm whether the Decoder should receive it as well.
dcd = Decoder(num_embedding, hidden_dim, vocab_size = vocab_size, enc_gru_layers=enc_gru_layers, enc_bidirectional=enc_bidirectional)

In [272]:
embedding = torch.nn.Embedding(num_embeddings=max_vocab, embedding_dim=num_embedding)

In [273]:
src_embedded = embedding(src)

In [274]:
enc_out, hidden = encoder(src_embedded)

In [285]:
# Take the row at index 3 of the target batch and restore a leading dimension
# of size 1, giving a single-step decoder input.
# NOTE(review): index 3 looks like an arbitrary step chosen for
# experimentation — confirm.
tgt_input = tgt[3,:].unsqueeze(0)

In [296]:
# Shift the target by one step for teacher forcing: in the loop below the
# decoder is fed tgt_inputs[i] and scored against tgt_targets[i], i.e. the
# token that follows it.
tgt_targets = tgt[1:, :]
tgt_inputs = tgt[:-1, :]

In [320]:
# Targets equal to 1 are excluded from the loss.
# NOTE(review): 1 is presumably the padding index of the vocabulary —
# confirm against the field's vocab.
criterion = torch.nn.CrossEntropyLoss(ignore_index=1)

In [321]:
# Teacher-forced pass over the target sequence, one time step at a time.
# Modules are invoked as callables (dcd(...), criterion(...)) instead of via
# .forward(), so registered hooks are not skipped.
loss = 0
counter = 0
for i in range(tgt_targets.shape[0]):
    tgt_input = tgt_inputs[i:i+1, :]  # slice (not index) to keep the step dimension
    tgt_gold = tgt_targets[i, :]
    tgt_embedded = embedding(tgt_input)
    # The decoder hands back its updated hidden state, which feeds the next
    # step; the third return value is not needed here.
    pred, hidden, _ = dcd(tgt_embedded, hidden, enc_out)
    loss += criterion(pred, tgt_gold)
    counter += 1

In [322]:
# Average the summed per-step losses over the number of decoding steps.
# NOTE: this rebinds `loss`, so re-running this cell on its own divides the
# already-averaged value again.
loss = loss / counter

tensor(8.0194, grad_fn=<DivBackward0>)

In [309]:
tgt_gold.shape

torch.Size([64])

In [310]:
pred.shape

torch.Size([64, 3004])

In [286]:
tgt_embedded = embedding(tgt_input)

In [287]:
# Single decoder step. The module is called directly rather than through
# .forward(), so any registered hooks are run.
out_linear, state, attention_weights = dcd(tgt_embedded, hidden, enc_out)

In [288]:
hidden = state