In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pprint import pprint
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.sequence import pad_sequences
torch.manual_seed(1)

Using TensorFlow backend.


<torch._C.Generator at 0x109ff3bf0>

In [2]:
# Toy batch: four variable-length token-id sequences, zero-padded at the front
# to a common length and transposed so that rows are time steps and columns
# are batch entries.
batch_size = 4
sequence_len = 6
padded_sequences = pad_sequences(
    [[1, 2, 3, 4, 4], [3, 2, 1], [1, 2], [1, 1, 4, 3, 4, 2]],
    dtype=np.int64,
)
sample_batch = torch.tensor(padded_sequences).t()
sample_batch

tensor([[ 0,  0,  0,  1],
        [ 1,  0,  0,  1],
        [ 2,  0,  0,  4],
        [ 3,  3,  0,  3],
        [ 4,  2,  1,  4],
        [ 4,  1,  2,  2]])

In [3]:
# Lookup table mapping each of the 5 token ids to an 8-dimensional vector.
embedding_dim = 8
embedding = nn.Embedding(5, embedding_dim)
embedding

Embedding(5, 8)

In [4]:
# Embed every token id of the toy batch; the result is laid out as
# (sequence, batch, embedding).
embedded_input = embedding(sample_batch)
embedded_input.size()

torch.Size([6, 4, 8])

In [5]:
# Two GRU cells to be stacked: cell A consumes token embeddings, cell B's
# input size equals cell A's hidden size.
hidden_size = 16
layer_a_cell = nn.GRUCell(embedding_dim, hidden_size)
layer_b_cell = nn.GRUCell(hidden_size, hidden_size)

In [6]:
# Embedded inputs for the first two time steps, each (batch, embedding).
tok_1, tok_2 = embedded_input[0], embedded_input[1]

In [7]:
# First time step: both layers start from all-zero hidden states, and layer B
# is fed layer A's fresh output.
ha_0 = torch.zeros(batch_size, hidden_size)
hb_0 = torch.zeros(batch_size, hidden_size)
ha = layer_a_cell(tok_1, ha_0)
hb = layer_b_cell(ha, hb_0)

In [8]:
# Shape of the two layer states concatenated along the feature axis.
torch.cat((ha, hb), dim=1).size()

torch.Size([4, 32])

In [9]:
class SelfAttention(nn.Module):
    """Additive (tanh) attention over a stack of vectors.

    Each vector is projected with ``U_a`` (optionally shifted by a projected
    ``predicate`` via ``W_a``), squashed with tanh, and scored against ``v_a``.
    ``combine`` turns the scores into softmax weights over the stacking axis
    and returns the weighted sum of the vectors.

    The projection matrices are registered as ``nn.Parameter`` so they appear
    in ``parameters()`` / ``state_dict()``, follow ``.to(device)``, and are
    updated by an optimizer built from the module's parameters.

    Args:
        input_vector_size: size of the last axis of the vectors being attended.
        hidden_size: size of the internal attention space.
        input_context_size: size of the optional ``predicate`` vector; when
            ``None`` no ``W_a`` is created and any predicate is ignored.
    """

    def __init__(self, input_vector_size = 16, hidden_size = 16, input_context_size = None):
        super(SelfAttention, self).__init__()
        # Creation order (W_a, U_a, v_a) is kept so seeded initial values are unchanged.
        if input_context_size is not None:
            self.W_a = nn.Parameter(torch.randn(input_context_size, hidden_size))
        else:
            # Registered as an absent parameter: `self.W_a` still reads as None.
            self.register_parameter("W_a", None)
        self.U_a = nn.Parameter(torch.randn(input_vector_size, hidden_size))
        self.v_a = nn.Parameter(torch.randn(1, hidden_size))

    def forward(self, vectors, predicate = None):
        """Score every stacked vector.

        Args:
            vectors: tensor of shape (t, b, input_vector_size).
            predicate: optional context that must broadcast against the
                projected vectors after multiplication by ``W_a``; ignored
                when the module was built without ``input_context_size``.

        Returns:
            Unnormalised attention energies of shape (b, t).
        """
        projected = vectors @ self.U_a
        if self.W_a is not None and predicate is not None:
            projected = predicate @ self.W_a + projected
        hidden = torch.tanh(projected)
        # (t, b, hidden) @ (hidden, 1) -> (t, b, 1) -> (t, b)
        energies = (hidden @ self.v_a.t()).squeeze(-1)
        return energies.t()

    def combine(self, vectors, predicate = None):
        """Return the attention-weighted sum of ``vectors`` over the stacking axis.

        Args:
            vectors: tensor of shape (t, b, d).
            predicate: forwarded unchanged to ``forward``.

        Returns:
            Tensor of shape (b, d).
        """
        energies = self(vectors, predicate).t()  # back to (t, b)
        weights = F.softmax(energies, dim=0)  # normalise across the t stacked vectors
        return (weights.unsqueeze(-1) * vectors).sum(dim=0)

In [10]:
# Second time step. Layer A's carried state is an attention-weighted mix of
# both layers' previous outputs. Alternatives tried earlier: a plain sum
# (ha + hb), or an nn.Linear(2 * hidden_size, hidden_size) applied to
# torch.cat([ha, hb], dim=1).
att_hidden_size = 10
ha_attention = SelfAttention(input_vector_size=hidden_size, hidden_size=att_hidden_size)
ha_new = ha_attention.combine(torch.stack((ha, hb), dim=0))
ha = layer_a_cell(tok_2, ha_new)
hb = layer_b_cell(ha, hb)

In [11]:
class FBRNN(nn.Module):
    """Stacked GRU cells whose carried states are re-mixed by attention each step.

    At every time step the embedded token is pushed up through ``num_layers``
    GRU cells. The hidden state that layer ``i`` carries into the next step is
    then the attention-weighted combination of the fresh outputs of layers
    ``i`` and above.

    Args:
        embedding_dim: size of each token embedding.
        vocab_size: number of distinct token ids.
        hidden_size: hidden size shared by every GRU cell.
        num_layers: number of stacked GRU cells.
        attention_hidden_size: internal size of each per-layer attention.
    """

    def __init__(self, embedding_dim, vocab_size, hidden_size, num_layers, attention_hidden_size):
        super(FBRNN, self).__init__()
        # HyperParameters
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention_hidden_size = attention_hidden_size

        # Parameters
        self.embedding = nn.Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size)
        # Only the bottom cell reads embeddings; every other cell reads the cell below it.
        self.cells = nn.ModuleList([nn.GRUCell(input_size=embedding_dim, hidden_size=hidden_size)] +
                                   [nn.GRUCell(input_size=hidden_size, hidden_size=hidden_size) for _ in range(num_layers - 1)])
        self.hidden_attentions = nn.ModuleList([SelfAttention(input_vector_size=hidden_size, hidden_size=attention_hidden_size) for _ in range(num_layers)])

    def forward(self, batch):
        """Run the network over a batch of token-id sequences.

        Args:
            batch: integer tensor of shape (sequence, batch).

        Returns:
            A list with one tensor per time step, each the top layer's
            carried state of shape (batch, hidden_size).
        """
        embedded_input = self.embedding(batch)
        batch_size = batch.shape[1]
        # new_zeros keeps the initial states on the embeddings' device and dtype.
        h_s = [embedded_input.new_zeros((batch_size, self.hidden_size)) for _ in range(self.num_layers)]

        out_vecs = []
        # unbind(0) removes only the time axis, so a batch of size 1 keeps its
        # batch dimension (a bare squeeze() would drop it as well).
        for input_vec in embedded_input.unbind(0):
            # get all outputs (go up)
            new_h_s = []
            for h, cell in zip(h_s, self.cells):
                input_vec = cell(input_vec, h)
                new_h_s.append(input_vec)

            # compute new hidden states using attention (go right)
            h_s = [att.combine(torch.stack(new_h_s[i:])) for i, att in enumerate(self.hidden_attentions)]
            out_vecs.append(h_s[-1])
        return out_vecs

In [12]:
# Eight-layer FBRNN over the 5-token toy vocabulary.
model = FBRNN(
    embedding_dim=embedding_dim,
    vocab_size=5,
    hidden_size=hidden_size,
    num_layers=8,
    attention_hidden_size=att_hidden_size,
)

In [13]:
# Stack the per-step outputs into one tensor and show its shape next to the input's.
torch.stack(model(sample_batch), dim=0).size(), sample_batch.size()

(torch.Size([6, 4, 16]), torch.Size([6, 4]))

In [14]:
# Training-loop sketch.
# NOTE(review): `epochs`, `train_generator`, `criterion`, `targets` and `opt`
# are not defined in the cells visible here, so this cell stops with the
# NameError recorded below. Define them before running it.
for epoch in range(epochs):
    for batch in train_generator:
        model.zero_grad()  # clear gradients left over from the previous step
        outputs = model(batch)  # FBRNN.forward returns a list with one tensor per time step
        # NOTE(review): `targets` is never derived from `batch`, and `outputs` is a
        # list rather than a single tensor - confirm what `criterion` expects.
        loss = criterion(outputs, targets)
        loss.backward()
        opt.step()

NameError: name 'epochs' is not defined