In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.nn import Parameter
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch import nn
from torch.nn import functional as F

import numpy as np
import itertools
from tqdm import tqdm

import argparse

In [2]:
keys = torch.rand(5, 256, 512*2)
projected_k = nn.Conv1d(512*2, 512, 1, 1)
projected_k(keys.permute(1, 2, 0)).shape

torch.Size([256, 512, 5])

In [3]:
# Convert the predicted probabilities to a tensor of shape `(batch_size, num_classes)`.
predicted_probabilities = torch.tensor([[0.1, 0.2, 0.3, 0.4, 0.5],
                                      [0.2, 0.3, 0.4, 0.5, 0.6]])

# Take the index of the maximum probability in each row of the predicted probabilities tensor.
true_labels = torch.argmax(predicted_probabilities, dim=1)

# Convert the true labels to a tensor of shape `(batch_size)`.
true_labels = true_labels.unsqueeze(1)

In [4]:
# Create a `torch.nn.NLLLoss` object.
loss_fn = torch.nn.NLLLoss()

# Convert the predicted probabilities to a tensor of shape `(batch_size, num_classes)`.
predicted_probabilities = torch.tensor([[0.1, 0.2, 0.3, 0.4, 0.5],
                                      [0.2, 0.3, 0.4, 0.5, 0.6]])

# Convert the true labels to a tensor of shape `(batch_size)`.
true_labels = torch.tensor([1, 3])

# Calculate the loss using the `torch.nn.NLLLoss` object.
loss = loss_fn(predicted_probabilities, true_labels)

In [5]:
predicted_probabilities.shape

torch.Size([2, 5])

In [6]:
parser = argparse.ArgumentParser(description="Pytorch implementation of Pointer-Net")

# Data
parser.add_argument('--train_size', default=10000, type=int, help='Training data size')
parser.add_argument('--val_size', default=1000, type=int, help='Validation data size')
parser.add_argument('--test_size', default=1000, type=int, help='Test data size')
parser.add_argument('--batch_size', default=256, type=int, help='Batch size')
# Train
parser.add_argument('--nof_epoch', default=5, type=int, help='Number of epochs')
parser.add_argument('--lr', type=float, default=0.00001, help='Learning rate')
# GPU
parser.add_argument('--gpu', default=True, action='store_true', help='Enable gpu')
# TSP
parser.add_argument('--nof_points', type=int, default=5, help='Number of points in TSP')
# Network
parser.add_argument('--embedding_size', type=int, default=128, help='Embedding size')
parser.add_argument('--hidden_size', type=int, default=512, help='Number of hidden units')
parser.add_argument('--nof_lstms', type=int, default=2, help='Number of LSTM layers')
parser.add_argument('--dropout', type=float, default=0., help='Dropout value')
parser.add_argument('--bidir', default=True, action='store_true', help='Bidirectional')

params, unknown = parser.parse_known_args()

**DATA GENERATOR**

In [7]:
def tsp_opt(points):
    """
    Dynamic programing solution for TSP - O(2^n*n^2)
    https://gist.github.com/mlalevic/6222750

    :param points: List of (x, y) points
    :return: Optimal solution
    """

    def length(x_coord, y_coord):
        return np.linalg.norm(np.asarray(x_coord) - np.asarray(y_coord))

    # Calculate all lengths
    all_distances = [[length(x, y) for y in points] for x in points]
    # Initial value - just distance from 0 to every other point + keep the track of edges
    A = {(frozenset([0, idx+1]), idx+1): (dist, [0, idx+1]) for idx, dist in enumerate(all_distances[0][1:])}
    cnt = len(points)
    for m in range(2, cnt):
        B = {}
        for S in [frozenset(C) | {0} for C in itertools.combinations(range(1, cnt), m)]:
            for j in S - {0}:
                # This will use 0th index of tuple for ordering, the same as if key=itemgetter(0) used
                B[(S, j)] = min([(A[(S-{j}, k)][0] + all_distances[k][j], A[(S-{j}, k)][1] + [j])
                                 for k in S if k != 0 and k != j])
        A = B
    res = min([(A[d][0] + all_distances[0][d[1]], A[d][1]) for d in iter(A)])
    return np.asarray(res[1])

In [8]:
def length(x_coord, y_coord):
    return np.linalg.norm(np.asarray(x_coord) - np.asarray(y_coord))

points = [(0,3), (4,0), (2,2), (1,1)]
all_distances = [[length(x, y) for y in points] for x in points]
print('all_distances -', all_distances)

for idx, dist, in enumerate(all_distances[0][1:]):
    print('idx, dist -', (idx, dist))

A = {(frozenset([0, idx+1]), idx+1): (dist, [0, idx+1]) for idx, dist in enumerate(all_distances[0][1:])}
print('A -', A)

cnt = len(points)
for m in range(2, cnt):
    B = {}
    for S in [frozenset(C) | {0} for C in itertools.combinations(range(1, cnt), m)]:
        print('S -', S)
        for j in S - {0}:
            B[(S, j)] = min([(A[(S-{j}, k)][0] + all_distances[k][j], A[(S-{j}, k)][1] + [j]) for k in S if k != 0 and k != j])
            print(B[(S, j)])
    A = B
res = min([(A[d][0] + all_distances[0][d[1]], A[d][1]) for d in iter(A)])
print('result -', np.asarray(res[1]))

all_distances - [[0.0, 5.0, 2.23606797749979, 2.23606797749979], [5.0, 0.0, 2.8284271247461903, 3.1622776601683795], [2.23606797749979, 2.8284271247461903, 0.0, 1.4142135623730951], [2.23606797749979, 3.1622776601683795, 1.4142135623730951, 0.0]]
idx, dist - (0, 5.0)
idx, dist - (1, 2.23606797749979)
idx, dist - (2, 2.23606797749979)
A - {(frozenset({0, 1}), 1): (5.0, [0, 1]), (frozenset({0, 2}), 2): (2.23606797749979, [0, 2]), (frozenset({0, 3}), 3): (2.23606797749979, [0, 3])}
S - frozenset({0, 1, 2})
(5.06449510224598, [0, 2, 1])
(7.82842712474619, [0, 1, 2])
S - frozenset({0, 1, 3})
(5.39834563766817, [0, 3, 1])
(8.16227766016838, [0, 1, 3])
S - frozenset({0, 2, 3})
(3.6502815398728847, [0, 3, 2])
(3.6502815398728847, [0, 2, 3])
S - frozenset({0, 1, 2, 3})
(6.4787086646190755, [0, 3, 2, 1])
(8.22677276241436, [0, 3, 1, 2])
(8.22677276241436, [0, 2, 1, 3])
result - [0 2 1 3]


In [9]:
max_len = 10
X = [np.random.RandomState(seed = i).random() for i in range (0, max_len)]
Y = [np.random.RandomState(seed = i).random() for i in range (10000-max_len, 10000)]
points = [(x, y) for x, y in zip(X, Y)]
points

[(0.5488135039273248, 0.3450895789457291),
 (0.417022004702574, 0.9512867249982255),
 (0.43599490214200376, 0.7226542571564433),
 (0.5507979025745755, 0.7452819903836411),
 (0.9670298390136767, 0.17371858915349359),
 (0.22199317108973948, 0.43601566470464426),
 (0.8928601514360016, 0.6696328241771722),
 (0.07630828937395717, 0.04175597567215328),
 (0.8734294027918162, 0.19354372191863545),
 (0.010374153885699955, 0.8233890742543671)]

In [10]:
class TSPDataset(Dataset):
    """
    Random TSP dataset

    """

    def __init__(self, data_size, seq_len, solver=tsp_opt, solve=True):
        self.data_size = data_size
        self.seq_len = seq_len
        self.solve = solve
        self.solver = solver
        self.data = self._generate_data()

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        tensor = torch.from_numpy(self.data['Points_List'][idx]).float()
        solution = torch.from_numpy(self.data['Solutions'][idx]).long() if self.solve else None

        sample = {'Points':tensor, 'Solution':solution}

        return sample

    def _generate_data(self):
        """
        :return: Set of points_list ans their One-Hot vector solutions
        """
        points_list = []
        solutions = []
        data_iter = tqdm(range(self.data_size), unit='data')
        for i, _ in enumerate(data_iter):
            data_iter.set_description('Data points %i/%i' % (i+1, self.data_size))
            points_list.append(np.random.random((self.seq_len, 2)))
        solutions_iter = tqdm(points_list, unit='solve')
        if self.solve:
            for i, points in enumerate(solutions_iter):
                solutions_iter.set_description('Solved %i/%i' % (i+1, len(points_list)))
                solutions.append(self.solver(points))
        else:
            solutions = None

        return {'Points_List':points_list, 'Solutions':solutions}

    def _to1hotvec(self, points):
        """
        :param points: List of integers representing the points indexes
        :return: Matrix of One-Hot vectors
        """
        vec = np.zeros((len(points), self.seq_len))
        for i, v in enumerate(vec):
            v[points[i]] = 1

        return vec

In [11]:
dataset = TSPDataset(data_size = 1000, seq_len = 5)
dataset[0]

Data points 1000/1000: 100%|██████████| 1000/1000 [00:00<00:00, 2022.51data/s]
Solved 1000/1000: 100%|██████████| 1000/1000 [00:00<00:00, 1404.98solve/s]


{'Points': tensor([[0.5211, 0.1678],
         [0.8013, 0.2398],
         [0.8029, 0.0372],
         [0.0552, 0.8748],
         [0.6464, 0.6626]]),
 'Solution': tensor([0, 2, 1, 4, 3])}

In [12]:
dataloader = DataLoader(
    dataset,
    batch_size = 64,
    shuffle = True,
    num_workers = 0
)

In [13]:
len(dataloader.dataset)

1000

In [14]:
iterator = tqdm(dataloader, unit='Batch')
for i, batched in enumerate(iterator):
    print(batched['Points'].shape)

100%|██████████| 16/16 [00:00<00:00, 881.32Batch/s]

torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([64, 5, 2])
torch.Size([40, 5, 2])





**ENCODER**

In [15]:
params

Namespace(train_size=10000, val_size=1000, test_size=1000, batch_size=256, nof_epoch=5, lr=1e-05, gpu=True, nof_points=5, embedding_size=128, hidden_size=512, nof_lstms=2, dropout=0.0, bidir=True)

In [16]:
x = torch.rand(64, 5, 2)
embedding = nn.Linear(2, params.embedding_size)
x_embed = embedding(x).permute(1, 0, 2)
print('x embed -', x_embed.shape)

lstm = nn.LSTM(
    input_size = params.embedding_size,
    hidden_size = params.hidden_size,
    num_layers = params.nof_lstms,
    bias = False,
    batch_first = False,
    dropout = params.dropout,
    bidirectional = params.bidir
)

h0 = Parameter(torch.zeros(1), requires_grad=False).unsqueeze(0).unsqueeze(0).repeat(
    2*params.nof_lstms, 64, params.hidden_size
)
c0 = Parameter(torch.zeros(1), requires_grad=False).unsqueeze(0).unsqueeze(0).repeat(
    2*params.nof_lstms, 64, params.hidden_size
)
print('h0-', h0.shape)

en_o, (en_h, en_c) = lstm(x_embed, (h0, c0))
print('en_o -', en_o.shape)
print('en_h -', en_h.shape)
print('en_c -', en_c.shape)

x embed - torch.Size([5, 64, 128])
h0- torch.Size([4, 64, 512])
en_o - torch.Size([5, 64, 1024])
en_h - torch.Size([4, 64, 512])
en_c - torch.Size([4, 64, 512])


In [17]:
class Encoder(nn.Module):
    """
    Encoder class for Pointer-Net
    """

    def __init__(
        self, embedding_dim,
        hidden_dim,
        n_layers,
        dropout,
        bidir
    ):
        """
        Initiate Encoder

        :param Tensor embedding_dim: Number of embbeding channels
        :param int hidden_dim: Number of hidden units for the LSTM
        :param int n_layers: Number of layers for LSTMs
        :param float dropout: Float between 0-1
        :param bool bidir: Bidirectional
        """

        super(Encoder, self).__init__()
        self.n_layers = n_layers*2 if bidir else n_layers
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = n_layers,
            bias = False,
            batch_first = False,
            dropout = dropout,
            bidirectional = bidir
        )

        # Used for propagating .cuda() command
        self.h0 = Parameter(torch.zeros(1), requires_grad = False)
        self.c0 = Parameter(torch.zeros(1), requires_grad = False)

    def forward(self, embedded_inputs, hidden):
        """
        Encoder - Forward-pass

        :param Tensor embedded_inputs: Embedded inputs of Pointer-Net
        :param Tensor hidden: Initiated hidden units for the LSTMs (h, c)
        :return: LSTMs outputs and hidden units (h, c)
        """
        embedded_inputs = embedded_inputs.permute(1, 0, 2)

        outputs, hidden = self.lstm(embedded_inputs, hidden)

        return outputs.permute(1, 0, 2), hidden

    def init_hidden(self, embedded_inputs):
        """
        Initiate hidden units

        :param Tensor embedded_inputs: The embedded input of Pointer-NEt
        :return: Initiated hidden units for the LSTMs (h, c)
        """

        batch_size = embedded_inputs.shape[0]

        # Reshaping (Expanding)
        h0 = self.h0.unsqueeze(0).unsqueeze(0).repeat(self.n_layers, batch_size, self.hidden_dim)
        c0 = self.h0.unsqueeze(0).unsqueeze(0).repeat(self.n_layers, batch_size, self.hidden_dim)

        return h0, c0

In [18]:
x = torch.rand(256, 5, 2)

In [19]:
x_embed = nn.Linear(2, params.embedding_size)(x)

encoder = Encoder(
    embedding_dim = params.embedding_size,
    hidden_dim = params.hidden_size,
    n_layers = params.nof_lstms,
    dropout = params.dropout,
    bidir = params.bidir
)

en_o, (en_h, en_c) = encoder(x_embed, encoder.init_hidden(x_embed))
print('en_o -', en_o.shape)
print('en_h -', en_h.shape)
print('en_c -', en_c.shape)

en_o - torch.Size([256, 5, 1024])
en_h - torch.Size([4, 256, 512])
en_c - torch.Size([4, 256, 512])


**ATTENTION**

In [20]:
class Attention(nn.Module):
    """
    Attention model for Pointer-Net
    """

    def __init__(self, q_dim, k_dim, hidden_dim):
        """
        Initiate Attention

        :param int input_dim: Input's diamention
        :param int hidden_dim: Number of hidden units in the attention
        """

        super(Attention, self).__init__()

        self.q_dim = q_dim
        self.k_dim = k_dim
        self.hidden_dim = hidden_dim

        self.project_queries = nn.Linear(q_dim, hidden_dim, bias = False)
        self.project_keys = nn.Linear(k_dim, hidden_dim, bias = False)
        self.V = nn.Linear(hidden_dim, 1, bias = False)
        self._inf = Parameter(torch.FloatTensor([float('-inf')]), requires_grad = False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, queries, keys, mask):
        """
        Attention - Forward-pass

        :param Tensor queries: hidden state of decoder from step function in Decoder object (batch, hidden_dim)
        :param Tensor keys: encoder outputs, or context (seq_len, batch, 2*hidden_dim)
        :param ByteTensor mask: Selection mask
        :return: tuple of - (Attentioned hidden state, Alphas)
        """
        # Initialize -inf mask
        self.inf = self._inf.unsqueeze(1).expand(mask.size())

        # (batch, hidden_dim)
        projected_q = self.project_queries(queries)
        # (seq_len, batch, hidden_dim)
        projected_k = self.project_keys(keys)
        
        # (batch, seq_len)
        attention_pointer = self.V(self.tanh(projected_q + projected_k)).squeeze(-1).permute(-1, 0)
        # if len(attention_pointer[mask]) > 0:
        #     attention_pointer[mask] = self.inf[mask]
        alpha = self.softmax(attention_pointer)

        hidden_state = torch.bmm(projected_k.permute(1, 2, 0), alpha.unsqueeze(2)).squeeze(2)

        return hidden_state, alpha

In [21]:
A = Attention(
    q_dim = 512, 
    k_dim = 512*2,
    hidden_dim = 512
)

mask = Parameter(torch.ones(1), requires_grad=False).repeat(5).unsqueeze(0).repeat(256, 1)

att_de_h, alpha = A(
    queries = torch.rand(256, 512),
    keys = torch.rand(5, 256, 512*2),
    mask = torch.eq(mask, 0).type(torch.bool)
)
print('att_de_h -', att_de_h.shape)
print('alpha -', alpha.shape)
print(alpha[0])

att_de_h - torch.Size([256, 512])
alpha - torch.Size([256, 5])
tensor([0.1857, 0.1927, 0.2253, 0.1916, 0.2047], grad_fn=<SelectBackward0>)


**DECODER**

In [22]:
class Decoder(nn.Module):
    """
    Decoder model for Pointer-Net
    """

    def __init__(self, embedding_dim,
                 hidden_dim):
        """
        Initiate Decoder

        :param int embedding_dim: Number of embeddings in Pointer-Net
        :param int hidden_dim: Number of hidden units for the decoder's RNN
        """

        super(Decoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.input_to_hidden = nn.Linear(embedding_dim, 4 * hidden_dim)
        self.hidden_to_hidden = nn.Linear(hidden_dim, 4 * hidden_dim)
        self.hidden_out = nn.Linear(hidden_dim * 2, hidden_dim)
        self.att = Attention(q_dim = hidden_dim, k_dim = 2 * hidden_dim, hidden_dim = hidden_dim)

        # Used for propagating .cuda() command
        self.mask = Parameter(torch.ones(1), requires_grad = False)
        self.runner = Parameter(torch.zeros(1), requires_grad = False)
        self.index0 = Parameter(torch.zeros(1), requires_grad = False)

    def forward(
        self, embedded_inputs,
        decoder_input,
        hidden,
        context, 
        beam_size
    ):
        """
        Decoder - Forward-pass

        :param Tensor embedded_inputs: Embedded inputs of Pointer-Net
        :param Tensor decoder_input: First decoder's input
        :param Tensor hidden: First decoder's hidden states
        :param Tensor context: Encoder's outputs
        :return: (Output probabilities, Pointers indices), last hidden state
        """
        batch_size = embedded_inputs.size(0)
        input_length = embedded_inputs.size(1)
        assert (beam_size <= input_length-1) & (beam_size >= 1), f"Current beam size is {beam_size} while input_length is {input_length}."

        # (batch, seq_len)
        mask = self.mask.repeat(input_length).unsqueeze(0).repeat(batch_size, 1)

        # Generating arang(input_length), broadcasted across batch_size
        runner = self.runner.repeat(input_length)
        for i in range(input_length):
            runner.data[i] = i
        runner = runner.unsqueeze(0).expand(batch_size, -1).long()

        # Initilize the first iteration of index, which is 0 since we want to start at that index
        index0 = self.index0.repeat(batch_size)

        def step(x, hidden, mask_step):
            """
            Recurrence step function

            :param Tensor x: Input at time t
            :param tuple(Tensor, Tensor) hidden: Hidden states at time t-1
            :return: Hidden states at time t (h, c), Attention probabilities (Alpha)
            """
            # Regular LSTM
            h, c = hidden

            gates = self.input_to_hidden(x) + self.hidden_to_hidden(h)
            input, forget, cell, out = gates.chunk(4, -1)

            input = F.sigmoid(input)
            forget = F.sigmoid(forget)
            cell = F.tanh(cell)
            out = F.sigmoid(out)

            c_t = (forget * c) + (input * cell)
            h_t = out * F.tanh(c_t)

            # Attention section
            hidden_t, output = self.att(h_t, context, torch.eq(mask_step, 0))
            hidden_t = F.tanh(self.hidden_out(torch.cat((hidden_t, h_t), 1)))

            return hidden_t, c_t, output

        def masking(index, mask, runner):
            one_hot_pointers = (runner == index.unsqueeze(1).expand(-1, input_length)).float()
            # Update mask to ignore seen indices
            mask = mask * (1 - one_hot_pointers)
            # Get embedded inputs by max indices
            embedding_mask = one_hot_pointers.unsqueeze(2).expand(-1, -1, self.embedding_dim).byte()
            decoder_input = embedded_inputs[embedding_mask.data > 0].view(batch_size, self.embedding_dim)
            return decoder_input, mask

        outputs = []
        pointers = []
        decoder_input_list = []
        mask_list = []

        # 1st recurrent loop, the idea is to get the maximum probabilities and indices for each iteration 
        # Force the first iteration to take indice 0: 
            # Input: 
                # hidden states from the last step of the encoder (256, 512) x 2
                # initialized decoder input (256, 128)
            # Output: 
                # 1 decoder input for the next step (256, 128)
                # 1 updated hidden state pair (256, 512) x 2
                # 1 decoder output, softmax of the attention (256, 5)
                # 1 pointer - index0 (256)
        # At the second iteration (i.e. at sequence length index 1):
            # Input: 
                # Updated hidden states from the last step of the decoder (256, 512) x 2
                # pointed decoder input (256, 128)
            # Output: 
                # <beam_size> decoder inputs for the next step (256, 128) x <beam_size>
                # 1 updated hidden state pair (256, 512) x 2
                # 1 decoder output, softmax of the attention (256, 5)
                # <beam_size> pointers (256) x <beam_size>
        for _ in range(2):
            h_t, c_t, outs = step(decoder_input, hidden, mask_step = mask)
            hidden = (h_t, c_t)
            masked_outs = outs * mask # Masking selected inputs
            
            if _ == 0:
                max_probs, indices = masked_outs.max(1)
                # decoder_input, mask = masking(index = index0, mask = mask, runner = runner)
                # Save the decoder output and pointer at the 1st step
                outputs.append(outs.unsqueeze(0)) # -> (1, 256, 5)
                pointers.append(index0.unsqueeze(1)) # -> (256, 1) ~ (beam, batch, seq_len)
            
            elif _ == 1:
                max_probs, indices = torch.topk(masked_outs, k = beam_size, dim = 1, largest = True, sorted = False)
                # Save the decoder output at the 2nd step
                outputs.append(outs.unsqueeze(0)) # -> (1, 256, 5) x 2

                for ind in indices.permute(1, 0): # (256) in (<beam_size>, 256)
                    # Save the 2 pointers at the 2nd step
                    pointers.append(ind.unsqueeze(1)) # -> (256, 1) x (1 + <beam_size>)
                    decoder_input, mask_1_ = masking(index = ind, mask = mask, runner = runner)
                    decoder_input_list.append(decoder_input) # -> (256, 128) x <beam_size>
                    mask_list.append(mask_1_) # -> (256, 5) x <beam_size>
        
        # Concatenate everything from the 1st recurrent loop to torch tensor type, pay attention to the dimensions of the outputs and pointers
        outputs = torch.cat(outputs, 0) # -> (2, 256, 5) dtype torch tensor
        master_outputs = []
        master_pointers = []

        # 2nd recurrent loop, run through the remaining steps in the decoder
        for de_inp, mask_, i in zip(decoder_input_list, mask_list, range(1, beam_size + 1)):
            
            # Assign hidden and input right here to keep them constant while switching beam
            hidden_2_ = hidden
            decoder_input_2_ = de_inp
            mask_2_ = mask_
            outputs_2_ = []
            pointers_2_ = []
            for _ in range(input_length - 2): # -2 refers to the initial loop, +2 refers to the length having one more token at the beginning, i.e. the <bos>
                h_t, c_t, outs = step(decoder_input_2_, hidden_2_, mask_2_)
                hidden_2_ = (h_t, c_t)
                masked_outs = outs * mask_2_
                max_probs, indices_2_ = masked_outs.max(1)
                decoder_input_2_, mask_2_ = masking(index = indices_2_, mask = mask_2_, runner = runner)
                # Save the current beam's decoder outputs and pointers to a dummy list
                outputs_2_.append(outs.unsqueeze(0)) # -> (1, 256, 5) x (seq_len - 2 + 1)
                pointers_2_.append(indices_2_.unsqueeze(1)) # -> (256, 1) x (seq_len - 2 + 1)

            # Concatenate the current beam's decoder outputs and pointers to the master list
            master_outputs.append(torch.cat(
                [
                    outputs, # (2, 256, 5)
                    torch.cat(outputs_2_, 0) # -> (seq_len - 2 + 2, 256, 5)
                ], 0
            ).unsqueeze(0)) # -> (1, seq_len + 2, 256, 5) x <beam_size> torch tensors in list

            master_pointers.append(torch.cat(
                [
                    pointers[0], pointers[i], # (256, 1) x 2
                    torch.cat(pointers_2_, 1) # -> (256, seq_len - 2 + 2)
                ], 1
            ).unsqueeze(0)) # -> (1, 256, 5 + 2) x <beam_size> torch tensors in list
        
        # -> (<beam_size>, 256, 5, 5 + 2) dtype torch tensor ~ (beam, batch, seq_len as cond. probability, seq_len + 1)
        master_outputs = torch.cat(master_outputs, 0).permute(0, 2, 3, 1)

        # -> (<beam_size>, 256, 5 + 2) dtype torch tensor ~ (beam , batch, seq_len + 2)
        master_pointers = torch.cat(master_pointers, 0)

        return (master_outputs, master_pointers)

In [23]:
decoder = Decoder(
    embedding_dim = 128,
    hidden_dim = 512
)

decoder_hidden0 = (en_h[-1].squeeze(0), en_c[-1].squeeze(0))

(de_o, pt) = decoder(
    embedded_inputs = x_embed,
    decoder_input = torch.rand(256, 128),
    hidden = decoder_hidden0,
    context = en_o.permute(1, 0, 2),
    beam_size = 1
)

de_o = de_o.squeeze(0)
pt = pt.squeeze(0)

print('de_o -', de_o.shape)
print('pt -', pt.shape)

de_o[0]

de_o - torch.Size([256, 5, 5])
pt - torch.Size([256, 5])


tensor([[0.2008, 0.2008, 0.2008, 0.2008, 0.2008],
        [0.2001, 0.2001, 0.2001, 0.2001, 0.2001],
        [0.1997, 0.1997, 0.1997, 0.1997, 0.1997],
        [0.1996, 0.1996, 0.1996, 0.1996, 0.1996],
        [0.1998, 0.1998, 0.1998, 0.1998, 0.1998]], grad_fn=<SelectBackward0>)

$$
X = \begin{bmatrix}
x_0 & x_1 & x_2 & x_3 & x_4 
\end{bmatrix}
$$

$$
\text{iter 0 : } P\left( x_2|\theta \right) = \frac{\exp{x_2}}{\sum_{i=0}^{5}\exp{x_i}}
$$
$$
\text{iter 1 : } P\left( x_4|x_2, \theta \right) = \frac{\exp{x_4}}{\sum_{i=0}^{{0, 1, 3, 4}}\exp{x_i}}
$$
$$
\text{iter 2 : } P\left( x_3|x_2,x_4, \theta \right) = \frac{\exp{x_3}}{\sum_{i=0}^{{0, 1, 3}}\exp{x_i}}
$$
$$
\text{iter 3 : } P\left( x_1|x_2,x_3,x_4, \theta \right) = \frac{\exp{x_3}}{\sum_{i=0}^{{0, 1}}\exp{x_i}}
$$
$$
\text{iter 4 : } P\left( x_0|x_1,x_2,x_3,x_4, \theta \right) = \frac{\exp{x_0}}{\sum_{i=0}^{{0}}\exp{x_i}}
$$

In [24]:
class PointerNet(nn.Module):
    """
    Pointer-Net
    """

    def __init__(
        self, embedding_dim,
        hidden_dim,
        lstm_layers,
        dropout,
        bidir = False
    ):
        """
        Initiate Pointer-Net

        :param int embedding_dim: Number of embbeding channels
        :param int hidden_dim: Encoders hidden units
        :param int lstm_layers: Number of layers for LSTMs
        :param float dropout: Float between 0-1
        :param bool bidir: Bidirectional
        """

        super(PointerNet, self).__init__()
        self.embedding_dim = embedding_dim
        self.bidir = bidir
        self.embedding = nn.Linear(2, embedding_dim)
        self.encoder = Encoder(
            embedding_dim,
            hidden_dim,
            lstm_layers,
            dropout,
            bidir
        )
        self.decoder = Decoder(embedding_dim, hidden_dim)
        self.decoder_input0 = Parameter(torch.FloatTensor(embedding_dim), requires_grad=False)

        # Initialize decoder_input0
        nn.init.uniform_(self.decoder_input0, -1, 1)

    def forward(self, inputs):
        """
        PointerNet - Forward-pass

        :param Tensor inputs: Input sequence
        :return: Pointers probabilities and indices
        """

        batch_size = inputs.size(0)
        input_length = inputs.size(1)

        decoder_input0 = self.decoder_input0.unsqueeze(0).expand(batch_size, -1)

        inputs = inputs.view(batch_size * input_length, -1)
        embedded_inputs = self.embedding(inputs).view(batch_size, input_length, -1)

        encoder_hidden0 = self.encoder.init_hidden(embedded_inputs)
        encoder_outputs, encoder_hidden = self.encoder(embedded_inputs, encoder_hidden0)
        
        if self.bidir:
            decoder_hidden0 = (encoder_hidden[0][-1].squeeze(0), encoder_hidden[1][-1].squeeze(0))
        else:
            decoder_hidden0 = (encoder_hidden[0][-1], encoder_hidden[1][-1])
        
        (outputs, pointers) = self.decoder(
            embedded_inputs,
            decoder_input0,
            decoder_hidden0,
            encoder_outputs.permute(1, 0, 2),
            beam_size = 1
        )

        return  outputs, pointers

**TRAIN**

In [25]:
dataset = torch.load(r"C://Users//PHAM DUY//python projects//d2l//data//100K_train_5.pt")
print('len dataset -', len(dataset))

dataloader = DataLoader(
    dataset,
    batch_size = params.batch_size,
    shuffle = True,
    num_workers = 0
)

len dataset - 100000


In [29]:
dataset[0]

{'Points': tensor([[0.3157, 0.1738],
         [0.4446, 0.9743],
         [0.2807, 0.3822],
         [0.2848, 0.8318],
         [0.0605, 0.2040]]),
 'Solution': tensor([0, 2, 1, 3, 4])}

In [26]:
nn.NLLLoss(reduction = 'mean')(
    torch.rand(256, 5, 5).type(torch.float32),
    torch.randint(0, 5, (256, 5)).type(torch.int64)
)

tensor(-0.4830)

**TEST**