# Model

I - Layers



In [1]:
import torch

In [2]:
# Default device handed to AttentionDynamicModel(device=...).
# ``None`` means "resolve lazily": AttentionDynamicModel.set_input_device
# replaces a ``None`` device with the device of the model's own parameters.
# The former first assignment used the tautology ``True == True`` and was
# overwritten on the very next line, so it was dead code and has been dropped.
# To pin a device explicitly, assign e.g.
# torch.device("cuda:0" if torch.cuda.is_available() else "cpu") here.
device = None

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

def scaled_attention(query, key, value, mask=None):
    """Scaled dot-product attention.

    ``query``, ``key`` and ``value`` may carry any number of leading
    batch/head dimensions; their last two dimensions hold one row vector per
    sequence position. ``torch.matmul`` broadcasts over the leading dimensions.

    Scores are divided by sqrt(depth of the query). Wherever ``mask == 1`` the
    score is replaced by -1e9 before the softmax over the key axis.
    """
    depth = query.shape[-1]
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(depth)

    if mask is not None:
        blocked = mask == 1
        scores = scores.masked_fill(blocked, -1e9)

    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (shared by encoder and decoder).

        All heads are computed inside a single tensor, so this implementation
        targets single-device execution.

        Args:
            n_heads: number of attention heads computed in parallel
            d_model: embedding size of both the input and the output features;
                     must be divisible by n_heads
            **kwargs: accepted and ignored

        Call arguments:
            query: shape (batch_size, seq_len_q, d_model)
            key: shape (batch_size, seq_len_k, d_model)
            value: shape (batch_size, seq_len_v, d_model), seq_len_v == seq_len_k
            mask: tensor of shape (batch_size, seq_len_q, seq_len_k) or None;
                  entries equal to 1 are excluded from attention

        Returns:
            attention outputs of shape (batch_size, seq_len_q, d_model)
    """
    def __init__(self, n_heads, d_model, **kwargs):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.head_depth = self.d_model // self.n_heads

        assert self.d_model % self.n_heads == 0

        # Bias-free input projections for query, key and value ...
        self.wq = nn.Linear(self.d_model, self.d_model, bias=False)
        self.wk = nn.Linear(self.d_model, self.d_model, bias=False)
        self.wv = nn.Linear(self.d_model, self.d_model, bias=False)

        # ... and the projection applied to the concatenated heads.
        self.w_out = nn.Linear(self.d_model, self.d_model, bias=False)

    def split_heads(self, tensor, batch_size):
        """Reshape (batch_size, seq_len, d_model) into per-head slices.

        Returns a tensor of shape (batch_size, n_heads, seq_len, head_depth).
        """
        per_position = tensor.view(batch_size, -1, self.n_heads, self.head_depth)
        return per_position.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # One d_model-wide projection per input is the same as n_heads separate
        # head_depth-wide projections; the heads are then carved out of it.
        Q = self.split_heads(self.wq(query), batch_size)  # (batch_size, n_heads, seq_len_q, head_depth)
        K = self.split_heads(self.wk(key), batch_size)    # (batch_size, n_heads, seq_len_k, head_depth)
        V = self.split_heads(self.wv(value), batch_size)  # (batch_size, n_heads, seq_len_v, head_depth)

        # Insert a head axis so the mask broadcasts over all heads:
        # (batch_size, seq_len_q, seq_len_k) -> (batch_size, 1, seq_len_q, seq_len_k)
        head_mask = None if mask is None else mask.unsqueeze(1)

        per_head = scaled_attention(Q, K, V, head_mask)  # (batch_size, n_heads, seq_len_q, head_depth)

        # Bring the head axis next to head_depth and merge them back into d_model.
        merged = per_head.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.d_model)  # (batch_size, seq_len_q, d_model)

        # Projecting the concatenation equals summing the per-head projections.
        return self.w_out(merged)

II - Graph-Encoder

In [4]:


class MultiHeadAttentionLayer(nn.Module):
    """One encoder layer: self-attention followed by a two-layer feed-forward
    network, each wrapped in a skip connection and a tanh.

        Args:
            input_dim: embedding size, used as d_model of the MHA sublayer.
            num_heads: number of attention heads of the MHA sublayer.
            feed_forward_hidden: width of the hidden feed-forward layer.
            **kwargs: forwarded to nn.Module.__init__.

        Call arguments:
            x: batch of shape (batch_size, n_nodes, input_dim).
            mask: mask handed to the MHA sublayer

        Returns:
               outputs of shape (batch_size, n_nodes, input_dim)

    """

    def __init__(self, input_dim, num_heads, feed_forward_hidden=512, **kwargs):
        super().__init__(**kwargs)
        self.mha = MultiHeadAttention(n_heads=num_heads, d_model=input_dim)

        self.ff1 = nn.Linear(input_dim, feed_forward_hidden)
        self.ff2 = nn.Linear(feed_forward_hidden, input_dim)

    def forward(self, x, mask=None):
        # Self-attention sublayer: skip connection, then tanh.
        attended = torch.tanh(x + self.mha(x, x, x, mask))

        # Feed-forward sublayer (Linear -> ReLU -> Linear): skip connection, then tanh.
        hidden = F.relu(self.ff1(attended))
        return torch.tanh(attended + self.ff2(hidden))

class GraphAttentionEncoder(nn.Module):
    """Graph encoder built from a stack of MultiHeadAttentionLayer sublayers.

        Args:
            input_dim: embedding size that will be used as d_model in MHA layers.
            num_heads: number of attention heads in MHA layers.
            num_layers: number of attention layers that will be used in encoder.
            feed_forward_hidden: number of neuron units in each FF layer.

        Call arguments:
            x: tuple of 6 tensors
               x[0]: depot coordinates, (batch_size, 2)
               x[1]: coordinates of the other nodes, (batch_size, n_nodes-1, 2)
               x[2]..x[5]: one scalar feature per non-depot node each,
                           (batch_size, n_nodes-1); in order: demand, time,
                           finish, service
            mask: mask for the MHA layers, or None
            cur_num_nodes: (batch_size, 1) divisor used for the graph embedding
                           when a mask is given, or None

        Returns:
               Embedding for all nodes + one embedding for the whole graph.
               Tuple ((batch_size, n_nodes, input_dim), (batch_size, input_dim))
    """

    def __init__(self, input_dim, num_heads, num_layers, feed_forward_hidden=512):
        super().__init__()

        self.input_dim = input_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.feed_forward_hidden = feed_forward_hidden

        # Separate initial embeddings for the depot (x, y) and for the other
        # nodes (x, y, demand, time, finish, service).
        self.init_embed_depot = nn.Linear(2, self.input_dim)
        self.init_embed = nn.Linear(6, self.input_dim)

        self.mha_layers = [MultiHeadAttentionLayer(self.input_dim, self.num_heads, self.feed_forward_hidden)
                            for _ in range(self.num_layers)]
        self.mha_layers = nn.ModuleList(self.mha_layers)

    def forward(self, x, mask=None, cur_num_nodes=None):
        # Depot: (batch_size, 2) -> (batch_size, 1, input_dim)
        depot_embed = self.init_embed_depot(x[0])[:, None, :]

        # Other nodes: append the four scalar features to the coordinates,
        # (batch_size, n_nodes-1, 2 + 4) -> (batch_size, n_nodes-1, input_dim)
        node_features = torch.cat([x[1]] + [x[i][:, :, None] for i in range(2, 6)], dim=-1)
        node_embed = self.init_embed(node_features)

        h = torch.cat((depot_embed, node_embed), dim=1)  # (batch_size, n_nodes, input_dim)

        # stack attention layers
        for layer in self.mha_layers:
            h = layer(h, mask)

        if mask is not None and cur_num_nodes is not None:
            # NOTE(review): the sum runs over all node embeddings, including
            # masked ones, while the divisor is the caller-supplied count.
            graph_embed = torch.sum(h, 1) / cur_num_nodes
        else:
            # Without a mask, or without a node count to divide by (which used
            # to fail on a division by None), use the plain mean over nodes.
            graph_embed = torch.mean(h, 1)

        return (h, graph_embed)  # ((batch_size, n_nodes, input_dim), (batch_size, input_dim))

III - Attention Dynamic Model

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
import math
import numpy as np

def get_dev_of_mod(model):
    """Return the device on which the first parameter of ``model`` lives."""
    first_param = next(iter(model.parameters()))
    return first_param.device

def set_decode_type(model, decode_type):
    """Forward ``decode_type`` to ``model.set_decode_type``."""
    apply_decode_type = model.set_decode_type
    apply_decode_type(decode_type)

class AttentionDynamicModel(nn.Module):
    """Attention encoder/decoder policy that builds routes step by step.

    The graph is encoded once by a GraphAttentionEncoder. Each decoding step
    combines the graph embedding with the embedding of the previously chosen
    node and the remaining vehicle capacity, runs one multi-head attention
    over the node embeddings, and scores every node with a tanh-clipped
    single-head attention. Infeasible nodes are masked using the problem state.

    Args:
        embedding_dim: size of the node embeddings; also the decoder width.
        n_encode_layers: number of encoder attention layers.
        n_heads: number of attention heads; must divide embedding_dim.
        tanh_clipping: factor applied to the tanh of the final compatibilities.
        device: device for inputs and state tensors; None resolves to the
            device of the model parameters on first use.
    """

    def __init__(self,
                 embedding_dim,
                 n_encode_layers=2,
                 n_heads=8,
                 tanh_clipping=10.,
                 device=device
                 ):

        super().__init__()

        # attributes for MHA
        self.embedding_dim = embedding_dim
        self.n_encode_layers = n_encode_layers
        self.decode_type = None

        # attributes for VRP problem
        self.problem = AgentVRP
        self.n_heads = n_heads

        # Encoder part
        self.embedder = GraphAttentionEncoder(input_dim=self.embedding_dim,
                                              num_heads=self.n_heads,
                                              num_layers=self.n_encode_layers
                                              )

        # Decoder part
        self.output_dim = self.embedding_dim
        self.num_heads = n_heads

        self.head_depth = self.output_dim // self.num_heads
        self.dk_mha_decoder = float(self.head_depth)  # key depth of the multi-head decoder attention
        self.dk_get_loc_p = float(self.output_dim)  # key depth of the single-head attention in get_log_p

        if self.output_dim % self.num_heads != 0:
            raise ValueError("number of heads must divide d_model=output_dim")

        self.tanh_clipping = tanh_clipping

        # The query projection is split in two:
        # Wq*[h_c, h_N, D] = Wq_context*h_c + Wq_step_context*[h_N, D]
        self.wq_context = nn.Linear(self.embedding_dim, self.output_dim)  # (d_q_context, output_dim)
        self.wq_step_context = nn.Linear(self.embedding_dim + 1, self.output_dim, bias=False)  # (d_q_step_context, output_dim)

        # Two key projections: one for the MHA step, one for the single-head
        # tanh attention that follows it.
        self.wk = nn.Linear(self.embedding_dim, self.output_dim, bias=False)  # (d_k, output_dim)
        self.wk_tanh = nn.Linear(self.embedding_dim, self.output_dim, bias=False)  # (d_k_tanh, output_dim)

        # Values are only needed by the MHA step; the single-head attention
        # only produces attention weights.
        self.wv = nn.Linear(self.embedding_dim, self.output_dim, bias=False)  # (d_v, output_dim)

        # Output projection of the MHA step; it also plays the role of the
        # query projection of the single-head tanh attention.
        self.w_out = nn.Linear(self.embedding_dim, self.output_dim, bias=False)  # (d_model, d_model)

        self.dev = device

    def set_decode_type(self, decode_type):
        """Choose how nodes are selected: "greedy" or "sampling"."""
        self.decode_type = decode_type

    def split_heads(self, tensor, batch_size):
        """Reshape (batch_size, seq_len, output_dim) into
        (batch_size, n_heads, seq_len, head_depth) so that all heads are
        attended to in one call.
        """
        splitted_tensor = tensor.view(batch_size, -1, self.n_heads, self.head_depth)
        return splitted_tensor.transpose(1, 2)

    def _select_node(self, logits):
        """Select the next node for every batch element.

        Args:
            logits: (batch_size, 1, n_nodes) scores or log-probabilities.

        Returns:
            Tensor of node indices with shape (batch_size,).

        Raises:
            AssertionError: if the decode type is neither "greedy" nor "sampling".
        """
        step_logits = logits[:, 0, :]  # (batch_size, n_nodes)

        if self.decode_type == "greedy":
            return torch.argmax(step_logits, dim=-1)  # (batch_size,)

        if self.decode_type == "sampling":
            # The sample already has shape (batch_size,). It must not be
            # squeezed: for batch_size == 1 that would collapse it to a 0-dim
            # tensor and break the state update.
            return Categorical(logits=step_logits).sample()

        # Raised explicitly so the check survives ``python -O``.
        raise AssertionError("Unknown decode type")

    def get_step_context(self, state, embeddings):
        """Takes a state and graph embeddings,
           Returns a part [h_N, D] of context vector [h_c, h_N, D],
           that is related to RL Agent last step.
        """
        # index of previous node
        prev_node = state.prev_a.to(self.dev)  # (batch_size, 1)

        # from embeddings=(batch_size, n_nodes, input_dim) select embeddings of previous nodes
        cur_embedded_node = embeddings.gather(1, prev_node.view(prev_node.shape[0], -1, 1)
                            .repeat_interleave(embeddings.shape[-1], -1)) # (batch_size, 1, input_dim)

        # add remaining capacity
        remaining_capacity = (self.problem.VEHICLE_CAPACITY - state.used_capacity[:, :, None]).to(self.dev)
        step_context = torch.cat([cur_embedded_node, remaining_capacity], dim=-1)

        return step_context  # (batch_size, 1, input_dim + 1)

    def decoder_mha(self, Q, K, V, mask=None):
        """ Computes Multi-Head Attention part of decoder
        Args:
            mask: a mask for visited nodes,
                has shape (batch_size, seq_len_q, seq_len_k), seq_len_q = 1 for context vector attention in decoder
            Q: query (context vector for decoder)
                    has shape (batch_size, n_heads, seq_len_q, head_depth) with seq_len_q = 1 for context_vector attention in decoder
            K, V: key, value (projections of nodes embeddings)
                have shape (batch_size, n_heads, seq_len_k, head_depth), (batch_size, n_heads, seq_len_v, head_depth),
                                                                with seq_len_k = seq_len_v = n_nodes for decoder
        Returns:
            tensor of shape (batch_size, seq_len_q, output_dim)
        """

        # Add dimension to mask so that it can be broadcasted across heads
        # (batch_size, seq_len_q, seq_len_k) --> (batch_size, 1, seq_len_q, seq_len_k)
        if mask is not None:
            mask = mask.unsqueeze(1)

        attention = scaled_attention(Q, K, V, mask) # (batch_size, n_heads, seq_len_q, head_depth)
        # transpose attention to (batch_size, seq_len_q, n_heads, head_depth)
        attention = attention.transpose(1, 2).contiguous()
        # concatenate results of all heads (batch_size, seq_len_q, self.output_dim);
        # the batch size is read from Q so this does not rely on forward() having run
        attention = attention.view(Q.shape[0], -1, self.output_dim)

        return self.w_out(attention)

    def get_log_p(self, Q, K, mask=None):
        """Single-Head attention sublayer in decoder,
        computes log-probabilities for node selection.

        Args:
            mask: mask for nodes
            Q: query (output of mha layer)
                    has shape (batch_size, seq_len_q, output_dim), seq_len_q = 1 for context attention in decoder
            K: key (projection of node embeddings)
                    has shape  (batch_size, seq_len_k, output_dim), seq_len_k = n_nodes for decoder
        """

        compatibility = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.shape[-1])
        compatibility = torch.tanh(compatibility) * self.tanh_clipping
        if mask is not None:
            compatibility = compatibility.masked_fill(mask == 1, -1e9)

        log_p = F.log_softmax(compatibility, dim=-1)  # (batch_size, seq_len_q, seq_len_k)

        return log_p

    def get_likelihood_selection(self, _log_p, a):
        """Pick, for every batch element, the log-probability of its selected action.

        Args:
            _log_p: (batch_size, n_nodes) log-probabilities.
            a: selected node indices, one per batch element.

        Returns:
            Flat tensor with one log-probability per batch element.
        """
        indices = a.view(a.shape[0], -1)
        select = _log_p.gather(-1, indices)
        return select.view(-1)

    def get_projections(self, embeddings, context_vectors):
        """Compute the projections shared by all decoding steps.

        Returns:
            (K_tanh, Q_context, K, V), where K and V are already split into heads.
        """
        batch_size = embeddings.shape[0]

        K = self.wk(embeddings)  # (batch_size, n_nodes, output_dim)
        K_tanh = self.wk_tanh(embeddings)  # (batch_size, n_nodes, output_dim)
        V = self.wv(embeddings)  # (batch_size, n_nodes, output_dim)
        Q_context = self.wq_context(context_vectors[:, None, :])  # (batch_size, 1, output_dim)

        # K_tanh is not split since it feeds the single-head attention;
        # Q is split inside the decoding loop.
        K = self.split_heads(K, batch_size)  # (batch_size, num_heads, n_nodes, head_depth)
        V = self.split_heads(V, batch_size)  # (batch_size, num_heads, n_nodes, head_depth)

        return K_tanh, Q_context, K, V

    def fwd_rein_loss(self, inputs, baseline, bl_vals, num_batch, return_pi=False):
        """
        Forward and calculate loss for REINFORCE algorithm in a memory efficient way.

        A first pass in eval mode under ``torch.no_grad()`` produces the action
        sequence and the advantage (cost minus baseline). A second pass replays
        that sequence and back-propagates step by step inside ``forward``, so
        the graph of the whole sequence never has to be kept for one backward
        call. This trades some speed for a much smaller memory footprint.

        Returns:
            (detached_loss, outputs of the second forward pass)
        """

        on_training = self.training
        self.eval()
        with torch.no_grad():
            cost, log_likelihood, seq = self(inputs, True)
            bl_val = bl_vals[num_batch] if bl_vals is not None else baseline.eval(inputs, cost)
            pre_cost = cost - bl_val.detach()
            detached_loss = torch.mean((pre_cost) * log_likelihood)

        if on_training:
            self.train()
        return detached_loss, self(inputs, return_pi, seq, pre_cost)

    def forward(self, inputs, return_pi=False, pre_selects=None, pre_cost=None):
        """
        Decode a full solution for every instance in the batch.

        With ``pre_selects`` left as None the nodes are chosen according to the
        configured decode type. When ``pre_selects`` (batch_size, n_steps) is
        given, as done by ``fwd_rein_loss``, those actions are replayed instead
        and after every step ``(step log-likelihood * pre_cost).sum() / batch_size``
        is back-propagated immediately; the returned log-likelihood is then
        detached. ``pre_cost`` is only used in that replay mode.

        Returns:
            list [cost, ll] or, if ``return_pi`` is set, [cost, ll, pi] with pi
            of shape (batch_size, n_steps).
        """

        self.batch_size = inputs[0].shape[0]

        state = self.problem(inputs)  # the state is built from the inputs as they were passed in
        inputs = self.set_input_device(inputs)  # move inputs to the model device

        sequences = []
        ll = torch.zeros(self.batch_size)

        if pre_selects is not None:
            pre_selects = pre_selects.transpose(0, 1)  # (n_steps, batch_size)

        # Encode the graph once; the projections are reused by every step.
        pre_select_idx = 0
        state.i = torch.zeros(1, dtype=torch.int64)
        att_mask, cur_num_nodes = state.get_att_mask()
        att_mask, cur_num_nodes = att_mask.to(self.dev), cur_num_nodes.to(self.dev)
        embeddings, context_vectors = self.embedder(inputs, att_mask, cur_num_nodes)
        K_tanh, Q_context, K, V = self.get_projections(embeddings, context_vectors)

        # Perform decoding steps
        while not state.all_finished():
            step_context = self.get_step_context(state, embeddings)  # (batch_size, 1, input_dim + 1)
            Q_step_context = self.wq_step_context(step_context)  # (batch_size, 1, output_dim)
            Q = Q_context + Q_step_context

            # split heads for Q
            Q = self.split_heads(Q, self.batch_size)  # (batch_size, num_heads, 1, head_depth)

            # get current mask
            mask = state.get_mask().to(self.dev)  # (batch_size, 1, n_nodes) True -> mask, i.e. agent can NOT go

            # compute MHA decoder vectors for current mask
            mha = self.decoder_mha(Q, K, V, mask)  # (batch_size, 1, output_dim)

            # compute probabilities
            log_p = self.get_log_p(mha, K_tanh, mask)  # (batch_size, 1, n_nodes)

            # next step is to select node
            if pre_selects is None:
                selected = self._select_node(log_p.detach())  # (batch_size,)
            else:
                selected = pre_selects[pre_select_idx]

            state.step(selected.detach().cpu())

            curr_ll = self.get_likelihood_selection(log_p[:, 0, :].cpu(), selected.detach().cpu())
            if pre_selects is not None:
                # Back-propagate this step right away; retain_graph keeps the
                # shared encoder graph alive for the following steps.
                curr_loss = (curr_ll * pre_cost).sum() / self.batch_size
                curr_loss.backward(retain_graph=True)
                curr_ll = curr_ll.detach()
            ll += curr_ll

            sequences.append(selected.detach().cpu())
            pre_select_idx += 1

        pi = torch.stack(sequences, dim=1)  # (batch_size, len(outputs))
        cost = self.problem.get_costs((inputs[0].detach().cpu(), inputs[1].detach().cpu(), inputs[2].detach().cpu()), pi)

        ret = [cost, ll]
        if return_pi:
            ret.append(pi)
        return ret

    def set_input_device(self, inp_tens):
        """Move the six input tensors to the model device.

        A device of None is first resolved to the device of the model
        parameters. The tensors are moved to ``self.dev`` (not to the
        module-level ``device``, which may be None or differ from the device
        this model was configured with).
        """
        if self.dev is None:
            self.dev = get_dev_of_mod(self)
        return tuple(inp_tens[i].to(self.dev) for i in range(6))

Problem

In [6]:
import torch

class AgentVRP():
    """Batched state of a capacitated vehicle routing problem with time windows.

    Node index 0 is the depot; customer ``k`` of the input tensors has node
    index ``k + 1``. All state tensors are created on the default device.

    Args:
        input: tuple of 6 tensors
            input[0]: depot coordinates, (batch_size, 2)
            input[1]: customer coordinates, (batch_size, n_loc, 2)
            input[2]: customer demands, (batch_size, n_loc)
            input[3]: start of each customer's time window, (batch_size, n_loc)
            input[4]: end of each customer's time window, (batch_size, n_loc)
            input[5]: service duration of each customer, (batch_size, n_loc)
    """
    VEHICLE_CAPACITY = 1.0

    def __init__(self, input):
        depot = input[0] # (batch_size, 2)
        loc = input[1] # (batch_size, n_loc, 2)
        self.demand = input[2] # (batch_size, n_loc)
        self.time = input[3]
        self.finish = input[4]
        self.service = input[5]

        self.batch_size, self.n_loc, _ = loc.shape

        # Coordinates of depot + other nodes -> (batch_size, 1+n_loc, 2)
        self.coords = torch.cat((depot[:, None, :], loc), dim=-2)

        # Indices of graphs in batch, as a column (batch_size, 1) and flat (batch_size,)
        self.ids = torch.arange(self.batch_size, dtype=torch.int64)[:, None]
        self.ids_1 = torch.arange(self.batch_size, dtype=torch.int64)

        # State: every vehicle starts at the depot with nothing loaded at time 0
        self.prev_a = torch.zeros(self.batch_size, 1, dtype=torch.int64)
        self.from_depot = (self.prev_a == 0)
        self.used_capacity = torch.zeros(self.batch_size, 1)
        self.used_time = torch.zeros(self.batch_size, 1)

        # Nodes that have been visited will be marked with 1
        self.visited = torch.zeros(self.batch_size, 1, self.n_loc+1)

        # Step counter
        self.i = torch.zeros(1, dtype=torch.int64)

    @staticmethod
    def outer_pr(a, b):
        """ Outer product of a and b row vectors.
            result[k] = matmul( a[k].t(), b[k] )
        """
        return torch.einsum('ki,kj->kij', a, b)

    def get_att_mask(self):
        """ Mask (batch_size, n_nodes, n_nodes) for attention encoder.
            We mask already visited nodes except for depot (can be visited multiple times).

            True -> should mask (can NOT visit)
            False -> shouldn't mask (can visit)

            Returns:
                (mask, cur_num_nodes) where cur_num_nodes has shape
                (batch_size, 1) and counts the nodes that are not masked.
        """
        # Remove depot from mask (1st column)
        att_mask = torch.squeeze(self.visited, dim=-2)[:, 1:] # (batch_size, 1, n_nodes) -> (batch_size, n_nodes-1)

        # Number of nodes in new instance after masking
        cur_num_nodes = self.n_loc + 1 - att_mask.sum(dim=1, keepdim=True) # (batch_size, 1)

        att_mask = torch.cat((torch.zeros(att_mask.shape[0], 1), att_mask), dim=-1) # add depot -> (batch_size, n_nodes)

        ones_mask = torch.ones_like(att_mask)

        # Create square attention mask.
        # In a (n_nodes, n_nodes) matrix this masks all rows and columns of visited nodes
        att_mask = AgentVRP.outer_pr(att_mask, ones_mask) \
                    + AgentVRP.outer_pr(ones_mask, att_mask) \
                    - AgentVRP.outer_pr(att_mask, att_mask) # (batch_size, n_nodes, n_nodes)
        return att_mask == 1, cur_num_nodes

    def all_finished(self):
        """ Checks if all routes are finished, i.e. every node including
            the depot is marked as visited.
        """
        return torch.all(self.visited == 1).item()

    def partial_finished(self):
        """Checks if partial solution for all graphs has been built; i.e. all agents came back to depot
        """
        return (torch.all(self.from_depot == 1) and self.i != 0).item()

    def dist(self):
        """Pairwise Euclidean distances between all nodes, (batch_size, n_nodes, n_nodes)."""
        return (self.coords[:, :, None, :] - self.coords[:, None, :, :]).norm(p=2, dim=-1)

    def get_mask(self):
        """ Returns a mask (batch_size, 1, n_nodes) with available actions.
            Impossible nodes are masked.

            True -> should mask (can NOT visit)
            False -> shouldn't mask (can visit)
        """

        # Exclude depot
        visited_loc = self.visited[:, :, 1:]

        # Mark nodes which exceed vehicle capacity
        exceeds_cap = self.demand + self.used_capacity > self.VEHICLE_CAPACITY

        # Service time of the node we are leaving; nothing is served at the depot.
        at_customer = self.prev_a != 0
        prev_service = self.service[self.ids, torch.clamp(self.prev_a - 1, 0, self.n_loc - 1)] * at_customer

        # Travel from the current node to every customer, (batch_size, 1, n_loc).
        # Only this one row of the distance matrix is needed, so the full
        # pairwise matrix is not built on every decoding step.
        cur_coord = self.coords[self.ids, self.prev_a]  # (batch_size, 1, 2)
        travel = (cur_coord[:, :, None, :] - self.coords[:, None, 1:, :]).norm(p=2, dim=-1)

        # NOTE(review): self.used_time is not part of this check, so only the
        # direct trip from the current node is compared with the window end.
        # Confirm whether the elapsed route time should be added here.
        exceeds_time = prev_service.view(self.batch_size, 1, 1).float() + travel > self.finish[:, None, :]

        mask_loc = (visited_loc == 1) | (exceeds_cap[:, None, :]) | exceeds_time

        # We can choose depot if we are not in depot OR all nodes are visited
        # equivalent to: we mask the depot if we are in it AND there're still more nodes to visit
        mask_depot = self.from_depot[:, None, :] & (~mask_loc).any(dim=-1, keepdim=True)

        return torch.cat([mask_depot, mask_loc], dim=-1)

    def step(self, action):
        """Apply one action per batch element and update the state in place.

        Args:
            action: (batch_size,) int64 tensor of node indices (0 = depot).
        """
        selected = action[:, None]

        # Time to reach the selected node: service at the node being left
        # (none when leaving the depot, consistent with get_mask) plus travel.
        cur_coord = self.coords[self.ids, selected]
        prev_coord = self.coords[self.ids, self.prev_a]
        prev_service = self.service[self.ids, torch.clamp(self.prev_a - 1, 0, self.n_loc - 1)] * (self.prev_a != 0)
        selected_time = prev_service + (cur_coord - prev_coord).norm(p=2, dim=-1)

        used_time = self.used_time + selected_time
        time = self.time[self.ids, torch.clamp(selected - 1, 0, self.n_loc - 1)]

        # Arriving before the window opens means waiting until its start;
        # returning to the depot resets the clock to 0.
        used_time = torch.where(used_time < time, time, used_time)
        self.used_time = (used_time * (selected != 0)).float()

        self.prev_a = selected
        self.from_depot = self.prev_a == 0

        # Shift indices by 1 since self.demand doesn't consider depot
        selected_demand = self.demand.gather(-1, (self.prev_a - 1).clamp_min(0).view(-1, 1)) # (batch_size, 1)

        # Add current node demand to used capacity and reset it to 0 when we are back at the depot
        self.used_capacity = (self.used_capacity + selected_demand) * (~self.from_depot)

        # Update visited nodes (set 1 to visited nodes)
        self.visited[self.ids_1, [0], action] = 1

        self.i += 1

    @staticmethod
    def get_costs(dataset, pi):
        """Total route length for every batch element.

        Args:
            dataset: tuple whose first two items are the depot coordinates
                (batch_size, 2) and the customer coordinates (batch_size, n_loc, 2).
            pi: (batch_size, n_steps) node indices in visiting order (0 = depot).

        Returns:
            (batch_size,) tensor of tour lengths, including the legs from the
            depot to the first node and from the last node back to the depot.
        """

        # Place nodes with coordinates in order of decoder tour
        loc_with_depot = torch.cat((dataset[0][:, None, :], dataset[1]), dim=1) # (batch_size, n_nodes, 2)
        d = loc_with_depot.gather(1, pi.view(pi.shape[0], -1, 1).repeat_interleave(2, -1))

        # Calculation of total distance
        # Note: first element of pi is not depot, but the first selected node in path
        # and last element from longest path is not depot

        return ((torch.norm(d[:, 1:] - d[:, :-1], dim=-1)).sum(dim=-1) # intra node distances
            + (torch.norm(d[:, 0] - dataset[0], dim=-1))  # distance from depot to first
            + (torch.norm(d[:, -1] - dataset[0], dim=-1))) # distance from last node of longest path to depot

Utils

In [7]:
import pickle
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from datetime import datetime
import time


def set_random_seed(seed):
    """Seed the random number generators used in this file.

    Seeds torch and NumPy's global generator. The latter matters because
    generate_data_onfly draws from ``np.random`` when no generator is passed,
    so seeding torch alone does not make the generated data reproducible.
    """
    torch.manual_seed(seed)
    # np.random.seed only accepts values in [0, 2**32 - 1]; fold other seeds
    # that torch accepts into that range.
    np.random.seed(seed % (2 ** 32))

class Data_100():
    """Settings consumed by generate_data_onfly.

    The defaults reproduce the original hard-coded configuration; every value
    can be overridden with a keyword argument.

    Args:
        size: number of instances to generate.
        graph_size: number of non-depot nodes per instance.
        time_factor: factor applied to the unit-square coordinates.
        service_duration: service duration assigned to every node.
        tw_expansion: factor applied to the sampled time-window width.
        service_window: length of the global service window; coordinates and
            times are divided by it.
        loc: mean of the normal distribution the demand is drawn from.
        scale: standard deviation of that distribution.
        max_demand: upper clip value of the raw demand, stored as ``self.max``.
        q: stored as ``self.q``; not read by generate_data_onfly.
    """
    def __init__(self, size=200, graph_size=100, time_factor=100,
                 service_duration=10, tw_expansion=3, service_window=1000,
                 loc=15, scale=10, max_demand=42, q=1):
        self.size = size
        self.graph_size = graph_size
        self.time_factor = time_factor
        self.service_duration = service_duration
        self.tw_expansion = tw_expansion
        self.service_window = service_window
        self.loc = loc
        self.scale = scale
        self.max = max_demand
        self.q = q

def generate_data_onfly(cfg, rnds=None,
                         ):
    """Generate an in-memory batch of instances with time windows.

    Args:
        cfg: object providing size, graph_size, time_factor, service_duration,
            tw_expansion, service_window, loc, scale and max.
        rnds: random source offering uniform, standard_normal, randint and
            normal; ``np.random`` is used when None.

    Returns:
        Tuple (depot, nodes, demand, tw_start, tw_end, service) of tensors.
        Coordinates and times are divided by cfg.service_window, the demand
        by 200.
    """
    rng = rnds if rnds is not None else np.random

    # Depot and node locations in the unit square.
    depot_xy = rng.uniform(size=(cfg.size, 2))
    node_xy = rng.uniform(size=(cfg.size, cfg.graph_size, 2))

    # Scaled depot-to-node distance, shared by both window bounds.
    depot_to_node = np.linalg.norm(depot_xy[:, None, :]*cfg.time_factor - node_xy*cfg.time_factor, axis=-1)

    # A window may not open before the node can be reached from the depot ...
    earliest = np.ceil(depot_to_node) + 1
    # ... and must close early enough to serve the node and return to the
    # depot before the global service window ends.
    latest = cfg.service_window - (np.ceil(depot_to_node + cfg.service_duration) + 1)

    width_noise = np.maximum(np.abs(rng.standard_normal([cfg.size, cfg.graph_size])), 1 / cfg.time_factor)

    # Window starts, drawn instance by instance.
    starts = [rng.randint(lo, hi) for lo, hi in zip(earliest, latest)]

    # Pair every start with its end: start plus a random width, capped at the
    # latest feasible value and truncated to an integer.
    windows = []
    for start, noise, hi in zip(starts, width_noise, latest):
        end = np.minimum(start + cfg.tw_expansion * cfg.time_factor * noise, hi).astype(int)
        windows.append(np.stack((start, end), axis=-1).tolist())

    raw_demand = np.abs(rng.normal(loc=cfg.loc, scale=cfg.scale, size=[cfg.size, cfg.graph_size])).astype(int)
    raw_demand = np.minimum(np.maximum(raw_demand, 1), cfg.max)

    to_unit = cfg.time_factor / cfg.service_window
    depo = torch.as_tensor(depot_xy, dtype=torch.float32) * cfg.time_factor / cfg.service_window
    graphs = torch.as_tensor(node_xy, dtype=torch.float32) * cfg.time_factor / cfg.service_window
    del to_unit
    demand = torch.as_tensor(raw_demand, dtype=torch.float32) / 200.

    tw = torch.FloatTensor(windows) / cfg.service_window
    tw_start = tw[:, :, 0]
    tw_end = tw[:, :, 1]
    service = torch.tensor(np.full([cfg.size, cfg.graph_size], cfg.service_duration).tolist()) / cfg.service_window

    return (depo, graphs, demand, tw_start, tw_end, service)
              



# def generate_data_onfly(num_samples=100, graph_size=20):
#     """Generate temp dataset in memory
#     """

#     depo = torch.FloatTensor(num_samples, 2).uniform_(0, 100) / 1200.
#     graphs = torch.FloatTensor(num_samples, graph_size, 2).uniform_(0, 100) / 1200.
#     demand = (torch.FloatTensor(num_samples, graph_size).uniform_(0, 50).int() + 1).float() / 200
#     time = (torch.FloatTensor(num_samples, graph_size).uniform_(150, 1000).int() + 1).float() / 1200.
#     finish = time + (torch.FloatTensor(num_samples, graph_size).uniform_(0, 200).int()).float() / 1200.
#     service = (torch.FloatTensor(num_samples, graph_size).uniform_(0, 100).int() + 1).float() / 1200.


    

#     return (depo, graphs, demand, time, finish, service)



class FastTensorDataLoader:
    """
    Lightweight batch iterator over a group of equally long tensors.

    Every batch is produced by slicing each stored tensor along dim 0, which
    avoids the per-index gathering and concatenation performed by
    TensorDataset + DataLoader.
    Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
    """

    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """
        Create the loader.

        :param *tensors: tensors to iterate over. All must have the same size at dim 0.
        :param batch_size: number of rows per batch (the last batch may be shorter).
        :param shuffle: if True, each call to ``iter()`` replaces the stored
            tensors with copies re-ordered by one shared random permutation.
        :returns: A FastTensorDataLoader.
        """
        lengths = [tensor.shape[0] for tensor in tensors]
        assert all(length == lengths[0] for length in lengths)

        self.tensors = tensors
        self.dataset_len = self.tensors[0].shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Number of batches, counting a trailing partial batch as one more.
        whole, leftover = divmod(self.dataset_len, self.batch_size)
        self.n_batches = whole + 1 if leftover > 0 else whole

    def __iter__(self):
        """Reset the read cursor (re-shuffling first when requested) and return self."""
        if self.shuffle:
            order = torch.randperm(self.dataset_len)
            # The same permutation is applied to every tensor so rows stay aligned.
            self.tensors = [tensor[order] for tensor in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        """Return the next batch as a tuple holding one slice per stored tensor."""
        start = self.i
        if start >= self.dataset_len:
            raise StopIteration
        stop = start + self.batch_size
        batch = tuple(tensor[start:stop] for tensor in self.tensors)
        self.i = stop
        return batch

    def __len__(self):
        """Return the number of batches per pass over the data."""
        return self.n_batches


def get_cur_time():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string.

    ``datetime.now()`` yields the same local timestamp as
    ``datetime.fromtimestamp(time.time())`` but does not need the module-level
    name ``time``, which script cells further down this file rebind to a tensor.
    """
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')




def get_dev_of_mod(model):
    """Return the device of the first parameter yielded by ``model.parameters()``."""
    params = model.parameters()
    first_param = next(params)
    return first_param.device




In [8]:
# Build a Data_100 configuration and sample one in-memory dataset from it.
cfg = Data_100()
data = generate_data_onfly(cfg)

In [9]:
# Display the third element of the generated tuple (the `demand` tensor returned by generate_data_onfly).
data[2]

tensor([[0.0100, 0.0950, 0.1550,  ..., 0.0550, 0.1000, 0.0850],
        [0.0450, 0.1750, 0.0050,  ..., 0.0850, 0.1050, 0.1450],
        [0.0500, 0.0850, 0.1500,  ..., 0.0400, 0.0300, 0.1000],
        ...,
        [0.0800, 0.0300, 0.0700,  ..., 0.0550, 0.0850, 0.1000],
        [0.0400, 0.0800, 0.1150,  ..., 0.1550, 0.1800, 0.0800],
        [0.1150, 0.0100, 0.0350,  ..., 0.0750, 0.1100, 0.1350]])

Reinforce_Baseline

In [10]:
import torch
from scipy.stats import ttest_rel
from tqdm import tqdm
import numpy as np



def copy_of_pt_model(model, embedding_dim=128, graph_size=20):
    """Return a fresh AttentionDynamicModel carrying the weights of ``model``.

    The copy is placed on the same device as ``model``, switched to the
    "sampling" decode type and put into eval mode before it is returned.

    Args:
        model: source model whose ``state_dict`` is copied.
        embedding_dim: embedding size passed to the new model's constructor.
        graph_size: accepted for call compatibility; not used in this function.
    """
    clone = AttentionDynamicModel(embedding_dim)
    clone = clone.to(get_dev_of_mod(model))
    set_decode_type(clone, "sampling")

    # Transfer the source weights into the copy.
    weights = model.state_dict()
    clone.load_state_dict(weights)

    clone.eval()
    return clone

def get_costs_rollout(model, train_batches, disable_tqdm):
    """Run ``model`` on every batch and return the per-batch costs as a list.

    Args:
        model: callable returning a ``(cost, second_value)`` pair for a batch.
        train_batches: iterable of batches.
        disable_tqdm: forwarded to tqdm's ``disable`` flag.
    """
    progress = tqdm(train_batches, disable=disable_tqdm, desc="Rollout greedy execution")
    collected = []
    for inputs in progress:
        batch_cost, _unused = model(inputs)  # only the cost is kept
        collected.append(batch_cost)
    return collected

def rollout(model, dataset, batch_size = 32, disable_tqdm = False):
    """Evaluate ``model`` greedily on ``dataset`` and return all costs concatenated.

    The model is switched to the "greedy" decode type (it is NOT switched back
    here) and temporarily put into eval mode; train mode is restored afterwards
    if it was active on entry.

    Args:
        model: model to evaluate.
        dataset: indexable whose items 0..5 are tensors with equal first dimension.
        batch_size: number of samples per evaluation batch.
        disable_tqdm: if True, the progress bar is suppressed.
    """
    set_decode_type(model, "greedy")

    fields = tuple(dataset[k] for k in range(6))
    loader = FastTensorDataLoader(*fields, batch_size=batch_size, shuffle=False)

    restore_train_mode = model.training
    model.eval()

    with torch.no_grad():
        per_batch_costs = get_costs_rollout(model, loader, disable_tqdm)

    # Put the model back into train mode only if it was training on entry.
    if restore_train_mode:
        model.train()

    return torch.cat(per_batch_costs, dim=0)


def validate(dataset, model, batch_size=1000):
    """Validate ``model`` on ``dataset`` in greedy mode and print the mean cost.

    Args:
        dataset: dataset tuple accepted by ``rollout``.
        model: model to evaluate.
        batch_size: number of samples per evaluation batch.

    Returns:
        The mean cost over the dataset (the result of ``torch.mean``).
    """
    # rollout puts the model in eval mode and restores its original mode when it finishes
    val_costs = rollout(model, dataset, batch_size=batch_size)
    # rollout leaves the decoder in "greedy" mode; switch back to sampling
    set_decode_type(model, "sampling")
    mean_cost = torch.mean(val_costs)
    # Round a plain Python float: np.round on a torch tensor goes through NumPy
    # array conversion, which is not available for CUDA tensors.
    print(f"Validation score: {round(mean_cost.item(), 4)}")
    return mean_cost


class RolloutBaseline:
    """Rollout baseline for REINFORCE with an exponential-moving-average warm-up.

    A frozen copy of the training model is evaluated greedily on a dedicated
    baseline dataset. While ``alpha`` is below 1 the baseline value is a mix of
    that rollout and a running average of the batch costs; ``alpha`` is raised
    once per epoch in ``epoch_callback``.
    """

    def __init__(self, model, filename = None,
                 from_checkpoint=False,
                 path_to_checkpoint=None,
                 cfg = Data_100(),
                 wp_n_epochs=1,
                 epoch=0,
                 num_samples=10000,
                 warmup_exp_beta=0.8,
                 embedding_dim=128,
                 graph_size=100
                 ):
        """
        Args:
            model: current model
            filename: suffix for baseline checkpoint filename
            from_checkpoint: start from checkpoint flag
            path_to_checkpoint: path to baseline model weights
            cfg: data configuration passed to generate_data_onfly for the baseline dataset
            wp_n_epochs: number of warm-up epochs
            epoch: current epoch number
            num_samples: number of samples to be generated for baseline dataset
                (stored only; the dataset size used here comes from ``cfg``)
            warmup_exp_beta: warmup mixing parameter (exp. moving average parameter)
            embedding_dim: embedding size used when copying the model
            graph_size: graph size forwarded to copy_of_pt_model

        """
        self.cfg = cfg
        self.num_samples = num_samples
        self.cur_epoch = epoch
        self.wp_n_epochs = wp_n_epochs
        self.beta = warmup_exp_beta

        # controls the amount of warmup
        self.alpha = 0.0

        self.running_average_cost = None

        # Checkpoint params
        self.filename = filename
        self.from_checkpoint = from_checkpoint
        self.path_to_checkpoint = path_to_checkpoint

        # Problem params
        self.embedding_dim = embedding_dim
        self.graph_size = graph_size

        # create and evaluate initial baseline
        self._update_baseline(model, epoch)


    def _update_baseline(self, model, epoch):
        """Replace the baseline with a copy of ``model`` and re-evaluate it on a new dataset."""
        self.model = copy_of_pt_model(model,
                                          embedding_dim=self.embedding_dim,
                                          graph_size=self.graph_size)

        self.model.eval()


        # We generate a new dataset for baseline model on each baseline update to prevent possible overfitting
        self.dataset = generate_data_onfly(self.cfg)

        print(f"Evaluating baseline model on baseline dataset (epoch = {epoch})")
        self.bl_vals = rollout(self.model, self.dataset)
        self.mean = torch.mean(self.bl_vals)
        self.cur_epoch = epoch

    def ema_eval(self, cost):
        """This is running average of cost through previous batches (only for warm-up epochs)
        """

        if self.running_average_cost is None:
            self.running_average_cost = torch.mean(cost)
        else:
            self.running_average_cost = self.beta * self.running_average_cost + (1. - self.beta) * torch.mean(cost)

        return self.running_average_cost

    def eval(self, batch, cost):
        """Evaluates current baseline model on single training batch

        Returns the running average alone while ``alpha == 0``; otherwise
        ``alpha * v_b + (1 - alpha) * v_ema`` where ``v_b`` is the baseline
        model's cost on ``batch``.
        """

        if self.alpha == 0:
            return self.ema_eval(cost)

        if self.alpha < 1:
            v_ema = self.ema_eval(cost)
        else:
            v_ema = torch.tensor(0.0)

        with torch.no_grad():
            v_b, _ = self.model(batch)

        # Combination of baseline cost and exp. moving average cost
        return self.alpha * v_b.detach() + (1 - self.alpha) * v_ema.detach()

    def eval_all(self, dataset, batch):
        """Evaluates current baseline model on the whole dataset only for non warm-up epochs

        Returns None while ``alpha < 1``; otherwise the rollout costs of the
        baseline model on ``dataset`` using ``batch`` as the batch size.
        """

        if self.alpha < 1:
            return None

        val_costs = rollout(self.model, dataset, batch_size=batch)

        return val_costs

    def epoch_callback(self, model, epoch):
        """Compares current baseline model with the training model and updates baseline if it is improved
        """

        self.cur_epoch = epoch

        print(f"Evaluating candidate model on baseline dataset (callback epoch = {self.cur_epoch})")
        candidate_vals = rollout(model, self.dataset)  # costs for training model on baseline dataset
        candidate_mean = torch.mean(candidate_vals)

        diff = candidate_mean - self.mean

        print(f"Epoch {self.cur_epoch} candidate mean {candidate_mean}, baseline epoch {self.cur_epoch} mean {self.mean}, difference {diff}")

        if diff < 0:
            # statistic + p-value
            # scipy works on NumPy arrays, so move the costs to host memory
            # first; passing tensors directly breaks when they live on a GPU.
            t, p = ttest_rel(candidate_vals.detach().cpu().numpy(),
                             self.bl_vals.detach().cpu().numpy())

            # halve the two-sided p-value to get a one-sided test
            p_val = p / 2
            print(f"p-value: {p_val}")

            if p_val < 0.05:
                print('Update baseline')
                self._update_baseline(model, self.cur_epoch)

        # alpha controls the amount of warmup
        if self.alpha < 1.0:
            if self.wp_n_epochs > 0:
                # Clamp at 1.0: a larger value would scale the baseline in eval().
                self.alpha = min(1.0, (self.cur_epoch + 1) / float(self.wp_n_epochs))
            else:
                # No warm-up epochs requested: use the rollout baseline right away.
                self.alpha = 1.0
            print(f"alpha was updated to {self.alpha}")



Training

In [11]:
from tqdm import tqdm
import pandas as pd
import torch



class IterativeMean:
    """Running mean: accumulates values one at a time and reports their average."""

    def __init__(self):
        # Running total and number of values seen so far.
        self.sum, self.n = 0, 0

    def update_state(self, val):
        """Add ``val`` to the running total and count it."""
        self.sum += val
        self.n += 1

    def result(self):
        """Return the mean of all values added so far (total divided by count)."""
        total, count = self.sum, self.n
        return total / count

def train_model(optimizer,
                model_torch,
                baseline,
                batch = 128,
                start_epoch = 0,
                end_epoch = 5,
                from_checkpoint = False,
                grad_norm_clipping = 1.0,
                batch_verbose = 1000,
                mem_efficient=True,
                ):
    """Train ``model_torch`` with REINFORCE against ``baseline``.

    A new dataset is generated from ``Data_100()`` at the start of every epoch.
    After each epoch the baseline callback runs, the decode type is reset to
    "sampling" and the model weights are saved to 'twdynamic_cvrp_100.pt'.

    Args:
        optimizer: optimizer stepping the parameters of ``model_torch``.
        model_torch: model being trained.
        baseline: baseline object providing ``alpha``, ``eval``, ``eval_all``
            and ``epoch_callback``.
        batch: batch size.
        start_epoch: first epoch index (inclusive).
        end_epoch: last epoch index (exclusive).
        from_checkpoint: if True, warm-up is skipped by forcing ``baseline.alpha`` to 1.0.
        grad_norm_clipping: max norm passed to ``clip_grad_norm_``.
        batch_verbose: progress is printed every ``batch_verbose`` batches.
        mem_efficient: if True, the loss comes from ``model.fwd_rein_loss`` and
            no ``backward()`` is called here (presumably that method
            back-propagates internally - confirm against the model).
    """


    def rein_loss(model, inputs, baseline, num_batch):
        """Calculate loss for REINFORCE algorithm
        """

        # Evaluate model, get costs and log probabilities
        cost, log_likelihood = model(inputs)

        # Evaluate baseline
        # For first wp_n_epochs we take the combination of baseline and ema for previous batches
        # after that we take a slice of precomputed baseline values
        bl_val = bl_vals[num_batch] if bl_vals is not None else baseline.eval(inputs, cost)

        # Calculate loss
        reinforce_loss = torch.mean((cost - bl_val.detach()) * log_likelihood)

        return reinforce_loss, torch.mean(cost)

    def mem_efficient_rein_loss(model, inputs, baseline, num_batch):
        """Delegate the loss computation to the model's own ``fwd_rein_loss``."""
        rein_detached_loss, (cost, log_likelihood) = model.fwd_rein_loss(inputs, baseline, bl_vals, num_batch)
        return rein_detached_loss, torch.mean(cost)

    def grad(model, inputs, baseline, num_batch):
        """Calculate gradients
        """
        if mem_efficient:
            loss, cost = mem_efficient_rein_loss(model, inputs, baseline, num_batch)
        else:
            loss, cost = rein_loss(model, inputs, baseline, num_batch)
            loss.backward()
        # Parameters that took no part in the loss have grad None; skip them
        # instead of failing on `.view`.
        grads = [param.grad.view(-1) for param in model.parameters() if param.grad is not None]
        grads = torch.cat(grads) if grads else torch.empty(0)
        # we can return detached loss since it's backwarded already above
        return loss.detach(), cost, grads


    # Training loop
    cfg = Data_100()
    for epoch in range(start_epoch, end_epoch):

        # Create dataset on current epoch
        data = generate_data_onfly(cfg)

        epoch_loss_avg = IterativeMean()
        epoch_cost_avg = IterativeMean()

        # Skip warm-up stage when we continue training from checkpoint
        if from_checkpoint and baseline.alpha != 1.0:
            print('Skipping warm-up mode')
            baseline.alpha = 1.0

        # If epoch > wp_n_epochs then precompute baseline values for the whole dataset else None
        bl_vals = baseline.eval_all(data, batch)  # (samples, ) or None
        # NOTE(review): this reshape requires the dataset size to be a multiple of `batch`.
        bl_vals = torch.reshape(bl_vals, (-1, batch)) if bl_vals is not None else None # (n_batches, batch) or None



        train_batches = FastTensorDataLoader(data[0],data[1],data[2], data[3],data[4],data[5], batch_size=batch, shuffle=False)

        for num_batch, x_batch in enumerate(train_batches):
            optimizer.zero_grad()
            loss_value, cost_val, grads = grad(model_torch, x_batch, baseline, num_batch)

            torch.nn.utils.clip_grad_norm_(model_torch.parameters(), grad_norm_clipping)

            optimizer.step()

            # Track progress
            epoch_loss_avg.update_state(loss_value)
            epoch_cost_avg.update_state(cost_val)

            if num_batch%batch_verbose == 0:
                print("Epoch {} (batch = {}): Loss: {}: Cost: {}".format(epoch, num_batch, epoch_loss_avg.result(), epoch_cost_avg.result()))

        # Update baseline if the candidate model is good enough. In this case also create new baseline dataset
        baseline.epoch_callback(model_torch, epoch)
        set_decode_type(model_torch, "sampling")
        print('--------------')



        print(get_cur_time(), "Epoch {}: Loss: {}: Cost: {}".format(epoch, epoch_loss_avg.result(), epoch_cost_avg.result()))
        print('--------------')
        # Save the model that was actually trained (the argument), not whatever
        # the global name `model` happens to refer to.
        torch.save(model_torch.state_dict(), 'twdynamic_cvrp_100.pt')
        print('Saved!')


In [12]:
# Settings for the training run; the cells that consume most of them
# (baseline construction and the train_model call) are commented out below.
BATCH = 32  # batch size passed to train_model
START_EPOCH = 0
END_EPOCH = 200
FROM_CHECKPOINT = False
embedding_dim = 128  # embedding size given to AttentionDynamicModel
LEARNING_RATE = 0.001  # Adam learning rate
ROLLOUT_SAMPLES = 1000  # num_samples argument of RolloutBaseline
NUMBER_OF_WP_EPOCHS = 1  # warm-up epochs of the baseline
GRAD_NORM_CLIPPING = 1.0
BATCH_VERBOSE = 5  # print progress every this many batches
SEED = 1234  # NOTE(review): not referenced anywhere in the visible cells
GRAPH_SIZE = 100


In [13]:

# model = AttentionDynamicModel(embedding_dim).to(device)
# set_decode_type(model, "sampling")

In [14]:
# Smoke test: build a model, sample a dataset and run a single batch of two
# instances through the model in sampling mode.
cfg = Data_100()
data = generate_data_onfly(cfg)
model = AttentionDynamicModel(embedding_dim).to(device)
set_decode_type(model, "sampling")
train_batches = FastTensorDataLoader(data[0],data[1],data[2], data[3],data[4],data[5], batch_size=2, shuffle=False)
for batch in train_batches:
  # print(batch)
  # return_pi=True makes the model also return a third value (see the printed output)
  a = model(batch, return_pi = True)
  print(a)
  break  # only the first batch is inspected

[tensor([5.4446, 5.7982]), tensor([-379.3987, -371.0754], grad_fn=<AddBackward0>), tensor([[ 72,  51,   8,  85,  48,  42,  25,  47,  19,  13,  34,  66,   0,  29,
         100,  95,  78,  23,  96,   6,  91,  84,  89,  10,  76,   0,  35,   0,
          99,  11,  93,  80,  38,  32,  59,   3,  87,  64,  14,  45,  62,   0,
          37,   0,  16,  46,  71,  26,  53,  69,  56,  49,  90,  82,  61,  12,
           0,  94,  15,   0,  60,  17,  65,  79,  43,   2,  73,  31,  77,  67,
          27,  88,  40,  24,  28,  41,   0,   7,  33,   9,  50,  58,  18,  36,
          83,  54,  92,  86,  21,  70,   0,  55,  20,   4,   0,  74,   0,  22,
          68,   0,  30,  97,  57,   5,  63,  81,  44,  75,  98,   1,   0,  39,
           0,  52],
        [ 38,  43,  59,   0,  79,  69,  10,  97,  81,  70, 100,  14,  39,  49,
          41,  66,   2,   0,  30,  98,  71,  54,   7,  83,  63,  86,  26,  15,
          75,   8,  78,  73,  47,  19,   0,  57,  17,  80,  65,  45,  18,  27,
           3,  60,  37,  87,

In [15]:
# Adam optimizer over all parameters of `model`, using LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [16]:
# # Initialize baseline
# baseline = RolloutBaseline(model,
#                            wp_n_epochs = NUMBER_OF_WP_EPOCHS,
#                            epoch = 0,
#                            num_samples=ROLLOUT_SAMPLES,
#                            filename = None,
#                            from_checkpoint = FROM_CHECKPOINT,
#                            embedding_dim=embedding_dim,
#                            graph_size=GRAPH_SIZE
#                            )

In [17]:
# torch.cuda.empty_cache()

In [18]:
# train_model(optimizer,
#             model,
#             baseline,
#             batch = BATCH,
#             start_epoch = START_EPOCH,
#             end_epoch = END_EPOCH,
#             from_checkpoint = FROM_CHECKPOINT,
#             grad_norm_clipping = GRAD_NORM_CLIPPING,
#             batch_verbose = BATCH_VERBOSE,
#             )


In [19]:
# Re-create the configuration and sample a fresh in-memory dataset.
cfg = Data_100()
data = generate_data_onfly(cfg) 

In [20]:
from Load_data import load_data
# Instance files expected under the 'txt/' directory
# (the R*/C*/RC* names look like Solomon VRPTW benchmark instances - confirm).
file = ['R101.txt', 'R103.txt', 'R102.txt', 'R112.txt', 'R106.txt', 'R107.txt', 'R105.txt', 'R111.txt', 'C108.txt', 
 'RC208.txt', 'C109.txt', 'R110.txt', 'R104.txt', 'R201.txt', 'R202.txt', 'R203.txt', 'R207.txt', 'R206.txt',
'R210.txt', 'R204.txt', 'C208.txt', 'R205.txt', 'R211.txt', 'RC108.txt', 'RC105.txt', 'R208.txt', 'C205.txt',
'C204.txt', 'R209.txt', 'RC104.txt', 'RC106.txt', 'C206.txt', 'C207.txt', 'RC107.txt', 'RC103.txt', 'C203.txt', 
'C202.txt', 'RC102.txt', 'C201.txt', 'RC101.txt', 'R109.txt', 'C104.txt', 'RC204.txt', 'RC205.txt', 'C105.txt',
'R108.txt', 'C107.txt', 'RC207.txt', 'RC206.txt', 'C106.txt', 'C102.txt', 'RC202.txt', 'RC203.txt', 'C103.txt',
'C101.txt', 'RC201.txt']

# Copy the first listed instance into 'Data.txt'
# (presumably the fixed path load_data() reads - confirm in Load_data).
idx = 0
with open('Data.txt', 'w') as outfile:
    file_path = 'txt/' + file[idx]
    with open(file_path) as infile:
        for line in infile:
            outfile.write(line)

# The last returned value is ignored.
max_cap, xcoord, ycoord, demand, e_time, l_time, s_time, _ = load_data() 

In [21]:
#     depo = torch.FloatTensor(num_samples, 2).uniform_(0, 100) / 1200.
#     graphs = torch.FloatTensor(num_samples, graph_size, 2).uniform_(0, 100) / 1200.
#     demand = (torch.FloatTensor(num_samples, graph_size).uniform_(0, 50).int() + 1).float() / 200
#     time = (torch.FloatTensor(num_samples, graph_size).uniform_(150, 1000).int() + 1).float() / 1200.
#     finish = time + (torch.FloatTensor(num_samples, graph_size).uniform_(0, 200).int()).float() / 1200.
#     service = (torch.FloatTensor(num_samples, graph_size).uniform_(0, 100).int() + 1).float() / 1200.

In [23]:
# Instance files expected under the 'txt/' directory
# (the R*/C*/RC* names look like Solomon VRPTW benchmark instances - confirm).
file = ['R101.txt', 'R103.txt', 'R102.txt', 'R112.txt', 'R106.txt', 'R107.txt', 'R105.txt', 'R111.txt', 'C108.txt', 
 'RC208.txt', 'C109.txt', 'R110.txt', 'R104.txt', 'R201.txt', 'R202.txt', 'R203.txt', 'R207.txt', 'R206.txt',
'R210.txt', 'R204.txt', 'C208.txt', 'R205.txt', 'R211.txt', 'RC108.txt', 'RC105.txt', 'R208.txt', 'C205.txt',
'C204.txt', 'R209.txt', 'RC104.txt', 'RC106.txt', 'C206.txt', 'C207.txt', 'RC107.txt', 'RC103.txt', 'C203.txt', 
'C202.txt', 'RC102.txt', 'C201.txt', 'RC101.txt', 'R109.txt', 'C104.txt', 'RC204.txt', 'RC205.txt', 'C105.txt',
'R108.txt', 'C107.txt', 'RC207.txt', 'RC206.txt', 'C106.txt', 'C102.txt', 'RC202.txt', 'RC203.txt', 'C103.txt',
'C101.txt', 'RC201.txt'] 
# Load the weights stored in 'D-AM.pt' into `model`, mapping them onto the CPU.
model.load_state_dict(torch.load('D-AM.pt', map_location=torch.device('cpu')))
# Evaluate the loaded model on every listed instance and append the best
# sampled distances to 'AM.txt' and 'D-AM.txt'.
for idx in range(len(file)):
    # Copy the instance into 'Data.txt'
    # (presumably the fixed path load_data() reads - confirm in Load_data).
    with open('Data.txt', 'w') as outfile:
        file_path = 'txt/' + file[idx]
        with open(file_path) as infile:
            for line in infile:
                outfile.write(line)

    max_cap, xcoord, ycoord, demand, e_time, l_time, s_time, _ = load_data() 

    # Index 0 is used for the depot below; coordinates and times are divided by
    # its l_time entry, demands by the vehicle capacity.
    scale1 = l_time[0]
    scale2 = max_cap 
    num_samples = 10 
    # The same instance is replicated num_samples times to form the dataset.
    depo = torch.FloatTensor([[xcoord[0], ycoord[0]]]).repeat(num_samples, 1) / scale1
    graphs = torch.FloatTensor(np.stack((xcoord[1:], ycoord[1:]), axis=1)).repeat(num_samples, 1, 1) / scale1
    demand = torch.FloatTensor(demand[1:]).repeat(num_samples, 1) / scale2 
    # NOTE(review): this assignment rebinds the module-level name `time` to a
    # tensor; code that expects the `time` module after this point will break.
    time = torch.FloatTensor(e_time[1:]).repeat(num_samples, 1) / scale1
    finish = torch.FloatTensor(l_time[1:]).repeat(num_samples, 1) / scale1
    service = torch.FloatTensor(s_time[1:]).repeat(num_samples, 1) / scale1
    assert depo.shape == (num_samples, 2)
    assert graphs.shape == (num_samples, 100, 2)
    assert demand.shape == (num_samples, 100)
    assert time.shape == (num_samples, 100)
    assert finish.shape == (num_samples, 100)
    assert service.shape == (num_samples, 100)

    data = (depo, graphs, demand, time, finish, service)
    set_decode_type(model, "sampling")
    train_batches = FastTensorDataLoader(data[0],data[1],data[2], data[3],data[4],data[5], batch_size=5, shuffle=False)
    lst = []
    # Pure inference: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        for batch in train_batches:
            a = model(batch, return_pi = True)
            # best (smallest) cost of the batch, scaled back to the instance's units
            lst.append(torch.min(a[0]*scale1).item())
    with open('Data.txt', 'r') as f:
        name = f.readline()
    # str.rstrip returns a new string; keep it so the trailing newline is really removed
    name = name.rstrip()
    # 10 samples in batches of 5 give two entries in lst: the first goes to
    # 'AM.txt', the second to 'D-AM.txt'.
    # NOTE(review): both entries come from the same `model` - confirm that is intended.
    with open('AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[0])+ '\n')
        f.write("------------------------------"+ '\n')
    with open('D-AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[1])+ '\n')
        f.write("------------------------------"+ '\n')

# After the loop: append the last processed instance's name and distances once
# more to each results file, followed by a longer dashed line.
# NOTE(review): this repeats the final entry - presumably meant as a section divider; confirm.
with open('AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[0])+ '\n')
        f.write("----------------------------------------------------------------------------------------------"+ '\n')
with open('D-AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[1])+ '\n')
        f.write("----------------------------------------------------------------------------------------------"+ '\n')

# Second evaluation pass over the instance files stored under 'data_gen/'.
file = ['data_gen/gen0.txt', 'data_gen/gen1.txt','data_gen/gen2.txt','data_gen/gen3.txt',
        'data_gen/gen4.txt','data_gen/gen5.txt','data_gen/gen6.txt','data_gen/gen7.txt',
        'data_gen/gen8.txt','data_gen/gen9.txt']

for idx in range(len(file)):
    # Copy the instance into 'Data.txt'
    # (presumably the fixed path load_data() reads - confirm in Load_data).
    with open('Data.txt', 'w') as outfile:
        file_path = file[idx]
        with open(file_path) as infile:
            for line in infile:
                outfile.write(line)

    max_cap, xcoord, ycoord, demand, e_time, l_time, s_time, _ = load_data() 

    # Index 0 is used for the depot below; coordinates and times are divided by
    # its l_time entry, demands by the vehicle capacity.
    scale1 = l_time[0]
    scale2 = max_cap 
    num_samples = 10 
    # The same instance is replicated num_samples times to form the dataset.
    depo = torch.FloatTensor([[xcoord[0], ycoord[0]]]).repeat(num_samples, 1) / scale1
    graphs = torch.FloatTensor(np.stack((xcoord[1:], ycoord[1:]), axis=1)).repeat(num_samples, 1, 1) / scale1
    demand = torch.FloatTensor(demand[1:]).repeat(num_samples, 1) / scale2 
    # NOTE(review): this assignment rebinds the module-level name `time` to a
    # tensor; code that expects the `time` module after this point will break.
    time = torch.FloatTensor(e_time[1:]).repeat(num_samples, 1) / scale1
    finish = torch.FloatTensor(l_time[1:]).repeat(num_samples, 1) / scale1
    service = torch.FloatTensor(s_time[1:]).repeat(num_samples, 1) / scale1
    assert depo.shape == (num_samples, 2)
    assert graphs.shape == (num_samples, 100, 2)
    assert demand.shape == (num_samples, 100)
    assert time.shape == (num_samples, 100)
    assert finish.shape == (num_samples, 100)
    assert service.shape == (num_samples, 100)

    data = (depo, graphs, demand, time, finish, service)
    set_decode_type(model, "sampling")
    train_batches = FastTensorDataLoader(data[0],data[1],data[2], data[3],data[4],data[5], batch_size=5, shuffle=False)
    lst = []
    # Pure inference: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        for batch in train_batches:
            a = model(batch, return_pi = True)
            # best (smallest) cost of the batch, scaled back to the instance's units
            lst.append(torch.min(a[0]*scale1).item())
    with open('Data.txt', 'r') as f:
        name = f.readline()
    # str.rstrip returns a new string; keep it so the trailing newline is really removed
    name = name.rstrip()
    # 10 samples in batches of 5 give two entries in lst: the first goes to
    # 'AM.txt', the second to 'D-AM.txt'.
    # NOTE(review): both entries come from the same `model` - confirm that is intended.
    with open('AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[0])+ '\n')
        f.write("---------------------------"+ '\n')
    with open('D-AM.txt', 'a') as f:
        f.write(name + '\n')
        f.write("Distance: {}".format(lst[1])+ '\n')
        f.write("---------------------------"+ '\n')


