In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [13]:
import numpy as np
from torch.nn.functional import one_hot
from eda.TSP import TSP_Instance, TSP_Environment, TSP_State
from eda.solveTSP_v2 import solve

def generate_data(n_cities=50, nb_sample=512):
    """
    Build a dataset of random TSP instances paired with the tour returned by `solve`.
    Inputs :
      n_cities   number of cities drawn per instance (points come from np.random.rand, i.e. the unit square)
      nb_sample  number of instances to generate
    Outputs :
      coordinates, float tensor of size (nb_sample, n_cities, 2)
      tours, float tensor stacking `solution.visited` of every instance
    """
    samples = []
    while len(samples) < nb_sample:
        points = np.random.rand(n_cities, 2)
        _instance = TSP_Instance(points)  # constructed exactly as before; the object itself is not used below
        visited = solve(points).visited
        samples.append((torch.from_numpy(points), torch.tensor(visited)))
    coords = torch.stack([pts for pts, _ in samples]).float()
    tours = torch.stack([tour for _, tour in samples]).float()
    return coords, tours

# Build the dataset used by the training cell: 5000 instances of 50 cities each.
X, Y = generate_data(n_cities=50, nb_sample=5000)
(X.shape, Y.shape)

(torch.Size([5000, 50, 2]), torch.Size([5000, 51]))

In [14]:
from torch.distributions.categorical import Categorical

def myMHA(Q, K, V, nb_heads, mask=None, clip_value=None):
    """
    Compute multi-head attention (MHA) given a query Q, key K, value V and attention mask :
      h = Concat_{k=1}^nb_heads softmax(Q_k.K_k^T / sqrt(dim_emb//nb_heads)).V_k
    Note : We did not use nn.MultiheadAttention to avoid re-computing all linear transformations at each call.
    Inputs : Q of size (bsz, 1, dim_emb)                batch of queries
             K of size (bsz, nb_nodes+1, dim_emb)       batch of keys
             V of size (bsz, nb_nodes+1, dim_emb)       batch of values
             nb_heads is the number of heads, dim_emb must be divisible by nb_heads
             mask of size (bsz, nb_nodes+1)             boolean batch of masks, True marks a key that must be ignored
             clip_value is a scalar; when given, scores become clip_value * tanh(scores)
    Outputs : attn_output of size (bsz, 1, dim_emb)     batch of attention vectors
              attn_weights of size (bsz, 1, nb_nodes+1) batch of attention weights (mean over the heads)
    """
    bsz, nb_nodes, emd_dim = K.size() # dim_emb must be divisible by nb_heads
    nb_queries = Q.size(1)
    if nb_heads > 1:
        head_dim = emd_dim // nb_heads

        def split_heads(x):
            # (bsz, length, dim_emb) -> (bsz*nb_heads, length, dim_emb//nb_heads)
            # Head k of a sample takes channels [k*head_dim, (k+1)*head_dim); rows are ordered
            # sample-major / head-minor, which is what repeat_interleave on the mask expects below.
            # Only the final reshape copies data (one copy per tensor).
            length = x.size(1)
            x = x.reshape(bsz, length, nb_heads, head_dim).transpose(1, 2)
            return x.reshape(bsz * nb_heads, length, head_dim)

        Q, K, V = split_heads(Q), split_heads(K), split_heads(V)
    attn_weights = torch.bmm(Q, K.transpose(1, 2)) / Q.size(-1) ** 0.5 # size(attn_weights)=(bsz*nb_heads, 1, nb_nodes+1)
    if clip_value is not None:
        attn_weights = clip_value * torch.tanh(attn_weights)
    if mask is not None:
        if nb_heads > 1:
            mask = torch.repeat_interleave(mask, repeats=nb_heads, dim=0) # size(mask)=(bsz*nb_heads, nb_nodes+1)
        # A large finite negative is used instead of -inf so that a fully masked row does not produce NaN in softmax
        attn_weights = attn_weights.masked_fill(mask.unsqueeze(1), float('-1e9')) # size(attn_weights)=(bsz*nb_heads, 1, nb_nodes+1)
    attn_weights = torch.softmax(attn_weights, dim=-1) # size(attn_weights)=(bsz*nb_heads, 1, nb_nodes+1)
    attn_output = torch.bmm(attn_weights, V) # size(attn_output)=(bsz*nb_heads, 1, dim_emb//nb_heads)
    if nb_heads > 1:
        # concatenate the heads back along the embedding axis
        attn_output = attn_output.view(bsz, nb_heads, nb_queries, head_dim).transpose(1, 2)
        attn_output = attn_output.reshape(bsz, nb_queries, emd_dim) # size(attn_output)=(bsz, 1, dim_emb)
        attn_weights = attn_weights.view(bsz, nb_heads, nb_queries, nb_nodes) # size(attn_weights)=(bsz, nb_heads, 1, nb_nodes+1)
        attn_weights = attn_weights.mean(dim=1) # mean over the heads, size(attn_weights)=(bsz, 1, nb_nodes+1)
    return attn_output, attn_weights
    
    
class AutoRegressiveDecoderLayer(nn.Module):
    """
    Single decoder layer based on self-attention and query-attention
    Inputs :  
      h_t of size      (bsz, 1, dim_emb)          batch of input queries (any shape viewable as (bsz, 1, dim_emb))
      K_att of size    (bsz, nb_nodes+1, dim_emb) batch of query-attention keys
      V_att of size    (bsz, nb_nodes+1, dim_emb) batch of query-attention values
      mask of size     (bsz, nb_nodes+1)          batch of masks of visited cities
    Output :  
      h_t of size (bsz, dim_emb)                  batch of transformed queries
    State :
      K_sa, V_sa cache the self-attention keys/values of all queries seen since the last
      call to reset_selfatt_keys_values(); they grow by one entry along dim 1 per forward call.
    """
    def __init__(self, dim_emb, nb_heads):
        super(AutoRegressiveDecoderLayer, self).__init__()
        self.dim_emb = dim_emb
        self.nb_heads = nb_heads
        self.Wq_selfatt = nn.Linear(dim_emb, dim_emb)
        self.Wk_selfatt = nn.Linear(dim_emb, dim_emb)
        self.Wv_selfatt = nn.Linear(dim_emb, dim_emb)
        self.W0_selfatt = nn.Linear(dim_emb, dim_emb)
        self.W0_att = nn.Linear(dim_emb, dim_emb)
        self.Wq_att = nn.Linear(dim_emb, dim_emb)
        self.W1_MLP = nn.Linear(dim_emb, dim_emb)
        self.W2_MLP = nn.Linear(dim_emb, dim_emb)
        # Despite the BN_ prefix these are LayerNorm modules (names kept so state_dict keys do not change)
        self.BN_selfatt = nn.LayerNorm(dim_emb)
        self.BN_att = nn.LayerNorm(dim_emb)
        self.BN_MLP = nn.LayerNorm(dim_emb)
        self.K_sa = None
        self.V_sa = None

    def reset_selfatt_keys_values(self):
        """Drop the cached self-attention keys and values; call before decoding a new batch."""
        self.K_sa = None
        self.V_sa = None
        
    def forward(self, h_t, K_att, V_att, mask):
        bsz = h_t.size(0)
        h_t = h_t.view(bsz, 1, self.dim_emb) # size(h_t)=(bsz, 1, dim_emb)
        # embed the query for self-attention
        q_sa = self.Wq_selfatt(h_t) # size(q_sa)=(bsz, 1, dim_emb)
        k_sa = self.Wk_selfatt(h_t) # size(k_sa)=(bsz, 1, dim_emb)
        v_sa = self.Wv_selfatt(h_t) # size(v_sa)=(bsz, 1, dim_emb)
        # concatenate the new self-attention key and value to the previous keys and values
        if self.K_sa is None:
            self.K_sa = k_sa # size(self.K_sa)=(bsz, 1, dim_emb)
            self.V_sa = v_sa # size(self.V_sa)=(bsz, 1, dim_emb)
        else:
            self.K_sa = torch.cat([self.K_sa, k_sa], dim=1)
            self.V_sa = torch.cat([self.V_sa, v_sa], dim=1)
        # compute self-attention between nodes in the partial tour
        h_t = h_t + self.W0_selfatt( myMHA(q_sa, self.K_sa, self.V_sa, self.nb_heads)[0] ) # size(h_t)=(bsz, 1, dim_emb)
        # LayerNorm normalises the last dimension only, so it is applied directly on (bsz, 1, dim_emb);
        # this avoids a bare squeeze() that would also drop the batch axis when bsz == 1
        h_t = self.BN_selfatt(h_t) # size(h_t)=(bsz, 1, dim_emb)
        # compute attention between self-attention nodes and encoding nodes in the partial tour (translation process)
        q_a = self.Wq_att(h_t) # size(q_a)=(bsz, 1, dim_emb)
        h_t = h_t + self.W0_att( myMHA(q_a, K_att, V_att, self.nb_heads, mask)[0] ) # size(h_t)=(bsz, 1, dim_emb)
        h_t = self.BN_att(h_t) # size(h_t)=(bsz, 1, dim_emb)
        # MLP
        h_t = h_t + self.W2_MLP(torch.relu(self.W1_MLP(h_t)))
        h_t = self.BN_MLP(h_t.squeeze(1)) # size(h_t)=(bsz, dim_emb)
        return h_t
        
        
class Transformer_decoder_net(nn.Module): 
    """
    Stack of (nb_layers_decoder-1) multi-head AutoRegressiveDecoderLayer modules followed by
    one single-head attention whose weights are returned as the next-node distribution.
    Inputs :  
      h_t of size      (bsz, 1, dim_emb)                            batch of input queries
      K_att of size    (bsz, nb_nodes+1, dim_emb*nb_layers_decoder) query-attention keys, one dim_emb-wide slice per layer
      V_att of size    (bsz, nb_nodes+1, dim_emb*nb_layers_decoder) query-attention values, one dim_emb-wide slice per layer
      mask of size     (bsz, nb_nodes+1)                            batch of masks of visited cities
    Output :  
      prob_next_node of size (bsz, nb_nodes+1)                      batch of probabilities of next node
    """
    def __init__(self, dim_emb, nb_heads, nb_layers_decoder):
        super(Transformer_decoder_net, self).__init__()
        self.dim_emb = dim_emb
        self.nb_heads = nb_heads
        self.nb_layers_decoder = nb_layers_decoder
        intermediate = [AutoRegressiveDecoderLayer(dim_emb, nb_heads) for _ in range(nb_layers_decoder-1)]
        self.decoder_layers = nn.ModuleList(intermediate)
        self.Wq_final = nn.Linear(dim_emb, dim_emb)
        
    def reset_selfatt_keys_values(self): 
        """Clear the cached self-attention keys/values of every intermediate layer (call when decoding starts)."""
        for layer in self.decoder_layers:
            layer.reset_selfatt_keys_values()
            
    def forward(self, h_t, K_att, V_att, mask):
        width = self.dim_emb
        final_layer = self.nb_layers_decoder - 1
        for l in range(self.nb_layers_decoder):
            lo, hi = l * width, (l + 1) * width
            keys_l = K_att[:, :, lo:hi].contiguous()    # size(keys_l)=(bsz, nb_nodes+1, dim_emb)
            values_l = V_att[:, :, lo:hi].contiguous()  # size(values_l)=(bsz, nb_nodes+1, dim_emb)
            if l != final_layer:
                # intermediate layer : multi-head decoder layer
                h_t = self.decoder_layers[l](h_t, keys_l, values_l, mask)
                continue
            # final layer : single head, scores clipped with clip_value=10, only the attention weights are kept
            query = self.Wq_final(h_t).view(h_t.size(0), 1, self.dim_emb)
            _, attn_weights = myMHA(query, keys_l, values_l, 1, mask, 10)
        return attn_weights.squeeze(1)


class Encoder(nn.Module):
    """
    Transformer encoder : nb_layers blocks of (self-attention + residual + norm, feed-forward + residual + norm).
    Constructor arguments (defaults reproduce the previously hard-coded configuration) :
      num_heads   number of attention heads
      head_dim    width of each head, the embedding size is num_heads * head_dim
      ff_dim      hidden width of the feed-forward sub-layer
      nb_layers   number of encoder blocks
      batchnorm   use nn.BatchNorm1d when True, nn.LayerNorm otherwise
    Inputs :
      h of size (bsz, nb_nodes, dim_emb)      batch of node embeddings
    Outputs :
      h of size (bsz, nb_nodes, dim_emb)      batch of encoded nodes
      score of size (bsz, nb_nodes, nb_nodes) attention weights returned by the last layer (None if nb_layers is 0)
    """
    def __init__(self, num_heads=16, head_dim=8, ff_dim=128, nb_layers=6, batchnorm=True):
        super().__init__()
        self.input_dim = 6  # input dimension (not used by this class)
        self.num_heads = num_heads  # number of heads in multi-head attention
        self.head_dim = head_dim  # dimension of each head
        self.node_dim = 2
        self.embd_dim = self.num_heads * self.head_dim
        self.ff_dim = ff_dim
        self.nb_layers = nb_layers
        self.batchnorm = batchnorm
        
        self.MHA_layers = nn.ModuleList( [nn.MultiheadAttention(self.embd_dim, self.num_heads) for _ in range(self.nb_layers)] )
        self.linear1_layers = nn.ModuleList( [nn.Linear(self.embd_dim, self.ff_dim) for _ in range(self.nb_layers)] )
        self.linear2_layers = nn.ModuleList( [nn.Linear(self.ff_dim, self.embd_dim) for _ in range(self.nb_layers)] )   
        norm_cls = nn.BatchNorm1d if self.batchnorm else nn.LayerNorm
        self.norm1_layers = nn.ModuleList( [norm_cls(self.embd_dim) for _ in range(self.nb_layers)] )
        self.norm2_layers = nn.ModuleList( [norm_cls(self.embd_dim) for _ in range(self.nb_layers)] )

        # Not used in forward; kept so that the module's state_dict keys stay unchanged
        self.norm = nn.BatchNorm1d(self.embd_dim)

    def _normalize(self, norm, h):
        """Apply `norm` to h of size (nb_nodes, bsz, dim_emb) and return a tensor of the same size."""
        if not self.batchnorm:
            return norm(h)
        # Pytorch nn.BatchNorm1d requires input size (bsz, dim, seq_len)
        h = h.permute(1,2,0).contiguous() # size(h)=(bsz, dim_emb, nb_nodes)
        h = norm(h)                       # size(h)=(bsz, dim_emb, nb_nodes)
        return h.permute(2,0,1).contiguous() # size(h)=(nb_nodes, bsz, dim_emb)

    def forward(self, h):      
        # PyTorch nn.MultiheadAttention requires input size (seq_len, bsz, dim_emb) 
        h = h.transpose(0,1) # size(h)=(nb_nodes, bsz, dim_emb)  
        score = None
        # L layers
        for i in range(self.nb_layers):
            # self-attention with residual connection
            h_att, score = self.MHA_layers[i](h, h, h) # size(h_att)=(nb_nodes, bsz, dim_emb), size(score)=(bsz, nb_nodes, nb_nodes)
            h = self._normalize(self.norm1_layers[i], h + h_att)
            # feedforward with residual connection
            h_ff = self.linear2_layers[i](torch.relu(self.linear1_layers[i](h)))
            h = self._normalize(self.norm2_layers[i], h + h_ff)
        # Transpose h
        h = h.transpose(0,1) # size(h)=(bsz, nb_nodes, dim_emb)
        return h, score

def generate_positional_encoding(d_model, max_len):
    """
    Create standard transformer PEs.
    Even columns hold sin(pos * w_k) and odd columns hold cos(pos * w_k), with w_k = 10000^(-2k/d_model).
    Inputs :  
      d_model is a scalar corresponding to the hidden dimension (even or odd)
      max_len is the maximum length of the sequence
    Output :  
      pe of size (max_len, d_model)
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
    angles = position * div_term # size(angles)=(max_len, ceil(d_model/2))
    pe[:,0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even column, so drop the last frequency
    pe[:,1::2] = torch.cos(angles[:, :d_model // 2])
    return pe


class Decoder(nn.Module):
    """
    Holder of decoder hyper-parameters. It registers no sub-modules and defines no forward pass.
    """
    def __init__(self):
        super().__init__()
        # Model parameters
        self.input_dim, self.node_dim = 6, 2    # input dimension / dimension of a node
        self.num_heads, self.head_dim = 16, 8   # number of attention heads / dimension of each head
        self.embd_dim = self.num_heads * self.head_dim
        self.ff_dim = 256
        self.nb_dec_layers = 2
        self.nb_nodes = 50

class CustomModel(nn.Module):
    """
    Encoder/decoder network that builds a tour autoregressively.
    The node coordinates are embedded, a learnable start placeholder is appended as an extra node,
    everything is encoded by `Encoder`, and `Transformer_decoder_net` then selects one unvisited
    node per step until every input node has been selected.
    Inputs :
      x of size (bsz, nb_nodes, node_dim)   batch of node coordinates
      deterministic                         take the argmax at each step when True, sample otherwise
    Outputs :
      tours of size (bsz, nb_nodes)         indices of the selected nodes, in selection order
      sumLog of size (bsz)                  sum over the steps of the log-probability of each selected node
    """
    def __init__(self):
        super().__init__()
        # Model parameters
        self.input_dim = 6  # input dimension (not used by this class)
        self.num_heads = 16  # number of heads in multi-head attention
        self.head_dim = 8  # dimension of each head
        self.node_dim = 2
        self.embd_dim = self.num_heads * self.head_dim
        self.ff_dim=256
        self.nb_dec_layers = 2
        self.nb_nodes=50  # kept for callers that read it; forward takes the node count from x
        max_len_PE = 10000

        self.input_emb = nn.Linear(self.node_dim, self.embd_dim)
        self.ff = nn.Linear(self.embd_dim, 1)  # not used in forward; kept so state_dict keys stay unchanged
        self.start_placeholder = nn.Parameter(torch.randn(self.embd_dim))

        self.enc = Encoder()
        
        self.WKatt_dec = nn.Linear(self.embd_dim, self.nb_dec_layers * self.embd_dim)
        self.WVatt_dec = nn.Linear(self.embd_dim, self.nb_dec_layers * self.embd_dim)
        # Non-persistent buffer : follows the module across devices with .to(), stays out of the state_dict
        self.register_buffer("PE", generate_positional_encoding(self.embd_dim, max_len_PE), persistent=False)

        self.dec = Transformer_decoder_net(self.embd_dim, self.num_heads, self.nb_dec_layers)

    def forward(self, x, deterministic=False):
        # x: (bsz, nb_nodes, dim)
        bsz, nb_nodes = x.shape[0], x.shape[1]
        device = x.device
        if nb_nodes + 1 > self.PE.size(0):
            raise ValueError(f"nb_nodes={nb_nodes} exceeds the positional encoding length {self.PE.size(0)}")
        zero_to_bsz = torch.arange(bsz, device=device)

        # The decoder layers cache self-attention keys/values across calls; without this reset the
        # cache of a previous forward pass would be concatenated to (and kept alive by) this one
        self.dec.reset_selfatt_keys_values()

        # embed the nodes and append the start placeholder as node index nb_nodes
        h = self.input_emb(x) # size(h)=(bsz, nb_nodes, dim_emb)
        h = torch.cat([h, self.start_placeholder.expand(bsz, 1, -1)], dim=1) # size(h)=(bsz, nb_nodes+1, dim_emb)

        h_enc, _ = self.enc(h) # size(h_enc)=(bsz, nb_nodes+1, dim_emb)
        Katt_dec = self.WKatt_dec(h_enc) # size(Katt_dec)=(bsz, nb_nodes+1, dim_emb*nb_dec_layers)
        Vatt_dec = self.WVatt_dec(h_enc) # size(Vatt_dec)=(bsz, nb_nodes+1, dim_emb*nb_dec_layers)

        # the placeholder is masked from the start so it can never be selected
        mask_visited_nodes = torch.zeros(bsz, nb_nodes + 1, dtype=torch.bool, device=device)
        mask_visited_nodes[:, nb_nodes] = True

        # first query : encoded start placeholder + positional encoding of step 0
        h_t = h_enc[:, nb_nodes, :] + self.PE[0] # size(h_t)=(bsz, dim_emb)

        tours = []
        log_probs = []
        for t in range(nb_nodes):
            prob_next_node = self.dec(h_t, Katt_dec, Vatt_dec, mask_visited_nodes) # size(prob_next_node)=(bsz, nb_nodes+1)
            if deterministic:
                idx = torch.argmax(prob_next_node, dim=1)
            else:
                idx = Categorical(prob_next_node).sample()
            # accumulate log-probabilities (not raw probabilities) of the selected nodes
            log_probs.append(torch.log(prob_next_node[zero_to_bsz, idx]))
            tours.append(idx)

            # next query : encoding of the selected node + positional encoding of the next step
            h_t = h_enc[zero_to_bsz, idx, :] + self.PE[t + 1]

            # clone before updating so the mask tensors already handed to the decoder are not modified in place
            mask_visited_nodes = mask_visited_nodes.clone()
            mask_visited_nodes[zero_to_bsz, idx] = True

        sumLog = torch.stack(log_probs, dim=1).sum(dim=1) # size(sumLog)=(bsz)
        tours = torch.stack(tours, dim=1) # size(tours)=(bsz, nb_nodes)

        return tours, sumLog

In [15]:
# Training cell: moves the model and data to the GPU, splits the dataset 50/50 into
# train/test loaders, then runs `epochs` passes logging loss/accuracy per epoch into `df`.
# NOTE(review): as written this cell cannot run to completion; see the NOTE(review) comments below.
from torch.utils.data import DataLoader, TensorDataset, random_split
model = CustomModel()
# NOTE(review): assumes a CUDA device is available; there is no CPU fallback.
device = torch.device("cuda")
model = model.to(device)
X = X.to(device)
Y= Y.to(device)
import pandas as pd
batch_size=512
# Assuming X_padded and Y_stacked are already defined and are PyTorch tensors
# (the tensors actually used here are X and Y)
dataset = TensorDataset(X, Y)

# Split the dataset into training and test sets
train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the model, the loss function and the optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Initialize the DataFrame to store training results
df = pd.DataFrame(columns=["Model Name", "cities", "iter", "Epoch",
                         "Training Loss", "Training Accuracy",
                         "Validation Loss", "Validation Accuracy"])


print("Entrenando modelo...")
epochs = 10
city_count = 50
num_iter=1
for epoch in range(epochs):
  model.train()
  train_loss = 0
  correct = 0; total = 0
  for X_batch, y_batch in train_loader:
      optimizer.zero_grad()  # clear the gradients
      # NOTE(review): CustomModel.forward returns the tuple (tours, sumLog), not logits.
      outputs = model(X_batch)
      # NOTE(review): outputs[0] holds the selected node indices (argmax / sampled), which carry
      # no gradient, and y_batch is the float tour tensor; CrossEntropyLoss cannot train on this pair.
      loss = loss_function(outputs[0], y_batch)  # compute the loss
      loss.backward()  # backward pass
      optimizer.step()  # update parameters
      train_loss += loss.item() * X_batch.size(0)
      # NOTE(review): `outputs` is a tuple, so `outputs.data` raises AttributeError.
      _, predicted = torch.max(outputs.data, 1)
      total += y_batch.size(0)
      correct += (predicted == y_batch).sum().item()

  train_loss /= len(train_loader.dataset)
  train_accuracy = 100 * correct / total
  
  # Validation
  model.eval()
  validation_loss = 0
  correct = 0; total = 0
  with torch.no_grad():
      for X_batch, y_batch in test_loader:
          outputs = model(X_batch)
          loss = loss_function(outputs[0], y_batch)
          validation_loss += loss.item() * X_batch.size(0)
          _, predicted = torch.max(outputs.data, 1)
          total += y_batch.size(0)
          correct += (predicted == y_batch).sum().item()
  validation_loss /= len(test_loader.dataset)
  validation_accuracy = 100 * correct / total


  # Log results to DataFrame (one concat per epoch)
  df = pd.concat([df, pd.DataFrame([{
        "Model Name": 'CustomModel',
        "cities": city_count,
        "iter": num_iter,
        "Epoch": epoch +1,
        "Training Loss": train_loss,
        "Training Accuracy": train_accuracy,
        "Validation Loss": validation_loss,
        "Validation Accuracy": validation_accuracy
    }])], ignore_index=True)

  print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')
  print(f'Epoch {epoch+1}, Val Loss: {validation_loss:.4f}, Val Accuracy: {validation_accuracy:.2f}%')

# NOTE(review): `return` outside a function is a SyntaxError; a bare `df` would display the frame.
return df;


Entrenando modelo...


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 10.53 GiB is allocated by PyTorch, and 690.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)