In [45]:
import torch
from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary

from rl4co.envs import PDPEnv
from rl4co.models.zoo import AttentionModel
from rl4co.utils.trainer import RL4COTrainer

In [109]:
from typing import Callable, Union

import torch

from tensordict.tensordict import TensorDict
from torch.distributions import Uniform

from rl4co.envs.common.utils import Generator, get_sampler
from rl4co.utils.pylogger import get_pylogger

log = get_pylogger(__name__)


class PDPGenerator(Generator):
    """Data generator for the Pickup and Delivery Problem (PDP).
    Args:
        num_loc: number of locations (customers) in the PDP, without the depot. (e.g. 10 means 10 locs + 1 depot)
            - 1 depot
            - `num_loc` / 2 pickup locations
            - `num_loc` / 2 delivery locations
        min_loc: minimum value for the location coordinates
        max_loc: maximum value for the location coordinates
        init_sol_type: the method type used for generating initial solutions (random or greedy)
        loc_distribution: distribution for the location coordinates
        depot_distribution: distribution for the depot location. If None, sample the depot from the locations

    Returns:
        A TensorDict with the following keys:
            locs [batch_size, num_loc, 2]: locations of each customer
            depot [batch_size, 2]: location of the depot
    """

    def __init__(
        self,
        num_loc: int = 20,
        min_loc: float = 0.0,
        max_loc: float = 1.0,
        init_sol_type: str = "random",
        loc_distribution: Union[int, float, str, type, Callable] = Uniform,
        depot_distribution: Union[int, float, str, type, Callable] = None,
        **kwargs,
    ):
        self.num_loc = num_loc
        self.min_loc = min_loc
        self.max_loc = max_loc
        self.init_sol_type = init_sol_type

        # Number of locations must be even
        if num_loc % 2 != 0:
            log.warn(
                "Number of locations must be even. Adding 1 to the number of locations."
            )
            self.num_loc += 1

        # Location distribution
        if kwargs.get("loc_sampler", None) is not None:
            self.loc_sampler = kwargs["loc_sampler"]
        else:
            self.loc_sampler = get_sampler(
                "loc", loc_distribution, min_loc, max_loc, **kwargs
            )

        # Depot distribution
        if kwargs.get("depot_sampler", None) is not None:
            self.depot_sampler = kwargs["depot_sampler"]
        else:
            self.depot_sampler = get_sampler(
                "depot", depot_distribution, min_loc, max_loc, **kwargs
            ) if depot_distribution is not None else None

    def _generate(self, batch_size) -> TensorDict:
        # Sample locations: depot and customers
        if self.depot_sampler is not None:
            depot = self.depot_sampler.sample((*batch_size, 2))
            locs = self.loc_sampler.sample((*batch_size, self.num_loc, 2)) 
            charging = self.depot_sampler.sample((*batch_size, 2))
        else:
            # if depot_sampler is None, sample the depot from the locations
            locs = self.loc_sampler.sample((*batch_size, self.num_loc + 2, 2))
            depot = locs[..., 0, :]
            charging = locs[..., 1, :]
            locs = locs[..., 2:, :]

        return TensorDict(
            {
                "locs": locs,
                "depot": depot,
                "charging": charging
            },
            batch_size=batch_size,
        )



In [760]:
from rl4co.envs.routing.pdp.render import render
# from rl4co.envs.routing.pdp.generator import PDPGenerator
from typing import Optional

import torch

from tensordict.tensordict import TensorDict
from torchrl.data import (
    BoundedTensorSpec,
    CompositeSpec,
    UnboundedContinuousTensorSpec,
    UnboundedDiscreteTensorSpec,
)

from rl4co.envs.common.base import ImprovementEnvBase, RL4COEnvBase
from rl4co.utils.ops import gather_by_index, get_tour_length, get_distance


class PDPEnv(RL4COEnvBase):
    """Pickup and Delivery Problem (PDP) environment.
    The environment is made of num_loc + 1 locations (cities):
        - 1 depot
        - `num_loc` / 2 pickup locations
        - `num_loc` / 2 delivery locations
    The goal is to visit all the pickup and delivery locations in the shortest path possible starting from the depot
    The conditions is that the agent must visit a pickup location before visiting its corresponding delivery location

    Observations:
        - locations of the depot, pickup, and delivery locations
        - current location of the vehicle
        - the remaining locations to deliver
        - the visited locations
        - the current step

    Constraints:
        - the tour starts and ends at the depot
        - each pickup location must be visited before its corresponding delivery location
        - the vehicle cannot visit the same location twice

    Finish Condition:
        - the vehicle has visited all locations

    Reward:
        - (minus) the negative length of the path

    Args:
        generator: PDPGenerator instance as the data generator
        generator_params: parameters for the generator
    """

    name = "pdp"

    def __init__(
        self,
        generator: PDPGenerator = None,
        generator_params: dict = {},
        **kwargs,
    ):
        super().__init__(**kwargs)
        if generator is None:
            generator = PDPGenerator(**generator_params)
        self.generator = generator
        self._make_spec(self.generator)

    def _step(self, td: TensorDict) -> TensorDict:
        print("action", td["action"])
        print("current_node", td["current_node"], td["current_node"].shape)
        dist_mat = self._get_distance_matrix(td["locs"])
        cur_node_to_all = gather_by_index(dist_mat, idx=td["current_node"])
        print(cur_node_to_all.shape)
        print("dist_mat_distance", gather_by_index(cur_node_to_all, idx=td["action"]))
        start_node = gather_by_index(td["locs"], idx=td["current_node"].unsqueeze(-1))
        end_node = gather_by_index(td["locs"], idx=td["action"])
        dist_traveled = get_distance(start_node, end_node)
        current_node = td["action"].unsqueeze(-1)
        print("other dist", dist_traveled)
        batch_size = td["locs"].shape[0]

        td["battery"] -= 10 * dist_traveled.view(td["battery"].shape)
        battery_check = td["battery"] < td["threshold"] 
        # if battery_check.any():
        #     print("battery low")
        #     print(td["charging_mask"])
        filtered_matrices = []
        for i in range(dist_mat.size(0)):
            matrix = dist_mat[i]
            action_index = actions[i]
            
            # Select rows excluding the action_index row
            filtered_matrix = matrix[action_index, :]
            
            # Append to the list
            filtered_matrices.append(filtered_matrix)
        
        # Stack the list of filtered matrices into a single tensor
        curr_node_to_all_dist = torch.stack(filtered_matrices)
        #         curr_node_to_all_dist[:,1] = 0 

        # find what locations can be reached with current state of charge
        exceeds_charge = td["battery"] - curr_node_to_all_dist > td["threshold"]
        num_loc = td["locs"].shape[-2] - 2  # except depot and charging station
        # Pickup and delivery node pair of selected node
        new_to_deliver = (current_node + num_loc // 2) % (num_loc + 1)
        # Set available to 0 (i.e., we visited the node)
        available = td["available"].scatter(
            -1, current_node.expand_as(td["action_mask"]), 0
        )

        to_deliver = td["to_deliver"].scatter(
            -1, new_to_deliver.expand_as(td["to_deliver"]), 1
        )

        # Action is feasible if the node is not visited and is to deliver
        # action_mask = torch.logical_and(available, to_deliver)       
        action_mask = available & to_deliver 

        # We are done there are no unvisited locations
        done = torch.count_nonzero(available, dim=-1) == 0
        print(action_mask)
        # The reward is calculated outside via get_reward for efficiency, so we set it to 0 here
        reward = torch.zeros_like(done)

        # Update step
        td.update(
            {
                "current_node": current_node,
                "available": available,
                "to_deliver": to_deliver,
                "i": td["i"] + 1,
                "action_mask": action_mask,
                "reward": reward,
                "done": done,
            }
        )
        return td

    def _reset(self, td: Optional[TensorDict] = None, batch_size=None) -> TensorDict:
        device = td.device

        locs = torch.cat((td["depot"][:, None, :], td["charging"][:, None, :], td["locs"], ), -2)
        print(self.generator.num_loc)
        # Pick is 1, deliver is 0 [batch_size, graph_size+1], [1,1...1, 0...0]
        to_deliver = torch.cat(
            [
                torch.ones(
                    *batch_size,
                    self.generator.num_loc // 2 + 2,
                    dtype=torch.bool,
                ).to(device),
                torch.zeros(
                    *batch_size,
                    self.generator.num_loc // 2 ,
                    dtype=torch.bool,
                ).to(device),
            ],
            dim=-1,
        )
        # num_loc = locs.shape[-2] - 2 
        # charging_station = torch.ones((*batch_size, 1), dtype=torch.int64).to(device)
        # new_to_deliver = (charging_station + num_loc // 2) % (num_loc + 1)
        # to_deliver = to_deliver.scatter(
        #     -1, new_to_deliver.expand_as(to_deliver), 0
        # )

        # Cannot visit depot at first step # [0,1...1] so set not available
        available = torch.ones(
            (*batch_size, self.generator.num_loc + 2), dtype=torch.bool
        ).to(device)
        action_mask = ~available.contiguous()  # [batch_size, graph_size+1]
        action_mask[..., 0] = 1  # First step is always the depot
        
        charging_mask = ~available.contiguous()
        charging_mask[..., 1] = 1
        available[..., 1] = 0

        # Other variables
        current_node = torch.zeros((*batch_size, 1), dtype=torch.int64).to(device)
        i = torch.zeros((*batch_size, 1), dtype=torch.int64).to(device)
        # battery = torch.ones(
        #     (*batch_size, self.generator.num_loc+2), dtype=torch.float32
        # ).to(device)
        battery = torch.full((*batch_size, 1), 100, dtype=torch.float32)
        threshold = torch.full(
                    (*batch_size, 1), 30, device=device
                )        
        return TensorDict(
            {
                "locs": locs,
                "current_node": current_node,
                "to_deliver": to_deliver,
                "available": available,
                "i": i,
                "action_mask": action_mask,
                "charging_mask": charging_mask,
                "battery": battery,
                "threshold": threshold
            },
            batch_size=batch_size,
        )
    
    @staticmethod
    def _get_distance_matrix(locs: torch.Tensor):
        """Compute the Manhattan distance matrix for the given coordinates.

        Args:
            locs: Tensor of shape [..., n, dim]
        """
        if locs.dtype != torch.float32 and locs.dtype != torch.float64:
            locs = locs.to(torch.float32)

            # Compute pairwise differences
        diff = locs[..., :, None, :] - locs[..., None, :, :]

        # Compute Manhattan distance
        distance_matrix = torch.sum(torch.abs(diff), dim=-1)
        return distance_matrix
    
    def _make_spec(self, generator: PDPGenerator):
        """Make the observation and action specs from the parameters."""
        self.observation_spec = CompositeSpec(
            locs=BoundedTensorSpec(
                low=generator.min_loc,
                high=generator.max_loc,
                shape=(generator.num_loc + 1, 2),
                dtype=torch.float32,
            ),
            current_node=UnboundedDiscreteTensorSpec(
                shape=(1),
                dtype=torch.int64,
            ),
            to_deliver=UnboundedDiscreteTensorSpec(
                shape=(1),
                dtype=torch.int64,
            ),
            i=UnboundedDiscreteTensorSpec(
                shape=(1),
                dtype=torch.int64,
            ),
            action_mask=UnboundedDiscreteTensorSpec(
                shape=(generator.num_loc + 1),
                dtype=torch.bool,
            ),
            shape=(),
        )
        self.action_spec = BoundedTensorSpec(
            shape=(1,),
            dtype=torch.int64,
            low=0,
            high=generator.num_loc + 1,
        )
        self.reward_spec = UnboundedContinuousTensorSpec(shape=(1,))
        self.done_spec = UnboundedDiscreteTensorSpec(shape=(1,), dtype=torch.bool)

    @staticmethod
    def _get_reward(td, actions) -> TensorDict:
        # Gather locations in order of tour (add depot since we start and end there)
        locs_ordered = torch.cat(
            [
                td["locs"][..., 0:1, :],  # depot
                gather_by_index(td["locs"], actions),  # order locations
            ],
            dim=1,
        )
        return -get_tour_length(locs_ordered)

    def check_solution_validity(self, td, actions):
        # assert (actions[:, 0] == 0).all(), "Not starting at depot"
        assert (
            torch.arange(actions.size(1), out=actions.data.new())
            .view(1, -1)
            .expand_as(actions)
            == actions.data.sort(1)[0]
        ).all(), "Not visiting all nodes"

        visited_time = torch.argsort(
            actions, 1
        )  # index of pickup less than index of delivery
        assert (
            visited_time[:, 1 : actions.size(1) // 2 + 1]
            < visited_time[:, actions.size(1) // 2 + 1 :]
        ).all(), "Deliverying without pick-up"

    def get_num_starts(self, td):
        """Only half of the nodes (i.e. pickup nodes) can be start nodes"""
        return (td["locs"].shape[-2] - 1) // 2

    def select_start_nodes(self, td, num_starts):
        """Only nodes from [1 : num_loc // 2 +1] (i.e. pickups) can be selected"""
        num_possible_starts = (td["locs"].shape[-2] - 1) // 2
        selected = (
            torch.arange(num_starts, device=td.device).repeat_interleave(td.shape[0])
            % num_possible_starts
            + 1
        )
        return selected

    @staticmethod
    def render(td: TensorDict, actions: torch.Tensor = None, ax=None):
        return render(td, actions, ax)


In [761]:
from torch import nn


class PDPInitEmbedding(nn.Module):
    """Initial embedding for the Pickup and Delivery Problem (PDP).
    Embed the following node features to the embedding space:
        - locs: x, y coordinates of the nodes (depot, pickups and deliveries separately)
           Note that pickups and deliveries are interleaved in the input.
    """

    def __init__(self, embed_dim, linear_bias=True):
        super(PDPInitEmbedding, self).__init__()
        node_dim = 2  # x, y
        self.init_embed_depot = nn.Linear(2, embed_dim, linear_bias)
        self.init_embed_charging = nn.Linear(2, embed_dim, linear_bias)
        self.init_embed_pick = nn.Linear(node_dim * 2, embed_dim, linear_bias)
        self.init_embed_delivery = nn.Linear(node_dim, embed_dim, linear_bias)

    def forward(self, td):
        depot, charging, locs = td["locs"][..., 0:1, :], td["locs"][..., 1:2, :], td["locs"][..., 2:, :]
        num_locs = locs.size(-2)
        pick_feats = torch.cat(
            [locs[:, : num_locs // 2, :], locs[:, num_locs // 2 :, :]], -1
        )  # [batch_size, graph_size//2, 4]
        delivery_feats = locs[:, num_locs // 2 :, :]  # [batch_size, graph_size//2, 2]
        depot_embeddings = self.init_embed_depot(depot)
        charging_embeddings = self.init_embed_charging(charging)
        pick_embeddings = self.init_embed_pick(pick_feats)
        delivery_embeddings = self.init_embed_delivery(delivery_feats)
        # concatenate on graph size dimension
        return torch.cat([depot_embeddings, charging_embeddings, pick_embeddings, delivery_embeddings], -2)

In [762]:
from rl4co.models.nn.env_embeddings.context import EnvContext


class PDPContext(EnvContext):
    """Context embedding for the Pickup and Delivery Problem (PDP).
    Project the following to the embedding space:
        - current node embedding
    """

    def __init__(self, embed_dim):
        super(PDPContext, self).__init__(embed_dim, embed_dim)

    def forward(self, embeddings, td):
        cur_node_embedding = self._cur_node_embedding(embeddings, td).squeeze()
        return self.project_context(cur_node_embedding)

In [763]:
# RL4CO env based on TorchRL
env = PDPEnv(generator_params=dict(num_loc=20))

In [764]:
from examples.slap import StaticEmbedding
from rl4co.models.nn.env_embeddings.context import PDPContext
from rl4co.models import AttentionModel, AttentionModelPolicy

emb_dim = 128
policy = AttentionModelPolicy(env_name=env.name, # this is actually not needed since we are initializing the embeddings!
                              embed_dim=emb_dim,
                              init_embedding=PDPInitEmbedding(emb_dim),
                              context_embedding=PDPContext(emb_dim)
)

In [765]:
# Model: default is AM with REINFORCE and greedy rollout baseline
model = AttentionModel(env,
                       baseline='rollout',
                       policy=policy,
                       train_data_size=100_000, # really small size for demo
                       val_data_size=10_000)

In [766]:
# Greedy rollouts over untrained policy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
td_init = env.reset(batch_size=[4]).to(device)
policy = model.policy.to(device)
out = policy(td_init.clone(), env, phase="test", decode_type="greedy", return_actions=True)

# Plotting
print(f"Tour lengths: {[f'{-r.item():.2f}' for r in out['reward']]}")
for td, actions in zip(td_init, out['actions'].cpu()):
    env.render(td, actions)

20
action tensor([0, 0, 0, 0])
current_node tensor([[0],
        [0],
        [0],
        [0]]) torch.Size([4, 1])
torch.Size([4, 22])
dist_mat_distance tensor([0., 0., 0., 0.])
other dist tensor([0., 0., 0., 0.])
tensor([[False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False,
         False, False]])
action tensor([11,  9,  9,  9])
current_node tensor([[0]

AssertionError: Logits contain NaNs

In [652]:
print((1 + 20 // 2) % (21))

11


In [710]:
(10 + 22 // 2) % (22 + 1)

21

In [439]:
import torch
from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary

from rl4co.envs import PDPEnv
from rl4co.models.zoo import AttentionModel
from rl4co.utils.trainer import RL4COTrainer

In [93]:
env = PDPEnv(generator_params=dict(num_loc=20))

In [94]:
model = AttentionModel(env,
                       baseline='rollout',
                       train_data_size=100_000, # really small size for demo
                       val_data_size=10_000)

C:\Users\zm0714\Documents\Projekte\rl4co-slap\venv\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
C:\Users\zm0714\Documents\Projekte\rl4co-slap\venv\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


In [95]:
td_init = env.reset(batch_size=[4])

In [96]:
td_init

TensorDict(
    fields={
        action_mask: Tensor(shape=torch.Size([4, 21]), device=cpu, dtype=torch.bool, is_shared=False),
        available: Tensor(shape=torch.Size([4, 21]), device=cpu, dtype=torch.bool, is_shared=False),
        current_node: Tensor(shape=torch.Size([4, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        depot: Tensor(shape=torch.Size([4, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([4, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        i: Tensor(shape=torch.Size([4, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        locs: Tensor(shape=torch.Size([4, 21, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        terminated: Tensor(shape=torch.Size([4, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        to_deliver: Tensor(shape=torch.Size([4, 21]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([4]),
    device=None,
    is_shared=False)

In [108]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
td_init = env.reset(batch_size=[4]).to(device)
policy = model.policy.to(device)
out = policy(td_init.clone(), env, phase="test", decode_type="greedy", return_actions=True)

# Plotting
print(f"Tour lengths: {[f'{-r.item():.2f}' for r in out['reward']]}")
for td, actions in zip(td_init, out['actions'].cpu()):
    env.render(td, actions)

KeyboardInterrupt: 