In [66]:
import numpy as np
import gym
from gym import spaces
import or_gym
# import ray
# from ray.rllib.agents import ppo
import time
# from ray import tune
# from ray.rllib.models import ModelCatalog
# from ray.rllib.models.tf.tf_modelv2 import TFModelV2
# from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
import torch
from torch import nn
import torch.nn.functional as F
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

In [2]:
# Create the vehicle-routing environment by its registered id
# (presumably registered by the `or_gym` import above — TODO confirm).
env = gym.make('VehicleRouting-v1')

In [3]:
# Inspect the action space; the recorded output is a Tuple of six Discrete sub-spaces.
env.action_space

Tuple(Discrete(5), Discrete(5), Discrete(5), Discrete(2), Discrete(2), Discrete(2))

In [65]:
# Build Multi-Headed A2C model as start

class A2CMulti(nn.Module):
    """Multi-headed actor-critic network.

    A shared three-layer ELU body feeds:

    * one value head producing a single scalar, and
    * one policy head per component of ``env.action_space`` (a single head
      when the action space is not a ``gym.spaces.Tuple``).

    Policy heads are registered as attributes named ``output_0``,
    ``output_1``, ... and their names are listed, in order, in
    ``self.outputs``.

    Parameters
    ----------
    env : gym.Env-like
        Must expose ``observation_space.shape`` and ``action_space``.
    device : str or torch.device, default 'cpu'
        Device the parameters and input tensors are placed on.
    """

    def __init__(self, env, device='cpu'):
        super().__init__()

        self.device = device
        self.env = env
        self.n_inputs = env.observation_space.shape[0]

        # Work out one output width per policy head *before* building layers.
        # Discrete spaces have shape == (), so `.shape[0]` cannot be used for
        # them; `_space_size` prefers `.n` and falls back to `.shape[0]`.
        action_space = env.action_space
        if isinstance(action_space, gym.spaces.Tuple):
            head_sizes = [self._space_size(space) for space in action_space]
            self.n_outputs = len(action_space)
        else:
            head_sizes = [self._space_size(action_space)]
            self.n_outputs = head_sizes[0]

        self.body = nn.Sequential(
            nn.Linear(self.n_inputs, 128),
            nn.ELU(),
            nn.Linear(128, 128),
            nn.ELU(),
            nn.Linear(128, 128),
            nn.ELU())

        # Value Function Head
        self.value = nn.Sequential(
            nn.Linear(128, 64),
            nn.ELU(),
            nn.Linear(64, 1))

        # Policy Function Heads: registered via setattr so they become
        # sub-modules (and therefore show up in parameters()/state_dict()).
        self.outputs = []
        for i, size in enumerate(head_sizes):
            output_name = 'output_' + str(i)
            setattr(self, output_name, self._make_head(size))
            self.outputs.append(output_name)

        # Move every sub-module to the requested device in one call.
        self.to(self.device)

    @staticmethod
    def _space_size(space):
        """Return the number of output nodes needed for ``space``."""
        try:
            return int(space.n)
        except AttributeError:
            return int(space.shape[0])

    @staticmethod
    def _make_head(n_out):
        """Build one policy head mapping the 128-wide body output to ``n_out`` logits."""
        return nn.Sequential(
            nn.Linear(128, 64),
            nn.ELU(),
            nn.Linear(64, n_out))

    def _head_logits(self, body_output):
        """Return the raw logits of every policy head, in ``self.outputs`` order."""
        return [getattr(self, name)(body_output) for name in self.outputs]

    def forward(self, state):
        """Return ``(action_probs, value)`` for ``state``.

        ``action_probs`` is a list with one softmax-normalised tensor per
        policy head; ``value`` is the value head's output.
        """
        body_output = self.get_body_output(state)
        action_probs = [F.softmax(logits, dim=-1)
                        for logits in self._head_logits(body_output)]
        return action_probs, self.value(body_output)

    def get_body_output(self, state):
        """Convert ``state`` to a float32 tensor on ``self.device`` and run the shared body."""
        # as_tensor accepts lists, numpy arrays and tensors alike and avoids a
        # copy when dtype/device already match.
        state_t = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        return self.body(state_t)

    def get_action(self, state):
        """Sample one action per policy head for a single (un-batched) state.

        Returns
        -------
        actions : np.ndarray of int16, one entry per head
        value : torch.Tensor, the value head's output (still attached to the graph)
        """
        action_probs, value = self(state)
        actions = np.zeros(len(action_probs), dtype=np.int16)
        for i, a_probs in enumerate(action_probs):
            # .cpu() is required before .numpy() when the model lives on a GPU.
            probs = a_probs.detach().cpu().numpy().astype(np.float64)
            # Renormalise in float64 so float32 rounding cannot trip
            # np.random.choice's "probabilities do not sum to 1" check.
            probs /= probs.sum()
            actions[i] = np.random.choice(probs.shape[0], p=probs)
        return actions, value

    def get_log_probs(self, state):
        """Return a list with one log-softmax tensor per policy head."""
        body_output = self.get_body_output(state)
        return [F.log_softmax(logits, dim=-1)
                for logits in self._head_logits(body_output)]

    def train(self, lr=True):
        """Create the Adam optimizer, or switch train/eval mode.

        This method has the same name as ``nn.Module.train``, which
        ``nn.Module.eval()`` calls internally with a boolean. To keep both
        uses working:

        * ``train(<bool>)`` / ``train()`` behaves exactly like
          ``nn.Module.train`` and returns ``self``.
        * ``train(<number>)`` stores ``torch.optim.Adam(self.parameters(), lr=lr)``
          in ``self.optimizer`` and returns ``self``.
        """
        if isinstance(lr, bool):
            return super().train(lr)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        return self
        

In [63]:
# Instantiate the network for the environment (default device: 'cpu').
net = A2CMulti(env)

In [64]:
# Sanity check: one log-softmax tensor per policy head for the current env state.
net.get_log_probs(env.state)

[tensor([-1.6726, -2.1915, -1.6142, -1.3281, -1.4420],
        grad_fn=<LogSoftmaxBackward>),
 tensor([-1.4329, -1.5777, -1.8338, -1.4574, -1.8184],
        grad_fn=<LogSoftmaxBackward>),
 tensor([-1.3050, -1.6029, -1.6123, -1.5413, -2.1717],
        grad_fn=<LogSoftmaxBackward>),
 tensor([-0.7181, -0.6688], grad_fn=<LogSoftmaxBackward>),
 tensor([-0.4607, -0.9965], grad_fn=<LogSoftmaxBackward>),
 tensor([-0.4587, -0.9999], grad_fn=<LogSoftmaxBackward>)]

In [42]:
# Sample one action per policy head.
# `action_probs` was previously taken from leftover kernel state; compute it
# here so the cell runs on a fresh kernel.
action_probs, _ = net(env.state)
for a_probs in action_probs:
    probs = a_probs.detach().numpy()
    action = np.random.choice(np.arange(probs.shape[0]), p=probs)
    print(action)

2
1
2
0
1
0


In [8]:
# NOTE(review): `env_` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. The recorded Discrete(2) output differs from
# `env.action_space`, so it presumably came from a different environment —
# TODO define `env_` explicitly or remove this cell.
env_.action_space

Discrete(2)

In [6]:
# True when the environment exposes a composite (Tuple) action space.
# isinstance also accepts subclasses, unlike an exact type comparison.
isinstance(env.action_space, gym.spaces.Tuple)

True

In [4]:
# Prototype: build one policy head per action sub-space, keyed by its index.
outputs = {}
for i, space in enumerate(env.action_space):
    # Spaces exposing `.n` use it as the head width; others use the
    # first dimension of their shape.
    nodes = space.n if hasattr(space, 'n') else space.shape[0]
    head_layers = (nn.Linear(128, 64), nn.ELU(), nn.Linear(64, nodes))
    outputs[i] = nn.Sequential(*head_layers)

In [5]:
# Display the prototype heads built in the previous cell.
outputs

{0: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=5, bias=True)
 ),
 1: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=5, bias=True)
 ),
 2: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=5, bias=True)
 ),
 3: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=2, bias=True)
 ),
 4: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=2, bias=True)
 ),
 5: Sequential(
   (0): Linear(in_features=128, out_features=64, bias=True)
   (1): ELU(alpha=1.0)
   (2): Linear(in_features=64, out_features=2, bias=True)
 )}

In [69]:
# MultiDiscrete equivalent of the action space: the first `num_vehicles`
# entries have `num_directions` choices, the remaining `num_vehicles`
# entries have `num_actions` choices.
x = spaces.MultiDiscrete(
    [env.num_directions] * env.num_vehicles
    + [env.num_actions] * env.num_vehicles)

In [78]:
# Draw one random action vector from the MultiDiscrete space.
x.sample()

array([2, 3, 3, 0, 0, 1])

In [79]:
# Number of direction choices per vehicle (recorded output: 5).
env.num_directions

5