In [2]:
import numpy as np
import tensorflow as tf
from lib.grid_world import GridWorld
from lib.dqn import DQNAgent, ReplayBuffer, DQN
from lib.util import train_dqn, test_agent, render_path

# Back to the GridWorld
We explored DQNs and applied them to the CartPole environment which has a continuous observation space and a discrete action space. Today, we're going to revisit our first environment: GridWorld.

Unlike the CartPole, the state in GridWorld is represented by the agent's position on the grid, which is a discrete value. This difference in state representation brings us to an important point: the use of one-hot encoding for discrete states.

One-hot encoding is a way of converting discrete (categorical) variables into a numerical form that machine learning algorithms can consume. Each category is mapped to a vector that is 0 everywhere except for a single 1 at the index belonging to that category. This method is important because it allows us to use categorical data in algorithms that require numerical input. It also prevents the algorithm from misinterpreting arbitrary category numbers as if they carried an order or a magnitude.

In our previous session, we've implicitly used a form of one-hot encoding for actions. Our Q network took only the state as input and output a vector, with each element representing a possible action. We then selected the action corresponding to the maximum value in the vector. This is similar to one-hot encoding, where each action corresponds to a unique element in the output vector. The use of the argmax function to select the action is analogous to selecting the '1' in a one-hot encoded vector.

By using one-hot encoding for both the states and actions in GridWorld, we can ensure that our DQN accurately interprets the discrete nature of the environment and makes optimal decisions.

`DQNGridWorld` extends the base `GridWorld` class to create a new environment suitable for a DQN agent. The state in this environment is represented by the agent's one-hot encoded position, which is produced by the new `get_observation` method. The `reset` and `step` methods are overridden to return this observation; `step` additionally adds a small negative reward for each step, encouraging the agent to find faster solutions.

In [4]:
# GridWorld variant for a dense DQN: the observation is the agent's position, one-hot encoded
class DQNGridWorld(GridWorld):
    """GridWorld whose `reset` and `step` return a flattened one-hot encoding of the agent's cell."""

    def get_observation(self):
        """Return a 1-D array with a single 1 at the agent's current cell and 0 elsewhere."""
        one_hot = np.zeros(self.world_shape)
        one_hot[tuple(self.agent_current_pos)] = 1
        # Row-major flattening turns the grid into the network's input vector
        return one_hot.flatten()

    def reset(self):
        """Reset the underlying GridWorld and return the initial observation."""
        super().reset()
        return self.get_observation()

    def step(self, action):
        """Apply `action` and return `(observation, reward, done)`.

        The state returned by the base class is discarded in favour of the
        one-hot observation, and every step costs 0.01 so that shorter paths
        yield a higher return.
        """
        _base_state, reward, done = super().step(action)
        reward -= 0.01
        return self.get_observation(), reward, done

Now we test our environment and model. First we initialize the environment, the agent and the replay buffer. Then we train the agent while periodically validating its performance. Finally, we test the trained agent and render its path.

In [5]:
# Small 3x4 world used as a first sanity check
world_shape = (3, 4)
# the agent starts in the bottom-left corner
agent_init_pos = (2, 0)
# a single blocked cell in the middle of the grid
blocking_states = [(1, 1)]
# rewards keyed by position: +1 in the top-right corner, -1 directly below it
reward_states = {(0, 3): 1, (1, 3): -1}
# every rewarded cell is also terminal (same order as the reward dictionary)
terminal_states = list(reward_states)
# Build the environment and inspect the initial one-hot observation
env = DQNGridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
obs = env.reset()
env.render()
print(obs)
print(obs.shape)

[[ 0.  0.  0.  1.]
 [ 0.  8.  0. -1.]
 [ 4.  0.  0.  0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
(12,)


In [6]:
# Create the DQN agent for the small world. `obs` is the observation returned by
# env.reset() in the previous cell, so obs.shape is (12,) here (see the printed output above).
agent = DQNAgent(env.action_space, obs.shape)
# Create the replay buffer using the defaults defined in lib.dqn
replay_buffer = ReplayBuffer()

In [7]:
# Train the DQN agent on the small world. train_dqn (lib.util) prints one line per
# epoch with the loss and the average evaluation return (see the output below).
# NOTE(review): n_steps is presumably the number of environment steps per epoch and
# eval_after_steps the evaluation interval — confirm against lib.util.train_dqn.
train_dqn(env, agent, replay_buffer, n_epochs=40, n_steps=400, eval_after_steps=1)

epoch 1, loss 0.007327901264943648, avg_return -0.2
epoch 2, loss 0.0050391774657327915, avg_return -0.2
epoch 3, loss 0.0045160068566474365, avg_return -0.2
epoch 4, loss 0.0036988739029766293, avg_return -0.2
epoch 5, loss 0.0034128879033232806, avg_return -0.2
epoch 6, loss 0.003345584276758018, avg_return -0.2
epoch 7, loss 0.0035492531078489264, avg_return -0.2
epoch 8, loss 0.0037231964161037467, avg_return -0.2
epoch 9, loss 0.004095938687896705, avg_return -0.2
epoch 10, loss 0.004443292569703772, avg_return 0.95
epoch 11, loss 0.005256044471025234, avg_return 0.95
epoch 12, loss 0.005523191048268927, avg_return 0.95
epoch 13, loss 0.0053218105604173616, avg_return 0.95
epoch 14, loss 0.005325290167093044, avg_return 0.95
epoch 15, loss 0.005272434573271312, avg_return 0.95
epoch 16, loss 0.004802814710274106, avg_return 0.95
epoch 17, loss 0.004224048729156493, avg_return 0.95
epoch 18, loss 0.003963031776947901, avg_return 0.95
epoch 19, loss 0.00358435119051137, avg_return 0

In [8]:
# Test the trained agent on the small world: test_agent returns the episode's
# cumulated reward together with the path that was taken.
cumulated_reward, path = test_agent(agent, env)
print('cumulated reward', cumulated_reward)
# Draw the path on top of the grid (path cells appear as 3 in the output below)
render_path(env, path)

cumulated reward 0.95
[[ 0.  0.  3.  3.]
 [ 0.  8.  3. -1.]
 [ 3.  3.  3.  0.]]


The model should also perform well on larger and more complex environments.

In [9]:
# Larger 8x8 world: a wall in column 3 with a single gap at row 4
world_shape = (8, 8)
# the agent starts in the bottom-left corner
agent_init_pos = (7, 0)
# blocking states: every cell of column 3 except the gap in row 4
blocking_states = [(row, 3) for row in range(world_shape[0]) if row != 4]
# rewards keyed by position: +1 in the top-right corner, -1 directly below it
reward_states = {(0, 7): 1, (1, 7): -1}
# every rewarded cell is also terminal (same order as the reward dictionary)
terminal_states = list(reward_states)
# Build the environment and inspect the initial one-hot observation
env_large = DQNGridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
obs = env_large.reset()
env_large.render()
print(obs)
print(obs.shape)

[[ 0.  0.  0.  8.  0.  0.  0.  1.]
 [ 0.  0.  0.  8.  0.  0.  0. -1.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 4.  0.  0.  8.  0.  0.  0.  0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
(64,)


In [10]:
# Create the DQN agent for the 8x8 world. `obs` was reassigned by the previous cell,
# so obs.shape is now (64,) (see the printed output above).
agent_large = DQNAgent(env_large.action_space, obs.shape)
# Create a separate replay buffer so no transitions from the small world are reused
replay_buffer_large = ReplayBuffer()

In [11]:
# Train the DQN agent on the 8x8 world with the same training settings as the small world.
# NOTE(review): n_steps is presumably the number of environment steps per epoch and
# eval_after_steps the evaluation interval — confirm against lib.util.train_dqn.
train_dqn(env_large, agent_large, replay_buffer_large, n_epochs=40, n_steps=400, eval_after_steps=1)

epoch 1, loss 0.002964810000776197, avg_return -0.2
epoch 2, loss 0.0022253918050409993, avg_return -0.2
epoch 3, loss 0.0013253529696157784, avg_return -0.2
epoch 4, loss 0.0008619869963695237, avg_return -0.2
epoch 5, loss 0.0005547847863454081, avg_return -0.2
epoch 6, loss 0.0004024379597922234, avg_return -0.2
epoch 7, loss 0.00028751534841831017, avg_return -0.2
epoch 8, loss 0.0002197890057686891, avg_return -0.2
epoch 9, loss 0.0001840024727925993, avg_return -0.2
epoch 10, loss 0.0001392982356946959, avg_return -0.2
epoch 11, loss 0.00011619759885661551, avg_return -0.2
epoch 12, loss 9.345242403924203e-05, avg_return -0.2
epoch 13, loss 8.035337353362593e-05, avg_return -0.2
epoch 14, loss 7.650910904999364e-05, avg_return -0.2
epoch 15, loss 7.700516340491959e-05, avg_return -0.2
epoch 16, loss 5.311181911338281e-05, avg_return -0.2
epoch 17, loss 5.654819543110534e-05, avg_return -0.2
epoch 18, loss 6.076910958086046e-05, avg_return -0.2
epoch 19, loss 7.441735465363308e-05

In [12]:
# Test the trained agent on the 8x8 world: test_agent returns the episode's
# cumulated reward together with the path that was taken.
cumulated_reward, path = test_agent(agent_large, env_large)
print('cumulated reward', cumulated_reward)
# Draw the path on top of the grid (path cells appear as 3 in the output below)
render_path(env_large, path)

cumulated reward 0.86
[[ 0.  0.  0.  8.  0.  0.  3.  3.]
 [ 0.  0.  0.  8.  0.  3.  3. -1.]
 [ 0.  0.  0.  8.  0.  3.  0.  0.]
 [ 0.  0.  0.  8.  3.  3.  0.  0.]
 [ 0.  0.  3.  3.  3.  0.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 0.  0.  3.  8.  0.  0.  0.  0.]
 [ 3.  3.  3.  8.  0.  0.  0.  0.]]


## Convolutional DQN

Convolutional Neural Networks (CNNs) are particularly well-suited for processing grid-like data, such as images. In the case of our GridWorld environment, the state can be represented as a grid where each cell corresponds to a particular state of the environment. This grid-like structure makes it a good fit for a CNN.

CNNs have the ability to automatically and adaptively learn spatial hierarchies of features. This is particularly useful in our GridWorld scenario where the agent's decision at a particular location may depend on the surrounding cells. For example, the agent may need to avoid a blocking state that is nearby or move towards a reward state. The convolutional layers can learn to recognize these spatial patterns and make decisions accordingly.

Convolutional DQNs extend the traditional DQN by replacing the fully connected layers with convolutional layers. This allows the ConvDQN to take advantage of the spatial structure in the input data, which can lead to more efficient learning.

In contrast, fully connected DQNs, while powerful, do not take into account the spatial structure of the input data. The network does not have a built-in mechanism to learn spatial hierarchies of features. This can make fully connected DQNs less efficient at learning in environments where spatial relationships are important.

In general, ConvDQNs can be useful in any scenario where the input data has a grid-like structure. This includes not only game environments like GridWorld, but also real-world applications such as image processing, robotics, and any other task where spatial relationships in the input data are important.


The `ConvDQN` class extends the base `DQN` class to create a new DQN model with convolutional layers. The `__init__` method is overridden to initialize the layers of the model with convolutional layers, a flatten layer, and a dense layer.


In [13]:
# DQN model whose hidden layers are convolutional instead of dense
class ConvDQN(DQN):
    """DQN variant: a stack of Conv2D layers, then Flatten, then a Dense layer with one unit per action."""

    def __init__(self, filters=(32, 16, 8, 8), kernel_size=(3, 3), n_actions=4, **kwargs):
        """Build the layer list.

        Args:
            filters: number of filters of each successive Conv2D layer.
            kernel_size: kernel size shared by all Conv2D layers.
            n_actions: number of units of the final Dense layer.
            **kwargs: forwarded unchanged to the base `DQN` constructor.
        """
        super().__init__(**kwargs)
        # 'same' padding keeps the spatial size of the grid through every convolution
        conv_stack = [
            tf.keras.layers.Conv2D(n_filters, kernel_size, activation='relu', padding='same')
            for n_filters in filters
        ]
        # Flatten the feature maps into one vector per sample, then map it to one output per action
        head = [tf.keras.layers.Flatten(), tf.keras.layers.Dense(n_actions)]
        self.layers = conv_stack + head

The `ConvDQNAgent` class extends the base `DQNAgent` class to create a new DQN agent that uses a convolutional DQN model.  Only the `__init__` method is overridden to initialize the model and the target model with a convolutional DQN model instead of a dense DQN model.

In [14]:
# DQN agent that uses ConvDQN for both its online and its target network
class ConvDQNAgent(DQNAgent):
    """DQNAgent whose model and target model are `ConvDQN` instances."""

    def __init__(self, action_space, observation_shape, epsilon=0.9, gamma=0.95):
        """Set up the agent.

        The base-class constructor is not invoked; every attribute is assigned here.

        Args:
            action_space: action space of the environment; its `n` attribute sets
                the number of network outputs.
            observation_shape: shape of one observation, passed to `_init_networks`.
            epsilon: exploration vs exploitation parameter.
            gamma: discount factor.
        """
        self.action_space = action_space
        self.epsilon = epsilon
        self.gamma = gamma
        n_actions = action_space.n
        # Online network first, then the target network with the same architecture
        self.model = ConvDQN(n_actions=n_actions)
        self.target_model = ConvDQN(n_actions=n_actions)
        self.optimizer = tf.keras.optimizers.Adam()
        # Initialize both networks for the given observation shape
        self._init_networks(observation_shape)

The `ConvDQNGridWorld` class extends the `DQNGridWorld` class to create a new environment where the state is represented by the entire grid. We only override the `get_observation` method to enable convolutional DQN agents to capture spatial dependencies.

In [15]:
# GridWorld variant for a convolutional DQN: the observation is the whole grid, one channel per feature
class ConvDQNGridWorld(DQNGridWorld):
    """DQNGridWorld whose observation stacks four grid-shaped planes along the last axis."""

    def get_observation(self):
        """Return an array of shape `world_shape + (4,)`.

        Channels, in order: agent position, positive rewards, non-positive
        rewards, blocking states. Rewards that were already collected are left out.
        """
        shape = self.world_shape
        agent_plane = np.zeros(shape)
        agent_plane[tuple(self.agent_current_pos)] = 1

        positive_plane = np.zeros(shape)
        negative_plane = np.zeros(shape)
        for cell, value in self.reward_states.items():
            if cell in self.collected_rewards:
                continue
            # The planes hold the reward value itself, not just a 0/1 marker
            target_plane = positive_plane if value > 0 else negative_plane
            target_plane[cell] = value

        wall_plane = np.zeros(shape)
        for cell in self.blocking_states:
            wall_plane[cell] = 1

        return np.stack([agent_plane, positive_plane, negative_plane, wall_plane], axis=-1)

Now we test our environment and model. First we initialize the environment, the agent and the replay buffer. Then we train the agent while periodically validating its performance. Finally, we test the trained agent and render its path.

In [16]:
# Same 8x8 layout as before, now wrapped in the convolutional environment
world_shape = (8, 8)
# the agent starts in the bottom-left corner
agent_init_pos = (7, 0)
# blocking states: every cell of column 3 except the gap in row 4
blocking_states = [(row, 3) for row in range(world_shape[0]) if row != 4]

# rewards keyed by position: +1 in the top-right corner, -1 directly below it
reward_states = {(0, 7): 1, (1, 7): -1}
# every rewarded cell is also terminal (same order as the reward dictionary)
terminal_states = list(reward_states)
# Build the environment and print each channel of the stacked observation separately
env_conv = ConvDQNGridWorld(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
obs = env_conv.reset()
env_conv.render()
for i in range(obs.shape[-1]):
    print(obs[..., i])
print(obs.shape)

[[ 0.  0.  0.  8.  0.  0.  0.  1.]
 [ 0.  0.  0.  8.  0.  0.  0. -1.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 0.  0.  0.  8.  0.  0.  0.  0.]
 [ 4.  0.  0.  8.  0.  0.  0.  0.]]
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]]
[[0.

In [17]:
# Create the Convolutional DQN agent. `obs` is the stacked-grid observation returned
# by env_conv.reset() in the previous cell; its shape sizes the networks.
agent_conv = ConvDQNAgent(env_conv.action_space, obs.shape)
# Create a separate replay buffer for the convolutional agent
replay_buffer_conv = ReplayBuffer()

In [18]:
# Train the Convolutional DQN agent with the same training settings as the dense agents.
# NOTE(review): n_steps is presumably the number of environment steps per epoch and
# eval_after_steps the evaluation interval — confirm against lib.util.train_dqn.
train_dqn(env_conv, agent_conv, replay_buffer_conv, n_epochs=40, n_steps=400, eval_after_steps=1)

epoch 1, loss 0.0011825622469814334, avg_return -0.2
epoch 2, loss 0.00048678940189006425, avg_return -0.2
epoch 3, loss 0.00031929314144463206, avg_return -0.2
epoch 4, loss 0.0001877865820461011, avg_return -0.2
epoch 5, loss 0.0005851912990948449, avg_return -0.2
epoch 6, loss 0.0003869798361932908, avg_return -0.2
epoch 7, loss 0.00023286131830957402, avg_return -0.2
epoch 8, loss 0.00016616802852809087, avg_return -0.2
epoch 9, loss 0.0001624102603159372, avg_return -0.2
epoch 10, loss 0.0001388551022358797, avg_return -0.2
epoch 11, loss 0.00013964975022418002, avg_return -0.2
epoch 12, loss 0.00012604870517662903, avg_return -0.2
epoch 13, loss 0.00011814487517369798, avg_return -0.2
epoch 14, loss 0.00010638092263093313, avg_return -0.2
epoch 15, loss 0.00010834162805650749, avg_return -0.2
epoch 16, loss 0.00013261970926237154, avg_return -0.2
epoch 17, loss 0.00015229240335656868, avg_return -0.2
epoch 18, loss 0.00017328833251895048, avg_return -0.2
epoch 19, loss 0.00017893

In [19]:
# Test the trained convolutional agent: test_agent returns the episode's
# cumulated reward together with the path that was taken.
cumulated_reward, path = test_agent(agent_conv, env_conv)
print('cumulated reward', cumulated_reward)
# Draw the path on top of the grid (path cells appear as 3 in the output below)
render_path(env_conv, path)

cumulated reward 0.86
[[ 0.  0.  0.  8.  0.  0.  3.  3.]
 [ 0.  0.  0.  8.  0.  0.  3. -1.]
 [ 0.  0.  0.  8.  3.  3.  3.  0.]
 [ 0.  0.  0.  8.  3.  0.  0.  0.]
 [ 0.  0.  3.  3.  3.  0.  0.  0.]
 [ 0.  3.  3.  8.  0.  0.  0.  0.]
 [ 3.  3.  0.  8.  0.  0.  0.  0.]
 [ 3.  0.  0.  8.  0.  0.  0.  0.]]


## Generalization

The `RandomConvDQNGridWorld` class extends the `ConvDQNGridWorld` class to create a new environment where the reward (and terminal) states are randomly generated. This is achieved by overriding the `reset` method and adding a new `generate_config` method. In the constructor we have `reward_dict` instead of `terminal_states` and  `reward_states`, mapping reward values to number of states with that reward. This class is useful for testing the generalization ability of a convolutional DQN agent, as the agent needs to adapt to different reward and terminal states in each episode. We don't need to change the implementation of the Convolutional DQN agent.

In [20]:
# Define a new GridWorld environment for Convolutional DQN where the state is represented by the entire grid
# and the reward (and terminal) states are randomly generated
class RandomConvDQNGridWorld(ConvDQNGridWorld):
    """ConvDQNGridWorld whose reward/terminal cells are re-drawn at random on every reset.

    Args:
        world_shape: (rows, columns) of the grid.
        agent_init_pos: start cell of the agent; never chosen as a reward cell.
        blocking_states: blocked cells; never chosen as reward cells.
        reward_dict: maps a reward value to the number of cells that should
            carry that reward, e.g. ``{1: 1, -1: 2}``.
    """

    def __init__(self, world_shape, agent_init_pos, blocking_states, reward_dict):
        # Draw an initial layout so the base class can be constructed as usual
        terminal_states, reward_states = self.generate_config(world_shape, agent_init_pos, blocking_states, reward_dict)
        super().__init__(world_shape, agent_init_pos, blocking_states, terminal_states, reward_states)
        self.reward_dict = reward_dict

    @staticmethod
    def generate_config(world_shape, agent_init_pos, blocking_states, reward_dict):
        """Draw random, distinct reward cells.

        Returns:
            (terminal_states, reward_states): a list of cell tuples and a dict
            mapping each of those cells to its reward. Every reward cell is terminal.

        Raises:
            ValueError: if more reward cells are requested than free cells exist.
        """
        n_positions = sum(reward_dict.values())
        # Enumerate every (row, column) cell of the grid
        grid = np.meshgrid(np.arange(world_shape[0]), np.arange(world_shape[1]))
        possible_positions = np.array(grid).T.reshape(-1, 2)
        # Remove agent initial position and blocking states from possible positions
        for excluded in [agent_init_pos, *blocking_states]:
            keep = np.logical_not(np.all(possible_positions == excluded, axis=1))
            possible_positions = possible_positions[keep]
        if n_positions > possible_positions.shape[0]:
            raise ValueError(
                f'reward_dict requests {n_positions} reward states but only '
                f'{possible_positions.shape[0]} free positions are available')
        positions = possible_positions[np.random.choice(possible_positions.shape[0], n_positions, replace=False), :]
        current_index = 0
        terminal_states = []
        reward_states = {}
        for reward, count in reward_dict.items():
            # Consume one distinct sampled position per requested state. The index must
            # advance inside this loop, otherwise counts above 1 collapse into one cell.
            for _ in range(count):
                # Plain ints keep these tuples consistent with hand-written positions
                position = tuple(int(coord) for coord in positions[current_index])
                terminal_states.append(position)
                reward_states[position] = reward
                current_index += 1
        return terminal_states, reward_states

    def reset(self):
        """Draw a fresh random reward layout, then reset and return the initial observation."""
        self.terminal_states, self.reward_states = self.generate_config(self.world_shape, self.agent_init_pos,
                                                                        self.blocking_states, self.reward_dict)
        return super().reset()

Now we test our environment and model. First we initialize the environment, the agent and the replay buffer. Then we train the agent while periodically validating its performance. Finally, we test the trained agent and render its path.

In [21]:
# dimensions of the GridWorld (defined here so the cell does not depend on an earlier cell's value)
world_shape = (8, 8)
# initial position of the agent
agent_init_pos = (7, 0)

# list of blocking state positions
blocking_states = [(0, 3),
                   (1, 3),
                   (2, 3),
                   (3, 3),
                   (5, 3),
                   (6, 3),
                   (7, 3)]
# dictionary mapping reward to number of states with that reward
reward_dict = {
    1: 1
}
# Create the environment and check the observation space
env_conv_random = RandomConvDQNGridWorld(world_shape, agent_init_pos, blocking_states, reward_dict)
obs = env_conv_random.reset()
env_conv_random.render()
print(obs.shape)

[[0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [4. 0. 0. 8. 0. 0. 0. 0.]]
(8, 8, 4)


In [22]:
# Create the Convolutional DQN agent for the random environment. `obs` comes from
# env_conv_random.reset() in the previous cell, so obs.shape is (8, 8, 4) (see the output above).
agent_conv_random = ConvDQNAgent(env_conv_random.action_space, obs.shape)
# Create a separate replay buffer for this agent
replay_buffer_conv_random = ReplayBuffer()

In [None]:
# Train the Convolutional DQN agent on the random environment. This run uses 160 epochs
# instead of the 40 used for the fixed layouts; the reward position changes on every reset.
# NOTE(review): n_steps is presumably the number of environment steps per epoch and
# eval_after_steps the evaluation interval — confirm against lib.util.train_dqn.
train_dqn(env_conv_random, agent_conv_random, replay_buffer_conv_random, n_epochs=160, n_steps=400,
          eval_after_steps=1)

epoch 1, loss 4.869260916962048e-05, avg_return -0.2
epoch 2, loss 0.0005204806265197703, avg_return -0.2
epoch 3, loss 0.00042627325994715193, avg_return -0.2
epoch 4, loss 0.0009700416095625997, avg_return -0.08100000000000004
epoch 5, loss 0.0005691390172160027, avg_return -0.2
epoch 6, loss 0.00047958690086602473, avg_return -0.08200000000000005
epoch 7, loss 0.0006423462054669926, avg_return -0.2
epoch 8, loss 0.0007211429442577355, avg_return -0.2
epoch 9, loss 0.0008188191991393978, avg_return -0.08100000000000002
epoch 10, loss 0.000786642007710725, avg_return 0.26999999999999996
epoch 11, loss 0.000723100112963948, avg_return 0.03399999999999996
epoch 12, loss 0.0006775240714773645, avg_return -0.2
epoch 13, loss 0.0007386754757590097, avg_return 0.036999999999999963
epoch 14, loss 0.0006334689053346665, avg_return -0.2
epoch 15, loss 0.0006690150692065799, avg_return -0.08500000000000005
epoch 16, loss 0.0007070919906482231, avg_return 0.146
epoch 17, loss 0.00081064265918939

In [23]:
# Test the trained agent on the random environment and render its path.
# NOTE(review): test_agent presumably resets the environment, which draws a new random
# reward position — so re-running this cell evaluates a different layout each time.
cumulated_reward, path = test_agent(agent_conv_random, env_conv_random)
print('cumulated reward', cumulated_reward)
render_path(env_conv_random, path)
# Also print the raw list of visited cells, which shows revisits the rendered grid cannot
print(path)

cumulated reward 0.9
[[0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 3. 0.]
 [0. 3. 3. 3. 3. 3. 3. 0.]
 [3. 3. 0. 8. 0. 0. 0. 0.]
 [3. 0. 0. 8. 0. 0. 0. 0.]
 [3. 0. 0. 8. 0. 0. 0. 0.]]
[(7, 0), (6, 0), (5, 0), (5, 1), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (3, 6)]


In [23]:
# Test the trained agent again on the random environment and render its path.
# NOTE(review): test_agent presumably resets the environment, which draws a new random
# reward position — so this run is expected to see a different layout than other runs.
cumulated_reward, path = test_agent(agent_conv_random, env_conv_random)
print('cumulated reward', cumulated_reward)
render_path(env_conv_random, path)
# Also print the raw list of visited cells, which shows revisits the rendered grid cannot
print(path)

cumulated reward -0.3000000000000001
[[0. 0. 0. 8. 0. 1. 0. 0.]
 [0. 0. 0. 8. 3. 3. 3. 0.]
 [0. 0. 0. 8. 0. 3. 3. 0.]
 [0. 0. 0. 8. 0. 3. 0. 0.]
 [0. 0. 3. 3. 3. 3. 0. 0.]
 [0. 3. 3. 8. 0. 0. 0. 0.]
 [0. 3. 0. 8. 0. 0. 0. 0.]
 [3. 3. 0. 8. 0. 0. 0. 0.]]
[(7, 0), (7, 1), (6, 1), (5, 1), (5, 2), (4, 2), (4, 3), (4, 4), (4, 5), (3, 5), (2, 5), (2, 6), (1, 6), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4), (1, 5), (1, 4)]


In [26]:
# Test the trained agent once more on the random environment and render its path.
# NOTE(review): test_agent presumably resets the environment, which draws a new random
# reward position — so this run is expected to see a different layout than other runs.
cumulated_reward, path = test_agent(agent_conv_random, env_conv_random)
print('cumulated reward', cumulated_reward)
render_path(env_conv_random, path)
# Also print the raw list of visited cells, which shows revisits the rendered grid cannot
print(path)

cumulated reward 0.89
[[0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 0. 0. 8. 0. 0. 0. 0.]
 [0. 3. 3. 3. 3. 3. 0. 0.]
 [3. 3. 0. 8. 0. 3. 3. 0.]
 [3. 0. 0. 8. 0. 0. 3. 0.]
 [3. 0. 0. 8. 0. 0. 0. 0.]]
[(7, 0), (6, 0), (5, 0), (5, 1), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (5, 5), (5, 6), (6, 6)]
