Reinforcement learning agent in a simple environment

In [None]:
# How many times the outer training loop runs for each agent.
EPISODES = 1_000

To gain a basic understanding of reinforcement learning, we experiment with a "simple" problem: 
balancing a pole on a cart that can move either to the left or to the right.

Cart pole environment

Actions: ndarray (1,)
 - {0, 1}: In which direction (0 = left, 1 = right) to push the cart.

Observation: ndarray (4,)
 - \[cart position, cart velocity, pole angle, pole angular velocity\]
 
Reward: float
 - A reward of +1 is given for every step taken, including the termination step.
 
Termination: bool
 - Pole Angle is more than ±12°
 - Cart Position is more than ±2.4 (center of the cart reaches the edge of the display)
 - Episode length is greater than 500 (the environment reports this as truncation rather than termination)

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt

# The figures in this notebook show rendered frames rather than charts,
# so every spine, tick mark and tick label is switched off globally.
plt.rcParams.update(dict.fromkeys(
    (
        'axes.spines.top',
        'axes.spines.right',
        'axes.spines.bottom',
        'axes.spines.left',
        'xtick.bottom',
        'xtick.labelbottom',
        'ytick.labelleft',
        'ytick.left',
    ),
    False,
))

Environment

In [None]:
# Create the cart-pole simulation; "rgb_array" makes render() return the
# frame as an image array that matplotlib can draw.
environment = gym.make("CartPole-v1", render_mode="rgb_array")

# Reset once so there is a state to render, then show that first frame.
_ = environment.reset()
first_frame = environment.render()
_ = plt.imshow(first_frame)

Agent

In [None]:
import torch
import torch.nn as nn

class Agent(nn.Module):
    """
    Small fully connected network with tanh activations that maps an
    observation to action probabilities.

    With a single output the network ends in a sigmoid; with several outputs
    it ends in a softmax over the last dimension.
    """

    def __init__(self, inputs=4, actions=1, hidden=False, neurons=64):
        """
        Agent for reinforcement learning.
        
        Parameters
        ----------
        inputs : int, optional
            Size of the observation vector fed to the first layer.
        actions : int, optional
            Number of network outputs. ``1`` yields a single sigmoid output,
            anything larger yields a softmax distribution.
        hidden : int or False, optional
            Number of extra ``neurons -> neurons`` layers placed between the
            input and output layers. ``False`` (or ``0``) adds none.
        neurons : int, optional
            Width of the input layer's output and of every hidden layer.
        """
        super(Agent, self).__init__()
        self.hidden = hidden
        
        self.layer_in = nn.Linear(inputs, neurons)
        
        if self.hidden:
            for i in range(self.hidden):
                # Assigning a module as an attribute registers it, so the
                # hidden layers are exposed as ``layer_1`` ... ``layer_n``.
                setattr(self, f"layer_{i+1}", nn.Linear(neurons, neurons))
            
        self.layer_out = nn.Linear(neurons, actions)
        
    def forward(self, x):
        """
        Forward pass.
        
        Parameters
        ----------
        x : numpy.ndarray, sequence of float or torch.Tensor
            Observation state. Anything that is not already a tensor with the
            dtype and device of the network parameters is converted first.
            
        Returns
        -------
        output : torch.Tensor
            Action probabilities: one sigmoid value when the network has a
            single output, otherwise a softmax over the last dimension.
        """
        # nn.Linear only accepts tensors whose dtype matches its weights, so
        # arrays, lists and e.g. float64 tensors are aligned with the
        # parameters here. A tensor that already matches is passed through
        # unchanged.
        weight = self.layer_in.weight
        x = torch.as_tensor(x, dtype=weight.dtype, device=weight.device)

        _output = torch.tanh(self.layer_in(x))
        
        if self.hidden:
            for i in range(self.hidden):
                _output = torch.tanh(getattr(self, f"layer_{i+1}")(_output))
            
        if self.layer_out.out_features > 1:
            output = torch.softmax(self.layer_out(_output), dim=-1)
        else:
            output = torch.sigmoid(self.layer_out(_output))
        
        return output

Training

In [None]:
# agents = {
#     "vanilla": Agent(),
#     "one_hidden": Agent(hidden=1),
#     "two_hidden": Agent(hidden=2),
#     "three_hidden": Agent(hidden=3),
#     "128_neurons": Agent(neurons=128),
#     "256_neurons": Agent(neurons=256),
#     "128_neurons_one_hidden": Agent(neurons=128, hidden=1),
#     "256_neurons_one_hidden": Agent(neurons=256, hidden=1),
#     "128_neurons_two_hidden": Agent(neurons=128, hidden=2),
#     "256_neurons_two_hidden": Agent(neurons=256, hidden=2),
#     "128_neurons_three_hidden": Agent(neurons=128, hidden=3),
#     "256_neurons_three_hidden": Agent(neurons=256, hidden=3)
# }
agents = {"test": Agent()}

# CartPole pays a reward of 1 on every step, so the raw reward cannot be used
# as a BCE target (the agent would only learn to output 1). Instead the BCE is
# taken between the predicted probability and the action that was actually
# sampled, which is the negative log-likelihood of that action, and it is then
# weighted by the return that followed (REINFORCE).
loss_function = nn.BCELoss()

# Weight of future rewards when computing the return of a step.
DISCOUNT = 0.99
# Upper bound on steps per episode; the loop normally ends earlier because the
# environment terminates or truncates the episode.
MAX_STEPS = 1000

for description, agent in agents.items():
    
    print(f"Training agent '{description}'...")
    
    optimizer = torch.optim.Adam(agent.parameters())
    
    for episode in range(EPISODES):
        observation, info = environment.reset()
        
        step_losses = []
        step_rewards = []
        
        for step in range(MAX_STEPS):
            
            observation = torch.tensor(observation, dtype=torch.float32)
            
            actions = agent(observation)

            # Actions are sampled rather than taken greedily so that the
            # agent keeps exploring while it learns.
            if agent.layer_out.out_features > 1:
                
                # Several outputs: sample the action index from the softmax
                # distribution and score it by its negative log-probability.
                
                action = torch.multinomial(actions.detach(), num_samples=1).item()
                step_loss = -torch.log(actions[action].clamp_min(1e-8))
            else:
                
                # Single output: it is the probability of action 1, so draw
                # a Bernoulli sample and score it with the BCE.
                
                taken = torch.bernoulli(actions.detach())
                action = int(taken.item())
                step_loss = loss_function(actions, taken)
            
            observation, reward, terminated, truncated, _ = environment.step(action)
            
            step_losses.append(step_loss)
            step_rewards.append(float(reward))
            
            if terminated or truncated:
                # The episode is over; the next one starts with the reset at
                # the top of the episode loop.
                break
        
        # Discounted return of every step, accumulated from the last step
        # backwards.
        returns = []
        running_return = 0.0
        for step_reward in reversed(step_rewards):
            running_return = step_reward + DISCOUNT * running_return
            returns.append(running_return)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        
        # Normalising the returns acts as a simple baseline: steps that did
        # better than the episode average are reinforced, the rest are
        # discouraged. A single-step episode has no spread to normalise by.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        
        loss = (torch.stack(step_losses) * returns).sum()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
                
    print(f"Agent '{description}' trained.")

In [None]:
# Use the agent to play the game and save the frames.
import os

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./gif", exist_ok=True)

observation, info = environment.reset()

# One figure and one image artist are reused for every frame; drawing a new
# image on the same axes each step would keep all previous frames alive.
figure, axes = plt.subplots()
frame_image = None

for timestep in range(1000):
    
    observation = torch.tensor(observation, dtype=torch.float32)
    
    # No gradients are needed when the agent is only being evaluated.
    with torch.no_grad():
        actions = agents['test'](observation)
    
    # Greedy choice: the single output is rounded to action 0 or 1.
    action = int(actions.round().item())
    
    print(action)
    
    observation, reward, terminated, truncated, _ = environment.step(action)
    
    frame = environment.render()
    if frame_image is None:
        frame_image = axes.imshow(frame)
    else:
        frame_image.set_data(frame)
    figure.savefig(f"./gif/{timestep}.png")
    
    if terminated or truncated:
        break

In [ ]:
# Shut the environment down now that training and playback are finished.
environment.close()

Sources

https://gymnasium.farama.org/content/basic_usage/

https://gymnasium.farama.org/environments/classic_control/cart_pole/

https://ieeexplore.ieee.org/document/6313077