# workflow

1 get input image, in numpy
2 convert torch tensor and send to device
3 model forward
5 get output action
6 create unity ActionTuple() and add action to this
7 set action to unity, env.set_actions(behavior_name,action_tuple)
8 env.step
9 update weights after N steps, num_steps_for_update

In [1]:
# -----------------
# This code is used to close an env that might not have been closed before
try:
    env.close()
except:
    pass
# -----------------

from mlagents_envs.registry import default_registry
from mlagents_envs.environment import ActionTuple, UnityEnvironment as UE
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
%matplotlib inline

env =  UE(file_name="stage0_160523\stage0_copy",seed=1,side_channels=[])
print("environment created.")
env.reset()

#initialise behavior parameters
behavior_name=list(env.behavior_specs)[0]
print(f"Name of the environment behavior: {behavior_name}")
behavior_spec=env.behavior_specs[behavior_name]

action_space=env.behavior_specs[behavior_name].action_spec

print(f"Action space is: {action_space}")
# actions space is 2,2,2,2 since each of them can do that or do nothing

num_actions=len(action_space[1]) if behavior_spec.action_spec.is_discrete() else len(action_space[0]) 
print(f"Action size is: {num_actions}")

# action size is 4 since move forward backward rotate right left are 4 different actions


  from .autonotebook import tqdm as notebook_tqdm


environment created.
Name of the environment behavior: stage0?team=0
Action space is: Continuous: 0, Discrete: (2, 2, 2, 2)
Action size is: 4


In [5]:
#implement custom model algo
import torch
from typing import Tuple
from math import floor
from torch.nn import Parameter

#algo and update algo function
class CustomModel(nn.Module):
    def __init__(
    self,
    input_shape: Tuple[int, int, int],
    encoding_size: int,
    output_size: int
    ):        
        """
        Creates a neural network that takes as input a batch of images (3
        dimensional tensors) and outputs a batch of outputs (1 dimensional
        tensors)
        """
        super(CustomModel, self).__init__()
        height = input_shape[1]
        width = input_shape[2]
        initial_channels = input_shape[0]
        conv_1_hw = self.get_conv_output_shape((height, width), 8, 4)
        conv_2_hw = self.get_conv_output_shape(conv_1_hw, 4, 2)
        self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32
        self.conv1 = torch.nn.Conv2d(initial_channels, 16, [8, 8], [4, 4])
        self.conv2 = torch.nn.Conv2d(16, 32, [4, 4], [2, 2])
        #     self.dense1 = torch.nn.Linear(self.final_flat, encoding_size)
        self.dense1 = torch.nn.Linear(6272, encoding_size)
        self.dense2 = torch.nn.Linear(encoding_size, output_size)
        
    def forward(
        self,
        input_image: torch.tensor
    ):
        """
        Forward pass of model, outputs a torch.tensor of shape (num_agents,num_actions).
        """
        #     print(visual_obs.shape)
        conv_1 = torch.relu(self.conv1(input_image))
        #     print(conv_1.shape)
        conv_2 = torch.relu(self.conv2(conv_1))
        #     print(conv_2.shape)
        conv_2=conv_2.view(-1,6272)
        #     print(conv_2.shape)
        #     hidden = self.dense1(conv_2.reshape([-1, self.final_flat]))
        hidden = self.dense1(conv_2)
        #     print(hidden.shape)
        hidden = torch.relu(hidden)
        #     print(hidden.shape)
        hidden = self.dense2(hidden)
        #     print(hidden.shape)
        #     print(hidden)
        return hidden    
    
    def get_conv_output_shape(
        h_w: Tuple[int, int],
        kernel_size: int = 1,
        stride: int = 1,
        pad: int = 0,
        dilation: int = 1,
        ):
            """
            Using standard formula, computes the height and width of the output of a convolution layer.
            """
            h = floor(
              ((h_w[0] + (2 * pad) - (dilation * (kernel_size - 1)) - 1) / stride) + 1
            )
            w = floor(
              ((h_w[1] + (2 * pad) - (dilation * (kernel_size - 1)) - 1) / stride) + 1
            )
            return h, w
    
    def update_weights():
        pass
        

In [6]:
#code only works for single agent, if multi agent need to add tracked=-1, refer to old sample code
#for multi agent, will need to change the code snippet at the second decision_steps,terminal_steps=env.get_steps(behavior_name)

num_episodes=10
max_steps=100000 #max steps in 1 episode
num_steps_for_update=1000
print_how_many_results_in_one_episode=5 #how many lines of intermediate results do you want to see
encoding_size=1000

decision_steps,terminal_steps=env.get_steps(behavior_name)
sample_input_image=decision_steps.obs[0] #in numpy    
input_size=sample_input_image.shape
output_size=num_actions
model=CustomModel(input_size,encoding_size,output_size)
# model=VisualQNetwork(input_size,encoding_size,output_size)

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=model.to(device)
print(f"model: {model}")
print(f"device: {device}")
print("\n")

for episode in range(num_episodes):
    print("-"*30)
    print(f"Start of Episode {episode+1}/{num_episodes}")
    env.reset() 
    decision_steps,terminal_steps=env.get_steps(behavior_name)
    #decision steps tells you which agents are requesting an action (means episode not ended for that agent, for multi agents, some agents end before others)
    #terminal steps tells you which agents have reached terminal step (means episode ended)
    episode_reward=0 
    
    for i in range(max_steps):
#         if len(decision_steps)!=0: #means theres still agents requesting decision
        #get the input RGB images of the agent
        input_images=decision_steps.obs[0] #in numpy
        input_images=torch.from_numpy(input_images) #convert to torch #env is in cpu, so the input images are in cpu
        input_images=input_images.detach() #to prevent calc of gradients to input images durign backprop
        input_images=input_images.to(device) #send to gpu since model is in gpu
        #generate an action for the agent, should be in shape (num_agents,num_actions)
        output_actions=model.forward(input_images) #currently in gpu and type torch.tensor
        #send back to cpu and convert to numpy
#         output_actions=output_actions.to("cpu") 
        outputs_actions_detached=output_actions.detach().numpy()
        #setup mlagents ActionTuple and add actions and set actions
        action_tuple=ActionTuple()
        action_tuple.add_discrete(outputs_actions_detached)
        env.set_actions(behavior_name,action_tuple)
        env.step() #moves the simulation by 1 step

        if i%num_steps_for_update==0:
            #update the weights of the model
            model.update_weights()

        #run this again since this might be the step that is the last and agent moves to terminal step
        decision_steps,terminal_steps=env.get_steps(behavior_name)
        if len(decision_steps)!=0:
            episode_rewards+=decision_steps[0].reward
        else: #means agent ended and now in terminal_steps
            episode_rewards+=terminal_steps[0].reward
            print(f"Terminal step reached. End episode: Total num of steps: {i+1}, Total reward for episode {episode+1}/{num_episodes} is {episode_rewards}")                
            break #break out of current episode loop regardless of max steps hit or not

        print_divisor=1/(print_how_many_results_in_one_episode+1)
        if i%int(max_steps*print_divisor)==0:
            if i==0 or i==max_steps-1:
                continue
            else:
                print(f"At step {i+1}, Cumulative reward thus far {episode+1}/{num_episodes} is {episode_rewards}")                

        if i==max_steps:
            print(f"Max steps reached, End episode prematurely. Total reward for episode {episode+1}/{num_episodes} is {episode_rewards}")

TypeError: 'CustomModel' object does not support indexing

# example q learning algo from online source

## pytorch model - q learning algo

In [52]:
import torch
from typing import Tuple
from math import floor
from torch.nn import Parameter


class VisualQNetwork(torch.nn.Module):
  def __init__(
    self,
    input_shape: Tuple[int, int, int],
    encoding_size: int,
    output_size: int
  ):
    """
    Creates a neural network that takes as input a batch of images (3
    dimensional tensors) and outputs a batch of outputs (1 dimensional
    tensors)
    """
    super(VisualQNetwork, self).__init__()
    height = input_shape[1]
    width = input_shape[2]
    initial_channels = input_shape[0]
    conv_1_hw = self.conv_output_shape((height, width), 8, 4)
    conv_2_hw = self.conv_output_shape(conv_1_hw, 4, 2)
    self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32
    self.conv1 = torch.nn.Conv2d(initial_channels, 16, [8, 8], [4, 4])
    self.conv2 = torch.nn.Conv2d(16, 32, [4, 4], [2, 2])
    self.dense1 = torch.nn.Linear(self.final_flat, encoding_size)
    self.dense2 = torch.nn.Linear(encoding_size, output_size)

  def forward(self, visual_obs: torch.tensor):
    print(visual_obs.shape)
    conv_1 = torch.relu(self.conv1(visual_obs))
    print(conv_1.shape)
    conv_2 = torch.relu(self.conv2(conv_1))
    print(conv_2.shape)
    hidden = self.dense1(conv_2.reshape([-1, self.final_flat]))
    print(hidden.shape)
    hidden = torch.relu(hidden)
    print(hidden.shape)
    hidden = self.dense2(hidden)
    print(hidden.shape)
    return hidden

  @staticmethod
  def conv_output_shape(
    h_w: Tuple[int, int],
    kernel_size: int = 1,
    stride: int = 1,
    pad: int = 0,
    dilation: int = 1,
  ):
    """
    Computes the height and width of the output of a convolution layer.
    """
    h = floor(
      ((h_w[0] + (2 * pad) - (dilation * (kernel_size - 1)) - 1) / stride) + 1
    )
    w = floor(
      ((h_w[1] + (2 * pad) - (dilation * (kernel_size - 1)) - 1) / stride) + 1
    )
    return h, w


## create classes to store env data 

In [53]:
import numpy as np
from typing import NamedTuple, List


class Experience(NamedTuple):
  """
  An experience contains the data of one Agent transition.
  - Observation
  - Action
  - Reward
  - Done flag
  - Next Observation
  """

  obs: np.ndarray
  action: np.ndarray
  reward: float
  done: bool
  next_obs: np.ndarray

# A Trajectory is an ordered sequence of Experiences
Trajectory = List[Experience]

# A Buffer is an unordered list of Experiences from multiple Trajectories
Buffer = List[Experience]

## trainer class

Now, we can create our trainer class. The role of this trainer is to collect data from the Environment according to a Policy, and then train the Q-Network with that data.

In [54]:
from mlagents_envs.environment import ActionTuple, BaseEnv
from typing import Dict
import random


class Trainer:
  @staticmethod
  def generate_trajectories(
    env: BaseEnv, q_net: VisualQNetwork, buffer_size: int, epsilon: float
  ):
    """
    Given a Unity Environment and a Q-Network, this method will generate a
    buffer of Experiences obtained by running the Environment with the Policy
    derived from the Q-Network.
    :param BaseEnv: The UnityEnvironment used.
    :param q_net: The Q-Network used to collect the data.
    :param buffer_size: The minimum size of the buffer this method will return.
    :param epsilon: Will add a random normal variable with standard deviation.
    epsilon to the value heads of the Q-Network to encourage exploration.
    :returns: a Tuple containing the created buffer and the average cumulative
    the Agents obtained.
    """
    # Create an empty Buffer
    buffer: Buffer = []

    # Reset the environment
    env.reset()
    # Read and store the Behavior Name of the Environment
    behavior_name = list(env.behavior_specs)[0]
    # Read and store the Behavior Specs of the Environment
    spec = env.behavior_specs[behavior_name]

    # Create a Mapping from AgentId to Trajectories. This will help us create
    # trajectories for each Agents
    dict_trajectories_from_agent: Dict[int, Trajectory] = {}
    # Create a Mapping from AgentId to the last observation of the Agent
    dict_last_obs_from_agent: Dict[int, np.ndarray] = {}
    # Create a Mapping from AgentId to the last action of the Agent
    dict_last_action_from_agent: Dict[int, np.ndarray] = {}
    # Create a Mapping from AgentId to cumulative reward (Only for reporting)
    dict_cumulative_reward_from_agent: Dict[int, float] = {}
    # Create a list to store the cumulative rewards obtained so far
    cumulative_rewards: List[float] = []

    while len(buffer) < buffer_size:  # While not enough data in the buffer
      # Get the Decision Steps and Terminal Steps of the Agents
      decision_steps, terminal_steps = env.get_steps(behavior_name)

      # permute the tensor to go from NHWC to NCHW (batch_size,height,width,num_channels) to (batch_size,num_channels,height,width,)
      order = (0, 3, 1, 2)
      decision_steps.obs = [np.transpose(obs, order) for obs in decision_steps.obs]
      terminal_steps.obs = [np.transpose(obs, order) for obs in terminal_steps.obs]

      # For all Agents with a Terminal Step:
      for agent_id_terminated in terminal_steps:
        # Create its last experience (is last because the Agent terminated)
        last_experience = Experience(
          obs=dict_last_obs_from_agent[agent_id_terminated].copy(),
          reward=terminal_steps[agent_id_terminated].reward,
          done=not terminal_steps[agent_id_terminated].interrupted,
          action=dict_last_action_from_agent[agent_id_terminated].copy(),
          next_obs=terminal_steps[agent_id_terminated].obs[0],
        )
        # Clear its last observation and action (Since the trajectory is over)
        dict_last_obs_from_agent.pop(agent_id_terminated)
        dict_last_action_from_agent.pop(agent_id_terminated)
        # Report the cumulative reward
        cumulative_reward = (
          dict_cumulative_reward_from_agent.pop(agent_id_terminated)
          + terminal_steps[agent_id_terminated].reward
        )
        cumulative_rewards.append(cumulative_reward)
        # Add the Trajectory and the last experience to the buffer
        buffer.extend(dict_trajectories_from_agent.pop(agent_id_terminated))
        buffer.append(last_experience)

      # For all Agents with a Decision Step:
      for agent_id_decisions in decision_steps:
        # If the Agent does not have a Trajectory, create an empty one
        if agent_id_decisions not in dict_trajectories_from_agent:
          dict_trajectories_from_agent[agent_id_decisions] = []
          dict_cumulative_reward_from_agent[agent_id_decisions] = 0

        # If the Agent requesting a decision has a "last observation"
        if agent_id_decisions in dict_last_obs_from_agent:
          # Create an Experience from the last observation and the Decision Step
          exp = Experience(
            obs=dict_last_obs_from_agent[agent_id_decisions].copy(),
            reward=decision_steps[agent_id_decisions].reward,
            done=False,
            action=dict_last_action_from_agent[agent_id_decisions].copy(),
            next_obs=decision_steps[agent_id_decisions].obs[0],
          )
          # Update the Trajectory of the Agent and its cumulative reward
          dict_trajectories_from_agent[agent_id_decisions].append(exp)
          dict_cumulative_reward_from_agent[agent_id_decisions] += (
            decision_steps[agent_id_decisions].reward
          )
        # Store the observation as the new "last observation"
        dict_last_obs_from_agent[agent_id_decisions] = (
          decision_steps[agent_id_decisions].obs[0]
        )

      # Generate an action for all the Agents that requested a decision
      # Compute the values for each action given the observation
      actions_values = (
        q_net(torch.from_numpy(decision_steps.obs[0])).detach().numpy()
      )
      # Add some noise with epsilon to the values
      actions_values += epsilon * (
        np.random.randn(actions_values.shape[0], actions_values.shape[1])
      ).astype(np.float32)
      # Pick the best action using argmax
      actions = np.argmax(actions_values, axis=1)
      actions.resize((len(decision_steps), 1))
      # Store the action that was picked, it will be put in the trajectory later
      for agent_index, agent_id in enumerate(decision_steps.agent_id):
        dict_last_action_from_agent[agent_id] = actions[agent_index]

      # Set the actions in the environment
      # Unity Environments expect ActionTuple instances.
      action_tuple = ActionTuple()
      action_tuple.add_discrete(actions)
      env.set_actions(behavior_name, action_tuple)
      # Perform a step in the simulation
      env.step()
    return buffer, np.mean(cumulative_rewards)

  @staticmethod
  def update_q_net(
    q_net: VisualQNetwork,
    optimizer: torch.optim,
    buffer: Buffer,
    action_size: int
  ):
    """
    Performs an update of the Q-Network using the provided optimizer and buffer
    """
    BATCH_SIZE = 1000
    NUM_EPOCH = 3
    GAMMA = 0.9
    batch_size = min(len(buffer), BATCH_SIZE)
    random.shuffle(buffer)
    # Split the buffer into batches
    batches = [
      buffer[batch_size * start : batch_size * (start + 1)]
      for start in range(int(len(buffer) / batch_size))
    ]
    for _ in range(NUM_EPOCH):
      for batch in batches:
        # Create the Tensors that will be fed in the network
        obs = torch.from_numpy(np.stack([ex.obs for ex in batch]))
        reward = torch.from_numpy(
          np.array([ex.reward for ex in batch], dtype=np.float32).reshape(-1, 1)
        )
        done = torch.from_numpy(
          np.array([ex.done for ex in batch], dtype=np.float32).reshape(-1, 1)
        )
        action = torch.from_numpy(np.stack([ex.action for ex in batch]))
        next_obs = torch.from_numpy(np.stack([ex.next_obs for ex in batch]))

        # Use the Bellman equation to update the Q-Network
        target = (
          reward
          + (1.0 - done)
          * GAMMA
          * torch.max(q_net(next_obs).detach(), dim=1, keepdim=True).values
        )
        mask = torch.zeros((len(batch), action_size))
        mask.scatter_(1, action, 1)
        prediction = torch.sum(q_net(obs) * mask, dim=1, keepdim=True)
        criterion = torch.nn.MSELoss()
        loss = criterion(prediction, target)

        # Perform the backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [55]:
# -----------------
# This code is used to close an env that might not have been closed before
try:
  env.close()
except:
  pass
# -----------------

from mlagents_envs.registry import default_registry
from mlagents_envs.environment import UnityEnvironment
import matplotlib.pyplot as plt
import os
%matplotlib inline

# Create the GridWorld Environment from the registry
# env = default_registry["GridWorld"].make()
# print("GridWorld environment created.")

env =  UE(file_name="stage0_160523\stage0_copy",seed=1,side_channels=[])
print("environment created.")

num_actions = 4

try:
  # Create a new Q-Network.
  qnet = VisualQNetwork((3, 64, 84), 126, num_actions)

  experiences: Buffer = []
  optim = torch.optim.Adam(qnet.parameters(), lr= 0.001)

  cumulative_rewards: List[float] = []

  # The number of training steps that will be performed
  NUM_TRAINING_STEPS = int(os.getenv('QLEARNING_NUM_TRAINING_STEPS', 70))
  # The number of experiences to collect per training step
  NUM_NEW_EXP = int(os.getenv('QLEARNING_NUM_NEW_EXP', 1000))
  # The maximum size of the Buffer
  BUFFER_SIZE = int(os.getenv('QLEARNING_BUFFER_SIZE', 10000))

  for n in range(NUM_TRAINING_STEPS):
    new_exp,_ = Trainer.generate_trajectories(env, qnet, NUM_NEW_EXP, epsilon=0.1)
    random.shuffle(experiences)
    if len(experiences) > BUFFER_SIZE:
      experiences = experiences[:BUFFER_SIZE]
    experiences.extend(new_exp)
    Trainer.update_q_net(qnet, optim, experiences, num_actions)
    _, rewards = Trainer.generate_trajectories(env, qnet, 100, epsilon=0)
    cumulative_rewards.append(rewards)
    print("Training step ", n+1, "\treward ", rewards)
except KeyboardInterrupt:
  print("\nTraining interrupted, continue to next cell to save to save the model.")
finally:
  env.close()

# Show the training graph
try:
  plt.plot(range(NUM_TRAINING_STEPS), cumulative_rewards)
except ValueError:
  print("\nPlot failed on interrupted training.")

environment created.
torch.Size([1, 3, 128, 128])
torch.Size([1, 16, 31, 31])
torch.Size([1, 32, 14, 14])


RuntimeError: shape '[-1, 1728]' is invalid for input of size 6272

# SARSA algorithm
-this is a model free algorithm that does not use any deep learning models, neural networks
-tried and tested before but results are very bad since task with RGB inputs require CNN / model based algo

SARSA (State-Action-Reward-State-Action) is a popular reinforcement learning algorithm that updates Q-values based on the observed state-action pairs and their subsequent state-action pairs. While SARSA is a well-known and widely-used algorithm in the field of reinforcement learning, it is not considered state-of-the-art (SOTA) compared to more advanced algorithms such as Deep Q-Networks (DQN), Proximal Policy Optimization (PPO), or Soft Actor-Critic (SAC).

State-of-the-art algorithms in reinforcement learning are often more sophisticated and build upon SARSA-like algorithms by incorporating deep neural networks, value function approximation, policy gradients, or other advanced techniques. These SOTA algorithms have achieved significant breakthroughs and performance improvements in various domains, including game playing, robotics, and control.

That being said, SARSA is still relevant and used in certain research and practical applications, especially when dealing with small and discrete state and action spaces. It serves as a good starting point for understanding reinforcement learning concepts and provides a solid foundation for exploring more advanced algorithms.

SARSA does not use any deep learning neural networks or any ML models.
SARSA is a model-free reinforcement learning algorithm that does not rely on neural networks. It is a tabular method that maintains a table (or dictionary) of Q-values, where each entry represents the estimated value of a state-action pair. The Q-values are updated iteratively based on the observed experiences.

SARSA is a TD (Temporal Difference) learning algorithm that combines the concepts of on-policy learning and the use of a value function. It interacts with the environment, observes state transitions, selects actions based on a policy (e.g., epsilon-greedy), and updates the Q-values accordingly.

While SARSA itself does not utilize neural networks, its basic principles can be extended and combined with neural networks in more advanced algorithms. For example, Deep Q-Networks (DQN) combine SARSA with deep neural networks to handle high-dimensional state spaces and achieve better performance in complex environments.

In [327]:
import sys
import random
import numpy as np
from typing import Dict
from collections import deque
import matplotlib.pyplot as plt
from collections import defaultdict
import time

import mlagents
from mlagents_envs.environment import UnityEnvironment

def e_greedy(Q, state, epsilon, num_actions):
    policy_s = np.ones(num_actions) * epsilon / num_actions
    #calculates a policy to be used to calc the action to take later
    #policy_s represents the policy probabilities for all actions in the current state.
    best = np.argmax(Q[state])
    #best is the index of the action with the highest Q-value for the given state.
    policy_s[best] = 1-epsilon + (epsilon/num_actions)
    #The line policy_s[best] = 1-epsilon + (epsilon/nA[0]) assigns a new value to the policy probability of the best action.
    #1 - epsilon represents the exploitation component of the policy, where the best action is given a higher probability.
    #(epsilon/nA[0]) represents the exploration component of the policy, where a small probability is distributed equally among all actions to encourage exploration.
    #By updating policy_s[best] with the new probability, you ensure that the best action has a higher probability of being selected while still allowing for some exploration.
    if np.sum(Q[state]) > 0:
        #checks if there are any non-zero Q-values for the given state. If there are non-zero Q-values, it implies that the Q-values have been updated for this state during the learning process.
        action = np.random.choice(np.arange(num_actions), p=policy_s)
        #is used to select an action randomly according to the policy probabilities (policy_s). The np.arange(num_actions) generates an array of action indices, and p=policy_s specifies the probabilities associated with each action. By using np.random.choice, an action is randomly selected according to the given probabilities.
    else:
        action = spec.action_spec.random_action(len(decision_steps))
        # called to select a random action from the action space. spec.action_spec.random_action generates a random action based on the action specification defined for the environment. len(decision_steps) is used to determine the number of agents in the environment.
    return action


def update_Q_expsarsa(alpha, gamma, num_actions, eps, Q, state, action, reward, next_state=None):
    """Returns updated Q-value for the most recent experience."""
    current = Q[state][action.discrete]         # estimate in Q-table (for current state, action pair)
    policy_s = np.ones(num_actions) * eps / num_actions  # current policy (for next state S')
    policy_s[np.argmax(Q[next_state])] = 1 - eps + (eps / num_actions) # greedy action
    Qsa_next = np.dot(Q[next_state], policy_s)         # get value of state at next time step
    target = reward + (gamma * Qsa_next)               # construct target
    new_value = current + (alpha * (target - current)) # get updated value
    return new_value


def expected_sarsa(env, num_episodes, alpha, gamma=1.0):
    action_space = env.behavior_specs[behavior_name].action_spec
    if spec.action_spec.is_continuous():
        continuous_action_space = action_space[0]
        num_actions = sum(continuous_action_space)
    else: #is_discrete()
        discrete_action_space = action_space[1]
        # action size is 4 since move forward backward rotate right left are 4 different actions
        # actions space is 2,2,2,2 since each of them can do that or do nothing
        num_actions = sum(discrete_action_space)

    # initialize empty dictionary of arrays
    Q = defaultdict(lambda: np.zeros(num_actions))
        # Q is initialised in the format of Q[state]=np.zeros(num_actions)
        # so after updating q values, the q values will replace the zeros
        # so when you call Q[state][action], action is 0-7, this represents a specific state-action pair
        # Q[state][action] returns the q value for that specific state-action pair
        # q value is calculated in def update_Q_expsarsa()
    
    total_steps=0
    training_start_time=time.time()
    print("########## START TRAINING ##########")
    for i_episode in range(1, num_episodes+1):
        episode_rewards = 0
        episode_steps=0
        episode_start_time=time.time()
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush() # flushes the output buffer, ensuring that the printed message is immediately displayed in the console without any delay.
        epsilon = 1/i_episode 
            #decays the epsilon
            #epsilon is the exploration-exploitation trade-off parameter. It determines the probability of exploration versus exploitation.
            #increase epsilon increase randomness, so epsilon high at the start to do more exploration and then reduce later to do more exploitation of past knowledge
        env.reset() # begin the episode
        decision_steps, terminal_steps = env.get_steps(behavior_name)
            # decision steps returns the agents requesting an action when the code is called
            # terminal steps returns the agents that have already reached a terminal step when the code is called, terminal step means end of episode, its like this because one scene can have more than 1 agent
        tracked_agent = -1 
            # this is just assigning an index to the agent you want to track right now
            # 1 env can have multiple agents, so -1 means settling each agent from back to front
        done = False # For the tracked_agent
        state = tuple(map(tuple, camera_input) for camera_input in decision_steps.obs)
            # print(decision_steps.obs[0].shape) gives (1, 128, 128, 3)
            # this refers to the input observation of the agent, depends on how you setup the scene
            # the values are the RGB pixel values of the 128x128 pixels
            # since only camera sensor attached with 128x128 RGB input, it is (1, 128, 128, 3)
            # 1 refers to the single observation from 1 camera, so if 2 cameras, 1 256x256, 1 128x128, you need to resize the images, or easiest, keep it standardised in the scene setup
            
            #The map() function executes a specified function for each item in an iterable. The item is sent to the function as a parameter.
            #here map() maps a tuple function to each observation in decision_steps.obs 
            #To access elements from a map object, you can convert it to a list or tuple first, and then use indexing to retrieve specific elements. so list(sample) works but sample[0] wouldnt
            
            #if only 1 camera in the scene, for o in decision_steps.obs gives the entire decision_steps.obs
            
        while not done: #for the current tracked_agent
            if tracked_agent == -1 and len(decision_steps) >= 1: # means there is still agents requesting a decision and have not reached terminal step
              tracked_agent = decision_steps.agent_id[0]
                #assign the tracked_agent to the specific agent_id in the env
                
            #choose next action under e-greedy Q
            action = e_greedy(Q, state, epsilon, num_actions)
            #take action and observe r,s
            env.set_actions(behavior_name, action)
            env.step()
            episode_steps+=1
            decision_steps, terminal_steps = env.get_steps(behavior_name) # agent performs internal updates based on sampled experience
                #checks the updated situation with the agents in decision_steps and terminal_steps
            next_state = tuple(map(tuple, camera_input) for camera_input in decision_steps.obs)
            if tracked_agent in decision_steps:
                reward = decision_steps[tracked_agent].reward
                episode_rewards += decision_steps[tracked_agent].reward
                    #add the reward of the current tracked agent to the cumulative reward
                #Q(s,a) <-- Q(s,a) + a(R1 + g*sum(expected prob a|next state *Q(next_state, action)) - Q(s,a))
#                 prev_Q = Q[state][action] #dont need this because prevQ is already Q and you are updaitng the next Q with Q[state][action] = update_Q_expsarsa(alpha, gamma, num_actions, epsilon, Q, state, action, reward, next_state)
                    #The code calculates the previous Q-value for the current state-action pair using prev_Q = Q[state][action]. This value will be used later in the Q-value update equation.
                eps_probs = np.ones(num_actions) * (1-epsilon) + (epsilon/num_actions)
                Q[state][action.discrete] = update_Q_expsarsa(alpha, gamma, num_actions, epsilon, Q, state, action, reward, next_state)
                state = next_state
            if tracked_agent in terminal_steps:
                done = True
                episode_rewards += terminal_steps[tracked_agent].reward
                episode_time=time.time()-episode_start_time
                print(f"Episode {i_episode}, Steps taken: {episode_steps}, Time elapsed: {episode_time}, Total reward: {episode_rewards}") #this only applies to doing in steps, Mean reward: {np.mean(episode_rewards)}, Std of Reward: {np.mean(episode_rewards)}")
                break
        total_steps+=episode_steps
        total_time=time.time()-training_start_time
    print("########## END TRAINING ##########")
    print(f"Total steps taken: {total_steps}, Total time taken: {total_time}")
    return Q

#step,time elapsed, mean reward, std reward, cumulative reward, total steps, total time
Q_table = expected_sarsa(env, 400, .7, .7)
#this returns the Q_values for all the state-action pairs in the environment running from start till end

########## START TRAINING ##########
Episode 1, Steps taken: 1647, Time elapsed: 164.24765014648438, Total reward: -53.0
Episode 2, Steps taken: 366, Time elapsed: 36.56473231315613, Total reward: -4.0
Episode 3, Steps taken: 622, Time elapsed: 62.228856563568115, Total reward: -5.0
Episode 4, Steps taken: 131, Time elapsed: 13.070374011993408, Total reward: 0.0
Episode 5, Steps taken: 171, Time elapsed: 17.062402486801147, Total reward: -2.0
Episode 6, Steps taken: 516, Time elapsed: 51.58318328857422, Total reward: -15.0
Episode 7, Steps taken: 1101, Time elapsed: 109.98961114883423, Total reward: -5.0
Episode 8, Steps taken: 1786, Time elapsed: 178.5037899017334, Total reward: -24.0
Episode 9, Steps taken: 4006, Time elapsed: 400.52717185020447, Total reward: -24.0
Episode 10, Steps taken: 54, Time elapsed: 5.401919603347778, Total reward: -1.0
Episode 11, Steps taken: 627, Time elapsed: 62.6712908744812, Total reward: 0.0
Episode 12, Steps taken: 2648, Time elapsed: 264.68278884887

Episode 100/400Episode 100, Steps taken: 2019, Time elapsed: 201.84026408195496, Total reward: -1.0
Episode 101, Steps taken: 1913, Time elapsed: 191.21023511886597, Total reward: -35.0
Episode 102, Steps taken: 721, Time elapsed: 72.00407552719116, Total reward: -5.0
Episode 103, Steps taken: 419, Time elapsed: 41.838796615600586, Total reward: -1.0
Episode 104, Steps taken: 842, Time elapsed: 84.11452436447144, Total reward: -10.0
Episode 105, Steps taken: 1601, Time elapsed: 160.0652461051941, Total reward: -27.0
Episode 106, Steps taken: 1762, Time elapsed: 176.05212664604187, Total reward: -22.0
Episode 107, Steps taken: 2971, Time elapsed: 297.0610935688019, Total reward: -41.0
Episode 108, Steps taken: 3582, Time elapsed: 358.60643219947815, Total reward: -65.0
Episode 109, Steps taken: 1118, Time elapsed: 111.69622564315796, Total reward: -36.0
Episode 110, Steps taken: 357, Time elapsed: 35.721726179122925, Total reward: 1.0
Episode 111, Steps taken: 56, Time elapsed: 5.517662

Episode 198, Steps taken: 1061, Time elapsed: 105.98809051513672, Total reward: -4.0
Episode 199, Steps taken: 1529, Time elapsed: 152.8378837108612, Total reward: -56.0
Episode 200/400Episode 200, Steps taken: 4548, Time elapsed: 454.7270121574402, Total reward: -88.0
Episode 201, Steps taken: 2158, Time elapsed: 215.74714255332947, Total reward: -39.0
Episode 202, Steps taken: 2190, Time elapsed: 218.9100878238678, Total reward: -14.0
Episode 203, Steps taken: 85, Time elapsed: 8.456557035446167, Total reward: -4.0
Episode 204, Steps taken: 2012, Time elapsed: 201.05030488967896, Total reward: -54.0
Episode 205, Steps taken: 1329, Time elapsed: 132.83111596107483, Total reward: -11.0
Episode 206, Steps taken: 2377, Time elapsed: 237.62399220466614, Total reward: -108.0
Episode 207, Steps taken: 350, Time elapsed: 34.93128848075867, Total reward: -3.0
Episode 208, Steps taken: 1428, Time elapsed: 142.7144331932068, Total reward: -60.0
Episode 209, Steps taken: 951, Time elapsed: 94.99

KeyboardInterrupt: 

In [328]:
env.close()