# Policy template for the VerySimpleSpread Scenario

This notebook serves as a template for how to create a policy for the VerySimpleSpread Scenario - two agents and two landmarks. In order to create your own policy and execute it in the VerySimpleSpread environment, follow the instructions below:
1. Copy this notebook and rename it to your policy name.
2. Modify the `RandomPolicy` class code to be your policy's code, and rename the class.
3. Modify line 7 in the third notebook cell to use your policy instead of the `RandomPolicy`.

In [4]:
import time

import numpy as np
from tabulate import tabulate

from envs.multiagent_particle_envs import make_env
from envs.multiagent_particle_envs.multiagent.environment import MultiAgentEnv
from envs.multiagent_particle_envs.multiagent.scenarios import load

from experiments.policy import VerySimpleSpreadPolicy

In [7]:
class Q_learning(VerySimpleSpreadPolicy):
    """Q-learning policy skeleton; at present it still returns random actions.

    The learning step in ``update`` is not implemented yet, so ``action``
    does not depend on the observation it receives.

    Parameters
    ----------
    _id : int or str
        Unique identifier for policy.
    verbose : bool, optional, default True
        If True, prints the observation and the chosen action on every
        call to ``action``.
    """

    def __init__(self, _id, verbose=True):
        self._id = _id
        self.verbose = verbose

    def update(self, o, a, r, o1):
        """Update policy given previous observation, action, and reward.

        Currently a no-op placeholder: it accepts the transition and
        returns None without modifying any state.

        Parameters
        ----------
        o : list
            Observation the action was chosen from (same layout as ``obs``
            in ``action``).
        a : list
            Action vector that was passed to the environment.
        r : float
            Reward received for that action.
        o1 : list
            Observation returned by the environment after the action.
        """
        # TODO: modify policy using these parameters.
        # s1 = T(s, a)
        # Q[s, a] = Q[s, a] + alpha * (r + gamma * max_a1(Q[s1, a1]) - Q[s, a])

    def action(self, obs):
        """Returns a random action.

        Parameters
        ----------
        obs : list
            <'p_velx', 'p_vely', 'p_posx', 'p_posy', 'landmark1_posx', 'landmark1_posy',
            'landmark2_posx', 'landmark2_posy', 'ag_posx', 'ag_posy', 'ag_com1', 'ag_com2'>

        Returns
        -------
        numpy.ndarray of float, length 7
            All entries are 0 except one randomly chosen index, which holds
            a strength drawn from N(0.5, 0.1) and rounded to 3 decimals.

        NOTE
        ----
        Action indexes are:
            0: No-op
            1: Move left
            2: Move right
            3: Move up
            4: Move down
            5: Communicate dimension 1
            6: Communicate dimension 2
        """

        n_actions = 7

        # Construct returned action array
        action = np.zeros(n_actions)

        # -------------------------- CHANGE THIS --------------------------
        # Randomly select an action dimension. The upper bound of
        # np.random.randint is exclusive, so pass n_actions (not
        # n_actions - 1) to make every index 0..6 selectable.
        action_idx = np.random.randint(0, n_actions)
        # Randomly choose action strength from normal distribution N(.5, .1)
        action_strength = .1 * np.random.randn() + .5
        action[action_idx] = round(action_strength, 3)
        # -----------------------------------------------------------------

        if self.verbose:
            # _obs_str / _action_str are not defined in this class; they are
            # presumably inherited from VerySimpleSpreadPolicy -- TODO confirm.
            print('\nAgent {}'.format(self._id))
            # Print observation
            obs_str = self._obs_str(obs)
            print('\nObservation')
            print(obs_str)
            # Print action
            print('\nAction')
            action_str = self._action_str(action)
            print(action_str)

        return action

In [8]:
# Number of loop iterations to run. Each iteration performs exactly one
# env.step() call.
max_episodes = 5

# Build the scenario, its world, and the environment wrapping them.
scenario = load('very_simple_spread.py').Scenario()
world = scenario.make_world()
env = MultiAgentEnv(
    world,
    scenario.reset_world,
    scenario.reward,
    scenario.observation,
    info_callback=None,
    shared_viewer=False,
)

# One verbose policy per agent, identified by its agent index.
policies = [Q_learning(agent_idx, True) for agent_idx in range(env.n)]

obs_n = env.reset()
# NOTE: `rewards` is allocated here but never read in this cell.
rewards = np.zeros(len(env.world.agents))
env.render()

total_rewards = []
episodes_count = 0

while episodes_count < max_episodes:
    # Ask each agent's policy for an action based on its own observation.
    act_n = [policy.action(obs_n[i]) for i, policy in enumerate(policies)]

    # Advance the environment, remembering the observations acted upon.
    prev_obs_n = obs_n
    obs_n, reward_n, done_n, _ = env.step(act_n)

    # Render all agent views.
    env.render()

    # Record the reward of agent 0 (the original author notes that both
    # agents get the same reward).
    total_rewards.append(env._get_reward(env.world.agents[0]))

    # Feed the transition (o, a, r, o1) back to every policy.
    for i, policy in enumerate(policies):
        policy.update(prev_obs_n[i], act_n[i], reward_n[i], obs_n[i])

    print('\n' + 80*'=' + '\n')

    episodes_count += 1
    # Pause so the rendered frames can be followed by eye.
    time.sleep(1)

print(total_rewards)


Agent 0

Observation
  vel_x    vel_y    pos_x    pos_y    lm1_x    lm1_y    lm2_x    lm2_y    a1_x    a1_x    a1_com1    a1_com2
-------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ---------  ---------
      0        0    0.608   -0.515   -0.063   -0.467   -0.684    1.504  -0.495   0.955          0          0

Action
  No-op    Move left    Move right    Move up    Move down    Communicate 1    Communicate 2
-------  -----------  ------------  ---------  -----------  ---------------  ---------------
      0        0.644             0          0            0                0                0

Agent 1

Observation
  vel_x    vel_y    pos_x    pos_y    lm1_x    lm1_y    lm2_x    lm2_y    a1_x    a1_x    a1_com1    a1_com2
-------  -------  -------  -------  -------  -------  -------  -------  ------  ------  ---------  ---------
      0        0    0.113    0.441    0.431   -1.422   -0.189    0.549   0.495  -0.955          0          0

Action
  No-op