In [None]:
from unityagents import UnityEnvironment
import numpy as np

# Create the Unity environment from the Tennis build at the given relative path.
env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")

# The first brain listed by the environment is used as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (train_mode=True) and keep the info reported for the default brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the sizes that the later cells rely on.
num_agents = len(env_info.agents)                # one entry per agent in the reset info
action_size = brain.vector_action_space_size     # length of each agent's action vector
states = env_info.vector_observations            # one observation row per agent
state_size = states.shape[1]                     # length of a single agent's observation

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])


In [None]:
# Baseline: let the agents act randomly for 5 episodes and report each episode's best score.
for i in range(1, 6):
    # Begin a new episode (train_mode=False) and read the initial per-agent observations.
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)  # running sum of rewards, one slot per agent

    while True:
        # Draw a standard-normal action vector per agent, clamped into [-1, 1].
        actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)

        # Send all actions to the environment and unpack the result for the default brain.
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        scores += rewards        # accumulate this step's reward for each agent
        states = next_states     # roll the observations over to the next time step

        # The episode ends as soon as any agent reports done.
        if np.any(dones):
            break

    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

In [None]:
# Shut down the Unity environment created in the setup cell.
# NOTE(review): the MADDPG training cell further down still calls env.reset() and
# env.step() on this same `env`. If this cell runs first (e.g. Restart & Run All),
# those calls presumably fail on a closed environment — confirm, and consider
# moving this call to after training.
env.close()

### Multi-Agent Deep Deterministic Policy Gradient (MADDPG)

In [None]:
# Train the MADDPG agents on the Tennis environment.
#
# Relies on `env`, `brain_name`, `num_agents`, `state_size` and `action_size`
# defined by the setup cell at the top of the notebook.

# Import necessary libraries
from maddpg import MADDPG
from collections import deque

# Hyperparameters handed to MADDPG for each underlying agent.
# NOTE(review): the meaning of each key is defined by the `maddpg` module, not here.
ddpg_agent_params = {
    'state_size': state_size,
    'action_size': action_size,
    'hidden_size_1': 256,
    'hidden_size_2': 256,
    'gamma': 0.99,
    'tau': 1e-3,
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'random_seed': 42
}

# Hyperparameters for the MADDPG wrapper; it shares the agents' random seed.
maddpg_params = {
    'buffer_size': int(1e5),
    'batch_size': 256,
    'learn_every': 10,
    'num_training': 1,
    'random_seed': ddpg_agent_params['random_seed']
}

# Additional parameters
n_episodes = 3000 #4000
t_max = 100                  # hard cap on environment steps per episode
score_window_size = 100      # number of most recent episodes in the rolling mean
solved_mean_score = 0.5      # rolling-mean score that counts as a "high score" episode
solved_episode_count = 100   # high-score episodes required before training stops

maddpg = MADDPG(num_agents, maddpg_params, ddpg_agent_params)
score_all_episodes = []                                      # best agent score of every episode
score_all_episodes_window = deque(maxlen=score_window_size)  # same, most recent episodes only
mean_overall_score = []                                      # rolling mean after every episode

# Number of episodes whose rolling mean reached `solved_mean_score`.
# It must exist before the loop (incrementing an undefined name raises NameError).
# It is cumulative: it is not reset when the rolling mean drops again.
high_score_count = 0

# Main loop. Episodes are numbered from 1 so the progress output and the
# "solved in N episodes" message report the number of episodes actually played.
for i_episode in range(1, n_episodes + 1):
    # Reset with train_mode=True, as the setup cell does; train_mode=False is the
    # setting the random-play demo uses.
    env_info = env.reset(train_mode=True)[brain_name]

    # Initialize a random process for action exploration
    maddpg.reset()

    # Receive initial state
    states = env_info.vector_observations   # current state (for each agent)
    scores = np.zeros(num_agents)           # accumulated reward (for each agent)

    for t in range(t_max):
        # Select action for each agent w.r.t. current policy and exploration
        actions = maddpg.act(states, add_noise=True)

        # Send all actions to the environment
        env_info = env.step(actions)[brain_name]

        next_states = env_info.vector_observations   # next state (for each agent)
        rewards = env_info.rewards                   # reward (for each agent)
        dones = env_info.local_done                  # episode-finished flags

        # This adds the experience to the buffer and makes each agent learn if
        # enough experiences are available
        maddpg.step(states, actions, rewards, next_states, dones)

        scores += rewards        # update the score (for each agent)
        states = next_states     # roll over states to next time step
        if np.any(dones):        # exit loop if episode finished
            break

    # The episode score is the best score among the agents.
    episode_score = np.max(scores)
    score_all_episodes.append(episode_score)
    score_all_episodes_window.append(episode_score)

    mean_score = np.mean(score_all_episodes_window)
    mean_overall_score.append(mean_score)

    # Progress line, overwritten in place; every 10th episode is kept on its own line.
    print(f"\rEpisode {i_episode}/{n_episodes} - Mean Score: {mean_score:.2f}", end="")

    if i_episode % 10 == 0:
        print(f"\rEpisode {i_episode} - Mean Score: {mean_score:.2f}")

    # Stop and save once enough episodes had a rolling mean at or above the target.
    if mean_score >= solved_mean_score:
        high_score_count += 1
        if high_score_count >= solved_episode_count:
            print(f"Environment solved in {i_episode} episodes!")
            maddpg.save_models()
            break