In [1]:
from unityagents import UnityEnvironment
import numpy as np
import time
import matplotlib.pyplot as plt
from collections import deque
import torch
from maddpg_agent import MADDPG

In [2]:
env = UnityEnvironment(file_name='../Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
#reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [5]:
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
i = 0
while True:
    i+=1
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        print(i)
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

15
Total score (averaged over agents) this episode: -0.004999999888241291


In [6]:
state_dim = int(env_info.vector_observations.shape[1])
action_dim = int(brain.vector_action_space_size)

In [7]:
agent = MADDPG(num_agents, state_dim, action_dim, 10)

In [8]:
def ddpg(n_episodes=10, max_t=1000):
    scores_window = deque(maxlen=100)
    scores = []
    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        scores_episode = np.zeros(num_agents)
            
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]       
            next_states = env_info.vector_observations     
            rewards = env_info.rewards
            rewards = [0.1 if rew > 0 else 0 for rew in rewards]
            dones = env_info.local_done        
            agent.step(states, actions, rewards, next_states, dones) 
            states = next_states
            scores_episode += rewards
            
            print('\rEpisode {}\tStep: {:.2f}\t Score: {:.2f}'.format(i_episode, t, np.mean(scores)), end="")
            if np.any(dones):
                break 
        score = np.max(scores_episode)
        scores_window.append(score)       
        scores.append(score)

        print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, score, np.mean(scores_window)))
        
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            
        if np.mean(scores_window)>=0.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            for i, agent_i in enumerate(agent.agents):
                torch.save(agent_i.actor_local.state_dict(),  f"../model_weights/actor_agent_{i+1}.pth")
                torch.save(agent_i.critic_local.state_dict(), f"../model_weights/critic_agent_{i+1}.pth")
            break
            
    return scores

In [9]:
scores = ddpg(n_episodes=100)

Episode 1	Step: 0.00	 Score: nan

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode 1	Step: 1.00	 Score: nanEpisode 1	Step: 2.00	 Score: nanEpisode 1	Step: 3.00	 Score: nanEpisode 1	Step: 4.00	 Score: nanEpisode 1	Step: 5.00	 Score: nanEpisode 1	Step: 6.00	 Score: nanEpisode 1	Step: 7.00	 Score: nanEpisode 1	Step: 8.00	 Score: nanEpisode 1	Step: 9.00	 Score: nanEpisode 1	Step: 10.00	 Score: nanEpisode 1	Step: 11.00	 Score: nanEpisode 1	Step: 12.00	 Score: nanEpisode 1	Step: 13.00	 Score: nanEpisode 1	Score: 0.00	Average Score: 0.00

Environment solved in 1 episodes!	Average Score: 0.00
