In [1]:
# Best-effort cleanup: close an environment handle left over from a previous
# run of this cell before a new one is created below. On a fresh kernel `env`
# is not defined yet, so the resulting NameError is expected and ignored.
try:
    env.close()
except Exception:
    # `Exception` (rather than a bare `except:`) keeps the cleanup best-effort
    # while still letting KeyboardInterrupt / SystemExit propagate.
    pass

from unityagents import UnityEnvironment
import numpy as np
# Launch the Unity environment from the local build.
env = UnityEnvironment(file_name='reacher.app')

# The first brain listed by the environment is used as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep the info object belonging to that brain.
reset_result = env.reset(train_mode=True)
env_info = reset_result[brain_name]

# Collect the sizes that the later cells rely on.
num_agents = len(env_info.agents)                 # number of agents
action_size = brain.vector_action_space_size      # size of each action
states = env_info.vector_observations             # one observation row per agent
state_size = states.shape[1]                      # length of a single observation

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33


In [None]:
from ddpg import Agent
from collections import deque
import torch
from torch import optim
from time import time

# --- Hyperparameters ---------------------------------------------------------
gamma = .99              # discount factor handed to agent.learn
n_episodes = 2000        # upper bound on the number of training episodes
# Weight of the mean-squared-action term added to the environment reward.
# 0 disables the term (the previous literal `-0#.0001` also evaluated to 0;
# it appears a small negative weight such as -0.0001 was tried before).
torque_reward = 0.0
max_t = 10000            # maximum number of environment steps per episode
learn_every = 20         # run a block of learning updates every `learn_every` steps
learn_updates = 10       # number of agent.learn calls per block
solved_score = 30        # training stops once the windowed average exceeds this
agent = Agent(state_size, action_size,
              # passed as an int: it is a size, and a float may not be accepted
              # wherever the agent uses it as one (Agent is defined in ddpg.py)
              buffer_size=int(1e5), batch_size=128,
              learning_rate=0.0002, tau=0.001)
deterministic = True     # forwarded unchanged to agent.act and agent.learn

# --- Progress tracking -------------------------------------------------------
scores = []                            # one scalar (mean over agents) per episode
scores_window = deque(maxlen=100)      # most recent 100 episode scores
adj_scores_window = deque(maxlen=10)   # most recent 10 adjusted episode scores

for i_episode in range(1, n_episodes+1):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations               # get the current state
    # Episode totals are scalars: the per-step reward is averaged over agents
    # before being accumulated, so a per-agent array would only hold copies.
    score = 0.0
    adj_score = 0.0
    for t in range(max_t):
        action = agent.act(state, deterministic)
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations      # get the next state
        # Explicit array conversion so the addition below is array arithmetic.
        reward = np.asarray(env_info.rewards, dtype=float)
        # Adjusted reward = reward + torque_reward * mean(action**2) per agent.
        adj_reward = reward + np.mean(np.power(action, 2), 1) * torque_reward
        done = env_info.local_done                     # see if episode has finished
        agent.step(state, action, adj_reward, next_state, done)
        if t % learn_every == 0:
            for _ in range(learn_updates):
                agent.learn(gamma, deterministic)
        state = next_state
        score += np.mean(reward)
        adj_score += np.mean(adj_reward)
        if np.any(done):                               # stop as soon as any agent is done
            break
    scores_window.append(score)       # save most recent score
    adj_scores_window.append(adj_score)
    scores.append(score)              # save most recent score

    window_mean = np.mean(scores_window)
    adj_window_mean = np.mean(adj_scores_window)
    print('\rEpisode {}\tAverage Score: {:.2f} ({:.2f})'.format(i_episode, window_mean, adj_window_mean), end="")
    if i_episode % 10 == 0:
        # Same line again without end="" so every 10th report stays visible.
        print('\rEpisode {}\tAverage Score: {:.2f} ({:.2f})'.format(i_episode, window_mean, adj_window_mean))
    # Only declare success once the window is full; otherwise the average covers
    # fewer than 100 episodes and `i_episode - 100` would be negative.
    if len(scores_window) == scores_window.maxlen and window_mean > solved_score:
        print('\nEnvironment fit in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - scores_window.maxlen, window_mean))
        break

# Save the actor weights once, whether training stopped early or ran out of
# episodes. (The previous in-loop save referenced `agent.qnetwork_local`, which
# is inconsistent with the `local_actor` attribute used here.)
torch.save(agent.local_actor.state_dict(), 'checkpoint.pth')

Episode 10	Average Score: 1.13 (1.13)
Episode 20	Average Score: 1.18 (1.24)
Episode 30	Average Score: 1.18 (1.17)
Episode 40	Average Score: 1.17 (1.14)
Episode 50	Average Score: 1.13 (0.98)
Episode 60	Average Score: 1.10 (0.97)
Episode 70	Average Score: 1.10 (1.08)
Episode 80	Average Score: 1.10 (1.10)
Episode 88	Average Score: 1.10 (1.08)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


# Turn on matplotlib's interactive mode.
plt.ion()

# Plot the recorded `scores` against their episode index on a single set of axes.
fig, ax = plt.subplots()
episode_index = np.arange(len(scores))
ax.plot(episode_index, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Run one episode with train_mode=False and report the score averaged over agents.
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # current state (for each agent)
# NOTE(review): this rebinds `scores`, replacing the per-episode list built by
# the training cell; re-running the plotting cell afterwards would plot these
# per-agent totals instead of the training history.
scores = np.zeros(num_agents)                          # running score (for each agent)

dones = [False]                                        # nothing has finished yet
while not np.any(dones):                               # loop until any agent reports done
    # select an action for each agent, limited to the range [-1, 1]
    actions = np.clip(agent.act(states), -1, 1)
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # next state (for each agent)
    rewards = env_info.rewards                         # reward (for each agent)
    dones = env_info.local_done                        # finished flags (for each agent)
    scores += rewards                                  # accumulate the score (for each agent)
    states = next_states                               # roll over states to next time step
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))