In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Start the Reacher executable referenced by a relative path.
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

# The first brain listed by the environment is the one used below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# One reset with train_mode=True yields the initial info for that brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions first, then report them.
num_agents = len(env_info.agents)                # number of agents
action_size = brain.vector_action_space_size     # size of each action
states = env_info.vector_observations            # one observation row per agent
observed_agents, state_size = states.shape[:2]   # rows, entries per observation

print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(observed_agents, state_size))
print('The state for the first agent looks like:', states[0])

Found path: /lhome/jaiinga/git/nanodegree-drl/drl-p2-continuous-control/./Reacher_Linux/Reacher.x86_64
Mono path[0] = '/lhome/jaiinga/git/nanodegree-drl/drl-p2-continuous-control/./Reacher_Linux/Reacher_Data/Managed'
Mono config path = '/lhome/jaiinga/git/nanodegree-drl/drl-p2-continuous-control/./Reacher_Linux/Reacher_Data/MonoBleedingEdge/etc'
Preloaded 'ScreenSelector.so'
Preloaded 'libgrpc_csharp_ext.x64.so'
Unable to preload the following plugins:
	ScreenSelector.so
	libgrpc_csharp_ext.x86.so
Logging to /lhome/jaiinga/.config/unity3d/Unity Technologies/Unity Environment/Player.log


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [None]:
# Reset with train_mode=False and pick out the info for our brain.
reset_result = env.reset(train_mode=False)
env_info = reset_result[brain_name]
# Only the FIRST agent's observation vector is kept here (index 0),
# and the number of entries in it is printed.
states = env_info.vector_observations[0]
print(states.shape[0])

In [None]:
# Play a single episode with purely random actions and report the score.
env_info = env.reset(train_mode=False)[brain_name]    # reset the environment
states = env_info.vector_observations                 # current state of every agent
scores = np.zeros(num_agents)                         # running score of every agent

episode_over = False
while not episode_over:
    # Sample Gaussian actions for every agent and clamp them into [-1, 1].
    actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
    env_info = env.step(actions)[brain_name]          # send all actions to the environment
    next_states = env_info.vector_observations        # next state of every agent
    rewards = env_info.rewards                        # reward of every agent
    dones = env_info.local_done                       # per-agent episode-finished flags
    scores += rewards                                 # accumulate the rewards
    states = next_states                              # roll over to the next time step
    episode_over = np.any(dones)                      # stop once any agent is done

print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

## Deep Deterministic Policy Gradient Implementation

In [None]:
from ddpg_agent import DDPGAgent
from collections import deque
import torch

# Hyperparameters
BUFFER_SIZE = 100000  # replay buffer size
BATCH_SIZE = 256      # minibatch size
GAMMA = 0.99          # discount factor
TAU = 5e-4            # for soft update of target parameters
LR_ACTOR = 2e-5       # learning rate of the actor
LR_CRITIC = 2e-4      # learning rate of the critic
HIDDEN_SIZE_1 = 256   # size of the first hidden layer
HIDDEN_SIZE_2 = 256   # size of the second hidden layer
WEIGHT_DECAY = 0.0    # L2 weight decay
LEARN_EVERY = 40      # forwarded to DDPGAgent as learn_every
NUM_TRAINING = 20     # forwarded to DDPGAgent as num_training

# Stopping criterion (previously hard-coded literals inside the loop)
TARGET_SCORE = 30.0           # rolling-mean score that counts as a "high" episode
SCORE_WINDOW = 100            # number of most recent episodes in the rolling mean
REQUIRED_HIGH_EPISODES = 100  # high episodes needed before training stops

# Other parameters
n_episodes = 800
t_max = 900  # maximum number of timesteps per episode
scores = []  # list of scores from each episode
mean_scores = []  # rolling mean score after each episode
scores_window = deque(maxlen=SCORE_WINDOW)
# Number of episodes whose rolling mean was >= TARGET_SCORE.
# NOTE(review): this counter is never reset when the rolling mean drops below
# TARGET_SCORE, so the qualifying episodes need not be consecutive - confirm
# that this is the intended criterion.
high_score_count = 0

agent = DDPGAgent(state_size=state_size, action_size=action_size, hidden_size_1=HIDDEN_SIZE_1,
                 hidden_size_2=HIDDEN_SIZE_2, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA,
                 tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY, learn_every=LEARN_EVERY,
                 num_training=NUM_TRAINING, random_seed=42)

for i_episode in range(1, n_episodes + 1):
    # Initialize a random process for action exploration
    agent.noise_reset()  # reset the noise process for each episode
    # Receive initial state and reset the score
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # observation of the first agent
    score = 0  # initialize the score
    for t in range(1, t_max + 1):
        # Select an action according to the current policy and exploration noise
        action = agent.act(state, add_noise=True)

        # Execute the action and observe the next state, reward and done flag
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        # Add to experience and learn
        agent.step(state, action, reward, next_state, done)

        state = next_state
        score += reward
        if done:
            break

    scores.append(score)
    scores_window.append(score)  # save most recent score
    mean_score = np.mean(scores_window)  # computed once, reused below
    mean_scores.append(mean_score)  # save mean score
    print(f"\rEpisode {i_episode}/{n_episodes} - Mean Score: {mean_score:.2f}", end="")

    if i_episode % 10 == 0:
        print(f"\rEpisode {i_episode} - Mean Score: {mean_score}")

    if mean_score >= TARGET_SCORE:
        high_score_count += 1
        if high_score_count >= REQUIRED_HIGH_EPISODES:
            print(f"Environment solved in {i_episode} episodes!")
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
else:
    # The loop used up all episodes without hitting the stopping criterion.
    # Previously nothing was saved in this case and the whole run was lost;
    # keep the final weights under separate names so an existing "solved"
    # checkpoint from an earlier run is never overwritten.
    print(f"\nStopping criterion not met after {n_episodes} episodes; saving last weights.")
    torch.save(agent.actor_local.state_dict(), 'checkpoint_actor_last.pth')
    torch.save(agent.critic_local.state_dict(), 'checkpoint_critic_last.pth')



Episode 10/800 - Mean Score: 0.32Episode 10 - Mean Score: 0.3209999928250909
Episode 20/800 - Mean Score: 0.47Episode 20 - Mean Score: 0.47499998938292265
Episode 30/800 - Mean Score: 0.54Episode 30 - Mean Score: 0.5373333213229974
Episode 40/800 - Mean Score: 0.75Episode 40 - Mean Score: 0.7534999831579625
Episode 50/800 - Mean Score: 1.06Episode 50 - Mean Score: 1.0579999763518573
Episode 60/800 - Mean Score: 1.35Episode 60 - Mean Score: 1.3494999698363244
Episode 70/800 - Mean Score: 1.63Episode 70 - Mean Score: 1.6271428207733802
Episode 80/800 - Mean Score: 1.83Episode 80 - Mean Score: 1.8256249591941014
Episode 90/800 - Mean Score: 2.19Episode 90 - Mean Score: 2.1944443953947887


In [None]:
# Close the Unity environment held in `env` now that it is no longer needed.
env.close()

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
# Draw two stacked panels: per-episode scores on top, mean_scores underneath.
fig = plt.figure()

ax = fig.add_subplot(2, 1, 1)
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')

ax_mean = fig.add_subplot(2, 1, 2)
ax_mean.plot(np.arange(len(mean_scores)), mean_scores)
ax_mean.set_ylabel('Mean Score')
ax_mean.set_xlabel('Episode #')

plt.show()

In [None]:
import torch

# Print, one per line, whether torch reports CUDA as available and the
# value of torch.version.cuda.
for cuda_fact in (torch.cuda.is_available(), torch.version.cuda):
    print(cuda_fact)