In [None]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Unity Reacher build shipped next to this notebook.
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

# The first registered brain is the one used throughout the notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset with train_mode=True to obtain an initial snapshot of the environment.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the problem dimensions first, then report them in one place.
num_agents = len(env_info.agents)                # how many agents the env reports
action_size = brain.vector_action_space_size     # length of one action vector
states = env_info.vector_observations            # one observation row per agent
state_size = states.shape[1]                     # length of one observation

print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

## Deep Deterministic Policy Gradient Implementation

In [None]:
from ddpg_agent import DDPGAgent
from collections import deque
import torch

# ---- DDPG hyperparameters ----
BUFFER_SIZE = 100_000   # capacity of the replay buffer
BATCH_SIZE = 256        # transitions per minibatch
GAMMA = 0.99            # discount factor
TAU = 1e-3              # interpolation factor for the soft target update
LR_ACTOR = 3e-5         # actor learning rate
LR_CRITIC = 4e-4        # critic learning rate
HIDDEN_SIZE_1 = 256     # width of the first hidden layer
HIDDEN_SIZE_2 = 256     # width of the second hidden layer
WEIGHT_DECAY = 0.0      # L2 weight decay
LEARN_EVERY = 40        # learn every x steps
NUM_TRAINING = 20       # training iterations per update

# ---- Run length ----
n_episodes = 800
t_max = 900  # upper bound on timesteps within one episode

# ---- Bookkeeping filled in by the training loop ----
scores = []                        # score of each episode
mean_scores = []                   # mean of scores_window after each episode
scores_window = deque(maxlen=100)  # keeps only the 100 most recent episode scores
high_score_count = 0               # count of episodes with high score of 30 or more

agent = DDPGAgent(
    state_size=state_size,
    action_size=action_size,
    hidden_size_1=HIDDEN_SIZE_1,
    hidden_size_2=HIDDEN_SIZE_2,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    weight_decay=WEIGHT_DECAY,
    learn_every=LEARN_EVERY,
    num_training=NUM_TRAINING,
    random_seed=42,
)

# Main DDPG training loop.
#
# Each episode: reset the exploration noise and the environment, roll out at
# most t_max steps with the noisy policy, and hand every transition to
# agent.step() (which stores it and learns). After each episode the score is
# appended to `scores`, and the mean of `scores_window` to `mean_scores`.
#
# The run stops early once 100 episodes (not necessarily consecutive) have
# ended with a windowed mean score of at least 30.0. The actor and critic
# weights are written to disk when the loop finishes, whether it stopped
# early or ran through all n_episodes, so the trained networks are never lost.
for i_episode in range(1, n_episodes + 1):
    # Initialize a random process for action exploration
    agent.noise_reset()  # reset the noise process for each episode
    # Receive initial state and reset scores
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # observation at index 0 only
    score = 0  # initialize the score

    for t in range(1, t_max + 1):
        # Select an action according to the current policy and exploration noise
        action = agent.act(state, add_noise=True)

        # Execute the action and observe the next state and reward
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]  # get next state
        reward = env_info.rewards[0]  # get reward
        done = env_info.local_done[0]  # get done

        # Add to experience and learn
        agent.step(state, action, reward, next_state, done)

        state = next_state
        score += reward
        if done:
            break

    scores.append(score)
    scores_window.append(score)
    # Compute the windowed mean once per episode and reuse it below.
    mean_score = np.mean(scores_window)
    mean_scores.append(mean_score)
    print(f"\rEpisode {i_episode}/{n_episodes} - Mean Score: {mean_score:.2f}", end="")

    if i_episode % 10 == 0:
        # Every 10th episode ends the line so the value stays visible in the log.
        print(f"\rEpisode {i_episode} - Mean Score: {mean_score}")

    if mean_score >= 30.0:
        high_score_count += 1
        if high_score_count >= 100:
            print(f"Environment solved in {i_episode} episodes!")
            break

# Persist the networks on both exit paths (solved early or episode budget
# exhausted). Previously the checkpoints were only written in the "solved"
# branch, so an unsolved run discarded its weights and left the test cell
# with missing or stale checkpoint files.
torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

### Plot the scores and save them, along with the hyperparameters, to a JSON file

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Two stacked panels: per-episode scores on top, windowed mean scores below.
fig = plt.figure()

# Upper panel: raw score of every episode.
ax = fig.add_subplot(211)
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')

# Lower panel: the mean score recorded after every episode.
ax_mean = fig.add_subplot(212)
ax_mean.plot(np.arange(len(mean_scores)), mean_scores)
ax_mean.set_ylabel('Mean Score')
ax_mean.set_xlabel('Episode #')

plt.show()

In [None]:
import json

# Bundle the run configuration and its results, then write them to disk as JSON.

# Hyperparameters exactly as defined in the training cell.
hyperparameters = {
    'BUFFER_SIZE': BUFFER_SIZE,
    'BATCH_SIZE': BATCH_SIZE,
    'GAMMA': GAMMA,
    'TAU': TAU,
    'LR_ACTOR': LR_ACTOR,
    'LR_CRITIC': LR_CRITIC,
    'HIDDEN_SIZE_1': HIDDEN_SIZE_1,
    'HIDDEN_SIZE_2': HIDDEN_SIZE_2,
    'WEIGHT_DECAY': WEIGHT_DECAY,
    'LEARN_EVERY': LEARN_EVERY,
    'NUM_TRAINING': NUM_TRAINING,
}

# Settings read back from the agent's noise process; mu is converted with
# .tolist() so that it can be serialized.
ou_noise = agent.noise
noise_settings = {
    'OU_mean': ou_noise.mu.tolist(),
    'OU_theta': ou_noise.theta,
    'OU_sigma': ou_noise.sigma,
}

# Per-episode results collected by the training loop.
results = {
    'scores': scores,
    'mean_scores': mean_scores,
}

# Merge in this order so the key order of the written file stays the same.
data = {**hyperparameters, **noise_settings, **results}

with open('training_scores.json', 'w') as f:
    json.dump(data, f)

## Test the trained agent

In [None]:
# Test the trained agent
from ddpg_agent import DDPGAgent

import json
import torch

# Read back the hyperparameters (and scores) stored by the training run.
with open('training_scores.json', 'r') as f:
    data = json.load(f)

# Restore the hyperparameters under their original names.
BUFFER_SIZE, BATCH_SIZE = data['BUFFER_SIZE'], data['BATCH_SIZE']
GAMMA, TAU = data['GAMMA'], data['TAU']
LR_ACTOR, LR_CRITIC = data['LR_ACTOR'], data['LR_CRITIC']
HIDDEN_SIZE_1, HIDDEN_SIZE_2 = data['HIDDEN_SIZE_1'], data['HIDDEN_SIZE_2']
WEIGHT_DECAY = data['WEIGHT_DECAY']
LEARN_EVERY, NUM_TRAINING = data['LEARN_EVERY'], data['NUM_TRAINING']

# Build a fresh agent with the stored configuration.
agent = DDPGAgent(
    state_size=state_size,
    action_size=action_size,
    hidden_size_1=HIDDEN_SIZE_1,
    hidden_size_2=HIDDEN_SIZE_2,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    weight_decay=WEIGHT_DECAY,
    learn_every=LEARN_EVERY,
    num_training=NUM_TRAINING,
    random_seed=42,
)

# Load the saved weights into the local actor and critic; map_location sends
# the stored tensors to the CPU while loading.
actor_weights = torch.load('checkpoint_actor.pth', map_location=torch.device('cpu'))
agent.actor_local.load_state_dict(actor_weights)
critic_weights = torch.load('checkpoint_critic.pth', map_location=torch.device('cpu'))
agent.critic_local.load_state_dict(critic_weights)

# Start one evaluation episode with train_mode=False.
env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations[0]
score = 0

t_max = 900

for t in range(1, t_max + 1):
    # No exploration noise during evaluation.
    action = agent.act(state, add_noise=False)
    env_info = env.step(action)[brain_name]
    next_state = env_info.vector_observations[0]
    reward, done = env_info.rewards[0], env_info.local_done[0]
    score += reward
    state = next_state
    if done:
        break

print(f"Score achieved by the trained agent: {score}")

# Shut the environment down now that evaluation is finished.
env.close()