In [1]:
import os, sys
from unityagents import UnityEnvironment
import numpy as np

import torch

sys.path.insert(0, 'src')

from model import (
    Critic,
    Actor,
    DDPG,
)

from buffer import ReplayBuffer
from noise import OUNoise
from torch.utils.tensorboard import SummaryWriter

from environment_interaction import (
    evaluation,
    train,
)

%load_ext autoreload
%autoreload 2

## Information about the environment that we are going to solve

In [2]:
# --- Collect everything we need from the Unity environment -----------------
env = UnityEnvironment(file_name='unity/Reacher_Linux_Many/Reacher.x86_64')

# The first brain exposed by the environment is the one we control
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once so that we can inspect what the environment reports back
env_info = env.reset(train_mode=True)[brain_name]

num_agents = len(env_info.agents)               # how many agents are present
action_size = brain.vector_action_space_size    # length of one action vector
states = env_info.vector_observations           # one row of observations per agent
state_size = states.shape[1]                    # length of one observation vector

# --- Report what we found ---------------------------------------------------
print('Number of agents:', num_agents, "\n")
print('Size of each action:', action_size, "\n")
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size), "\n")
print('The state for the first agent looks like:', states[0], "\n")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20 

Size of each action: 4 

There are 20 agents. Each observes a state with length: 33 

The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01] 



## Setting up hyperparameters

In [3]:
# The name of the experiment, as seen in tensorboard.
# (The identifier keeps its original spelling because later cells reference it.)
experiement_name = "final"

# Size of the batches
batch_size = 128
# Size of the replay buffer
replay_buffer_size = int(1e6)

# In the replay buffer every stored tuple has a weight (w). The weight is normalized by the number of
# times (seen) the tuple has already been used for training: w <- w / seen ** beta.
# High beta -> we prefer to sample unseen tuples
beta = 1

# After the weights have been scaled and converted to a probability distribution (p), the distribution
# is sharpened element-wise: p.^beta_2. High beta_2 -> we will only sample tuples with high weight
beta_2 = 2

# Number of episodes used for training
episodes = 2000

# Gamma in the td-loss
gamma = 0.99

# We have the possibility of scaling the reward with a factor
scale_reward = 10

# Where to run inference/training. Prefer the GPU, but fall back to the CPU when CUDA is not
# available so the notebook still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# We wait (inference_steps), for each training episode, before we update the models
inference_steps = 20
# Once we start updating the models we do so for (update_steps)
update_steps = 45
# Total number of training steps per episode: (1000 / inference_steps) * update_steps

# We randomly, with probability sigma, choose to use the action generated by our noise generator
# instead of the output from the actor network

# Start of sigma
sigma_init = 1
# decay factor
sigma_decay = 0.86
# stop of sigma
sigma_end = 0.1

# Ornstein–Uhlenbeck sigma
ou_sigma = 0.25

# Number of times we are running evaluation episodes (that we average over) after each training episode
eval_rounds = 1

# Learning rates
lr_critic = 10**-4
lr_actor = 10**-4
# Controls how we are updating the target network after each training step
tau_critic = 10**-3
tau_actor = 10**-3

# Number of output nodes in the first 2 layers
hidden_dim = 256

# Number of output nodes in the following hidden layers
squeeze_dim = 128

## Setting up the networks

In [4]:
# Actor: input width is the state size, output width is the action size
actor = Actor(in_dim=state_size, out_dim=action_size, hidden_dim=hidden_dim, squeeze_dim=squeeze_dim)

# Critic: input width is state size + action size, with a single output
critic = Critic(in_dim=state_size + action_size, out_dim=1, hidden_dim=hidden_dim, squeeze_dim=squeeze_dim)

# The targets start out as copies of the networks that will undergo training
actor_target = actor.copy()
critic_target = critic.copy()

# Bundle the four networks together with their learning rates and target-update factors
ddpg = DDPG(
    critic=critic, actor=actor,
    critic_target=critic_target, actor_target=actor_target,
    lr_critic=lr_critic, lr_actor=lr_actor,
    tau_critic=tau_critic, tau_actor=tau_actor,
)

# Storage for the tuples collected while interacting with the environment
replaybuffer = ReplayBuffer(replay_buffer_size, state_size=state_size, action_size=action_size)

# Send the models to the device of choice
ddpg = ddpg.to(device)

## Train the model

In [None]:
# Setting up the tensorboard writer
writer = SummaryWriter(f"continuous_control/runs/{experiement_name}")

# Setting up the Ornstein–Uhlenbeck noise generator (one noise process per agent)
ounoise = OUNoise(
    n=len(env_info.agents),
    n_actions=action_size,
    mu=0,
    theta=0.15,
    sigma=ou_sigma,
    low=-1,
    high=1,
)

# Train the models.
# The writer is closed in a `finally` so buffered tensorboard events are flushed to disk
# even when the (long) training run is interrupted or raises.
try:
    train(
        env=env,
        ddpg=ddpg,
        replaybuffer=replaybuffer,
        ounoise=ounoise,
        writer=writer,
        episodes=episodes,
        brain_name=brain_name,
        device=device,
        batch_size=batch_size,
        gamma=gamma,
        sigma_init=sigma_init,
        sigma_end=sigma_end,
        sigma_decay=sigma_decay,
        evaluation_rounds=eval_rounds,
        train_mode=True,
        inference_steps=inference_steps,
        update_steps=update_steps,
        beta=beta,
        beta_2=beta_2,
        scale_reward=scale_reward,
    )
finally:
    writer.close()

## Evaluate the model

In [None]:
# Load the checkpoints of the trained models.
# map_location=device remaps the stored tensors onto the configured device, so checkpoints
# that were saved on a GPU can also be restored on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
ddpg._actor.load_state_dict(torch.load("models/actor.ckp", map_location=device))
ddpg._critic.load_state_dict(torch.load("models/critic.ckp", map_location=device))

# Run the evaluation episodes with the restored networks.
# NOTE(review): train_mode=True is passed here as well -- confirm this is intended for evaluation.
score = evaluation(
    env=env,
    ddpg=ddpg,
    episodes=100,
    device=device,
    brain_name=brain_name,
    train_mode=True,
)

In [6]:
# Show the score returned by the evaluation run above
# (presumably aggregated over the 100 evaluation episodes -- confirm against `evaluation`).
print(score)

39.43875911847502
