In [1]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.autograd import Variable
import torch.nn.functional as F

def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
        
def hard_update(target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)
        
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Reacher_single")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [2]:
class Actor(nn.Module):
    
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Actor, self).__init__()
        self.action_space = action_space
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        
        self.mu = nn.Linear(hidden_size, action_space)
        self.mu.weight.data.mul_(0.1)
        self.mu.bias.data.mul_(0.1)
        
    def forward(self, state):
        x = state
        
        x = self.linear1(x)
        x = self.ln1(x)
        x = F.relu(x)
        
        x = self.linear2(x)
        x = self.ln2(x)
        x = F.relu(x)
        
        mu = F.tanh(self.mu(x))
        return mu
    
class Critic(nn.Module):
    
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Critic, self).__init__()
        self.action_space = action_space
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        
        self.linear2 = nn.Linear(hidden_size + action_space, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        
        self.V = nn.Linear(hidden_size, 1)
        self.V.weight.data.mul_(0.1)
        self.V.bias.data.mul_(0.1)
        
    def forward(self, state, action):
        x = state
        
        x = self.linear1(x)
        x = self.ln1(x)
        x = F.relu(x)

        x = torch.cat((x, action), 2)
        x = self.linear2(x)
        x = self.ln2(x)
        x = F.relu(x)
        
        V = self.V(x)
        return V
    
class DDPG(object):
    
    def __init__(self, gamma, tau, hidden_size, num_inputs, action_space):
        self.num_inputs = num_inputs
        self.action_space = action_space
        
        self.actor = Actor(hidden_size, num_inputs, action_space)
        self.actor_target = Actor(hidden_size, num_inputs, action_space)
        self.actor_perturbed = Actor(hidden_size, num_inputs, action_space)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-3)
        
        self.critic = Critic(hidden_size, num_inputs, action_space)
        self.critic_target = Critic(hidden_size, num_inputs, action_space)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-4)
        
        self.gamma = gamma
        self.tau = tau
        
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self. critic)
        
    def select_action(self, state, action_noise=None, param_noise=None):
        self.actor.eval()
        if param_noise is not None:
            mu = self.actor_perturbed((Variable(state)))
        else:
            mu = self.actor((Variable(state)))
            
        self.actor.train()
        mu = mu.data
        
        if action_noise is not None:
            mu += torch.Tensor(action_noise.noise())
            
        return mu.clamp(-1, 1)
    
    def update_parameters(self, batch):
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action)).view(BATCH_SIZE, NUMBER_OF_AGENTS, ACTION_SIZE)

        reward_batch = Variable(torch.cat(batch.reward))
        mask_batch = Variable(torch.cat(batch.mask))
        next_state_batch = Variable(torch.cat(batch.next_state))
        
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch)
        
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)
        expected_state_action_batch = reward_batch + (self.gamma * mask_batch * next_state_action_values)
        
        self.critic_optim.zero_grad()
        
        state_action_batch = self.critic((state_batch), (action_batch))
        
        value_loss = F.mse_loss(state_action_batch, expected_state_action_batch)
        value_loss.backward()
        self.critic_optim.step()
        
        self.actor_optim.zero_grad()
        
        policy_loss = -self.critic((state_batch), self.actor((state_batch)))
        
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        
        return value_loss.item(), policy_loss.item()
    
    def perturb_actor_parameters(self, param_noise):
        hard_update(self.actor_perturbed, self.actor)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                pass
            param = params[name]
            param += torch.randn(param.shape) * param_noise.current_stddev
            
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
            
        if actor_path is None:
            actor_path = f'models/ddpg_actor_{env_name}_{suffix}'
        if critic_path is None:
            critic_path = f'models/ddpg_critic_{env_name}_{suffix}'
        print(f'Saving models to {actor_path} and {critic_path}')
        torch.save(self.actor.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
        
    def load_model(self, actor_path, critic_path):
        print(f'Loading models from {actor_path} and {critic_path}')
        if actor_path is not None:
            self.actor.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.criric.load_state_dict(torch.load(critic_path))
            
import numpy as np
import torch
from math import sqrt

"""
From OpenAI Baselines:
https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
"""
class AdaptiveParamNoiseSpec(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adaptation_coefficient=1.01):
        """
        Note that initial_stddev and current_stddev refer to std of parameter noise, 
        but desired_action_stddev refers to (as name notes) desired std in action space
        """
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

        self.current_stddev = initial_stddev

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            # Decrease stddev.
            self.current_stddev /= self.adaptation_coefficient
        else:
            # Increase stddev.
            self.current_stddev *= self.adaptation_coefficient

    def get_stats(self):
        stats = {
            'param_noise_stddev': self.current_stddev,
        }
        return stats

    def __repr__(self):
        fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adaptation_coefficient={})'
        return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adaptation_coefficient)

def ddpg_distance_metric(actions1, actions2):
    """
    Compute "distance" between actions taken by two policies at the same states
    Expects numpy arrays
    """
    diff = actions1-actions2
    mean_diff = np.mean(np.square(diff), axis=0)
    dist = sqrt(np.mean(mean_diff))
    return dist

import random
from collections import namedtuple

# Taken from
# https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb

Transition = namedtuple(
    'Transition', ('state', 'action', 'mask', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [3]:
# env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
# states = env_info.vector_observations                  # get the current state (for each agent)
# scores = np.zeros(num_agents)                          # initialize the score (for each agent)
# while True:
#     actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
#     actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
#     env_info = env.step(actions)[brain_name]           # send all actions to tne environment
#     next_states = env_info.vector_observations         # get next state (for each agent)
#     rewards = env_info.rewards                         # get reward (for each agent)
#     dones = env_info.local_done                        # see if episode finished
#     scores += env_info.rewards                         # update the score (for each agent)
#     states = next_states                               # roll over states to next time step
#     if np.any(dones):                                  # exit loop if episode finished
#         break
# print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUMBER_OF_AGENTS = states.shape[0]
STATE_SIZE = state_size
ACTION_SIZE = action_size

GAMMA = 0.99
TAU = 0.9
HIDDEN_SIZE = STATE_SIZE * 2
NOISE_SCALE = 0.1
EPISODES = 500
BATCH_SIZE = 64
UPDATES_PER_STEP = 4
REPLAY_SIZE = 1000000
SEED = 33
TMAX = 300

In [None]:
import math
from collections import namedtuple
from itertools import count
from tqdm import tqdm
from tensorboardX import SummaryWriter
import numpy as np


writer = SummaryWriter()
agent = DDPG(GAMMA, TAU, HIDDEN_SIZE, STATE_SIZE, ACTION_SIZE)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=NOISE_SCALE, adaptation_coefficient=1.05)
memory = ReplayMemory(REPLAY_SIZE)

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(EPISODES):
    t = 0
    env_info = env.reset(train_mode=True)[brain_name] 
    state = torch.Tensor([env_info.vector_observations])
    
    agent.perturb_actor_parameters(param_noise)
    
    episode_reward = 0
    
    while True:
        action = agent.select_action(state, None, param_noise).view(NUMBER_OF_AGENTS, ACTION_SIZE)
        env_info = env.step(action.numpy())[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done
        total_numsteps += 1
        episode_reward += sum(reward) / len(reward)
        
        mask = torch.Tensor([not d for d in done])
        next_state = torch.Tensor([next_state])
        reward = torch.Tensor([reward])
        
        memory.push(state, action, mask, next_state, reward)
        state = next_state
        t += 1
        if len(memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                transitions = memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = agent.update_parameters(batch)
                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                
                updates += 1
                
        if True in done:
            break
        
    episode_transitions = memory.memory[memory.position - t:memory.position]
    states = torch.cat([transition[0] for transition in episode_transitions], 0)
    unperturbed_actions = agent.select_action(states, None, None)
    perturbed_actions = torch.cat([transition[1] for transition in episode_transitions], 0)

    ddpq_dist = ddpg_distance_metric(perturbed_actions.numpy(), unperturbed_actions.numpy()) * 10
    param_noise.adapt(ddpq_dist)

    rewards.append(episode_reward)
    if i_episode % 10 == 0:
        env_info = env.reset(train_mode=True)[brain_name] 
        state = torch.Tensor([env_info.vector_observations])
        episode_reward = 0
        while True:
            action = agent.select_action(state).view(-1)
            env_info = env.step(action.numpy())[brain_name]
            next_state = env_info.vector_observations
            next_state = torch.Tensor([next_state])
            reward = env_info.rewards
            done = env_info.local_done
            episode_reward += sum(reward) / len(reward)
            state = next_state
            if True in done:
                break

        writer.add_scalar('reward/test', episode_reward, i_episode)
        rewards.append(episode_reward)
        print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))
    if np.mean(rewards[-100:]) > 30.0:
        agent.save_model("ddqn")
# env_info = env.reset(train_mode=True)[brain_name]
# writt

Episode: 0, total numsteps: 1001, reward: 0.0, average reward: 0.014999999664723873
Episode: 10, total numsteps: 11011, reward: 0.0, average reward: 0.6849999846890569
Episode: 20, total numsteps: 21021, reward: 1.1299999747425318, average reward: 1.257999971881509
Episode: 30, total numsteps: 31031, reward: 0.9199999794363976, average reward: 1.924999956972897
Episode: 40, total numsteps: 41041, reward: 1.9499999564141035, average reward: 2.1399999521672726


In [None]:
# env.close()