In [22]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from collections import deque
import random
from IPython.display import clear_output
import matplotlib.pyplot as plt


physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Deep Deterministic Policy Gradient

DDPG is an Actor-Critic RL algorithm. It has a policy approximation model and a value approximation model. Its policy network produces continuous actions, and combines them with a DQN-like q-value estimator.

DDPG uses 2 main models. The Actor network learns the policy and the Critic network learns the q-value. The Actor network receives the state as input and outputs an action vector, corresponding to the continuous action space (e.g. joint velocities), thus it produces a deterministic policy. The Critic receives the output of the Actor, combines it with the state, and approximates a q-value. 

Since the learned the policy is deterministic, we need to implement some kind of exploration during the training process. To make DDPG policies explore better, a noise to their actions. Originally an Ornstein-Uhlenbeck (https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.) noise was proposed, however it is often used with normal-distributed noise or parameter noise, directly applied to the network parameters.

## Agent

Similarly to the PolicyGradient agent, the DDPG agent also has "frozen" models. Both the Actor and the Critic network have their target Actor and target Critic counterparts.

In case of the Critic, the learning step is almost identical to the learning step of the DQN. The only difference is, that here, the actions are directly fed into the network, as input, and the output is the corresponding q-value, instead of having multiple q-values for each possible action.

The interesting part is the training of the Actor. Since the goal is to maximize the q-value with the policy, and we already have a differentiable q-value approximator, the solution is sort of obvious. We just feed the output of the Actor $\pi_\phi$ into our Critic $c_\theta$, along with the state, negate the result and use it as loss.

$$
loss_{actor} = -c_\Theta (s_t, \pi_\Phi (s_t))
$$

If the Critic is doing its job well, and the Actor minimizes the negated output of the Critic, then it should maximize a well approximated q-value, thus also approaching a well working policy.

## Replay buffer

Similar to the case of the DQN, a replay buffer is used to store gathered experience, and later sample from it during the training phase. In this case however, we adjust the samplint method to faver resent experiences over older experiences.

In [23]:
class Actor(tf.keras.layers.Layer):
    def __init__(self, units=(400, 300), n_actions=2, **kwargs):
        super(Actor, self).__init__(**kwargs)
        self.layers = []
        for i, u in enumerate(units):
            self.layers.append(tf.keras.layers.Dense(u, activation=tf.nn.leaky_relu,
                                                     kernel_initializer=tf.keras.initializers.glorot_normal()))
        last_init = tf.random_normal_initializer(stddev=0.0005)
        self.layers.append(tf.keras.layers.Dense(n_actions, activation='tanh', kernel_initializer=last_init))

    def call(self, inputs, **kwargs):
        outputs = inputs
        for l in self.layers:
            outputs = l(outputs)
        return outputs
    
class Critic(tf.keras.layers.Layer):
    def __init__(self, state_units=(400, 300), action_units=(300,), units=(150,), **kwargs):
        super(Critic, self).__init__(**kwargs)
        self.layers_state = []
        for u in state_units:
            self.layers_state.append(tf.keras.layers.Dense(u, activation=tf.nn.leaky_relu,
                                                           kernel_initializer=tf.keras.initializers.glorot_normal()))

        self.layers_action = []
        for u in action_units:
            self.layers_action.append(tf.keras.layers.Dense(u, activation=tf.nn.leaky_relu,
                                                            kernel_initializer=tf.keras.initializers.glorot_normal()))

        self.layers = []
        for u in units:
            self.layers.append(tf.keras.layers.Dense(u, activation=tf.nn.leaky_relu,
                                                     kernel_initializer=tf.keras.initializers.glorot_normal()))
        last_init = tf.random_normal_initializer(stddev=0.00005)
        self.layers.append(tf.keras.layers.Dense(1, kernel_initializer=last_init))

        self.add = tf.keras.layers.Add()

    def call(self, inputs, **kwargs):
        p_action = inputs['action']
        p_state = inputs['state']

        for l in self.layers_action:
            p_action = l(p_action)

        for l in self.layers_state:
            p_state = l(p_state)

        outputs = self.add([p_state, p_action])
        for l in self.layers:
            outputs = l(outputs)

        return outputs
    
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
                self.x_prev
                + self.theta * (self.mean - self.x_prev) * self.dt
                + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)
            
class DDPGAgent:
    def __init__(self, action_space, observation_shape, gamma=0.99, tau=0.001, epsilon=0.05):
        self.action_space = action_space
        self.tau = tau  # target network weight adaptation
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon

        self.actor = Actor(n_actions=action_space.shape[0])
        self.critic = Critic()

        self.target_actor = Actor(n_actions=action_space.shape[0])
        self.target_critic = Critic()

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

        self.noise = OUActionNoise(mean=np.zeros(np.array(self.action_space.sample()).shape),
                                   std_deviation=float(0.2) * np.ones(1))

        self._init_networks(observation_shape)

    def _init_networks(self, observation_shape):
        initial_state = np.zeros([1, observation_shape])

        initial_action = self.actor(initial_state)
        self.target_actor(initial_state)

        critic_input = {'action': initial_action, 'state': initial_state}
        self.critic(critic_input)
        self.target_critic(critic_input)

        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @staticmethod
    def update_target(model_target, model_ref, tau=0.0):
        new_weights = [tau * ref_weight + (1 - tau) * target_weight for (target_weight, ref_weight) in
                       list(zip(model_target.get_weights(), model_ref.get_weights()))]
        model_target.set_weights(new_weights)

    def act(self, observation, explore=True, random_action=False):
        if random_action or np.random.uniform(0, 1) < self.epsilon:
            a = self.action_space.sample()
        else:
            a = self.actor(observation).numpy()[:, 0]
            if explore:
                a += self.noise()
        a = np.clip(a, self.action_space.low, self.action_space.high)
        return a

    def compute_target_q(self, rewards, next_states, dones):
        actions = self.target_actor(next_states)
        critic_input = {'action': actions, 'state': next_states}
        next_q = self.target_critic(critic_input)
        target_q = rewards + (1 - dones) * next_q * self.gamma
        return target_q

    def get_actor_grads(self, states):
        with tf.GradientTape() as tape:
            actions = self.actor(states)
            critic_input = {'action': actions, 'state': states}
            qs = self.critic(critic_input)
            loss = -tf.math.reduce_mean(qs)
        gradients = tape.gradient(loss, self.actor.trainable_variables)
        gradients = [tf.clip_by_value(grad, -1.0, 1.0) for grad in gradients]
        return gradients, loss

    def get_critic_grads(self, states, actions, target_qs):
        with tf.GradientTape() as tape:
            critic_input = {'action': actions, 'state': states}
            qs = self.critic(critic_input)
            loss = tf.reduce_mean(tf.abs(target_qs - qs))
        gradients = tape.gradient(loss, self.critic.trainable_variables)
        gradients = [tf.clip_by_value(grad, -1.0, 1.0) for grad in gradients]
        return gradients, loss

    def learn(self, states, actions, rewards, next_states, dones):
        target_qs = self.compute_target_q(rewards, next_states, dones)

        actor_grads, actor_loss = self.get_actor_grads(states)
        critic_grads, critic_loss = self.get_critic_grads(states, actions, target_qs)

        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
        self.target_update()
        return actor_loss, critic_loss

    def target_update(self):
        DDPGAgent.update_target(self.target_critic, self.critic, self.tau)
        DDPGAgent.update_target(self.target_actor, self.actor, self.tau)

class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)
        self.p_indices = [0.5 / 2]

    def put(self, state, action, reward, next_state, done):
        self.buffer.append([state, action, np.expand_dims(reward, -1), next_state, np.expand_dims(done, -1)])

    def sample(self, batch_size=1, unbalance=0.8):
        p_indices = None
        if random.random() < unbalance:
            self.p_indices.extend((np.arange(len(self.buffer) - len(self.p_indices)) + 1)
                                  * 0.5 + self.p_indices[-1])
            p_indices = self.p_indices / np.sum(self.p_indices)
        sample_idx = np.random.choice(len(self.buffer),
                                      size=min(batch_size, len(self.buffer)),
                                      replace=False,
                                      p=p_indices)
        sample = [self.buffer[s_i] for s_i in sample_idx]
        states, actions, rewards, next_states, dones = map(np.array, zip(*sample))
        return states, actions, rewards, next_states, dones

    def size(self):
        return len(self.buffer)

## Training

The training loop is similar to the other training loops we saw until now. Except, for periodically updating and adapting the parameter noise, and we let the agent collect some initial experience in the beginning.

In [24]:
def compute_avg_return(env, agent, num_episodes=1, max_steps=200, render=False):
    total_return = 0.0
    for _ in range(num_episodes):
        obs, _ = env.reset()
        episode_return = 0.0
        done = False
        steps = 0
        while not (done or steps > max_steps):
            if render:
                clear_output(wait=True)
                plt.axis('off')
                plt.imshow(env.render())
                plt.show()
            action = agent.act(np.array([obs]))
            obs, r, done, _, _ = env.step(action)
            episode_return += r
            steps += 1
        total_return += episode_return
    return total_return / num_episodes

In [25]:
replay_buffer = ReplayBuffer()

env = gym.make('Pendulum-v1', render_mode='rgb_array')

agent = DDPGAgent(env.action_space, env.observation_space.shape[0])
for i in range(1001):
    obs, _ = env.reset()
    # gather experience
    agent.noise.reset()
    ep_actor_loss = 0
    ep_critic_loss = 0
    steps = 0
    for j in range(200):
        steps += 1
        env.render()
        action = agent.act(np.array([obs]), random_action=(i < 1))
        # execute action
        new_obs, r, done, _, _ = env.step(action)
        replay_buffer.put(obs, action, r, new_obs, done)
        obs = new_obs
        if done:
            break
            
    # Learn from the experiences in the replay buffer.
    for _ in range(128):
        s_states, s_actions, s_rewards, s_next_states, s_dones = replay_buffer.sample(64)
        actor_l, critic_l = agent.learn(s_states, s_actions, s_rewards, s_next_states, s_dones)
        ep_actor_loss += actor_l
        ep_critic_loss += critic_l
        
    if i % 25 == 0:
        avg_return = compute_avg_return(env, agent, num_episodes=2, render=False)
        print(
            f'epoch {i}, actor loss {ep_actor_loss / steps}, critic loss {ep_critic_loss / steps} , avg return {avg_return}')

epoch 0, actor loss 7.015113830566406, critic loss 0.44769567251205444 , avg return -1115.7896571918664
epoch 25, actor loss 21.003501892089844, critic loss 0.1957494020462036 , avg return -1170.2190757357748
epoch 50, actor loss 44.12150192260742, critic loss 0.310014933347702 , avg return -1237.6062782935878
epoch 75, actor loss 53.26805877685547, critic loss 0.46954214572906494 , avg return -1312.3806947671394
epoch 100, actor loss 52.87529754638672, critic loss 0.46217361092567444 , avg return -1145.0513225605082
epoch 125, actor loss 43.268592834472656, critic loss 0.41431134939193726 , avg return -443.8112816932462
epoch 150, actor loss 42.63667297363281, critic loss 0.4188898503780365 , avg return -615.337327283951
epoch 175, actor loss 26.33926773071289, critic loss 0.43491464853286743 , avg return -243.64586758290628


KeyboardInterrupt: 

In [None]:
compute_avg_return(env, agent, num_episodes=10, render=True)
env.close()