## DDPG algorithm

- randomly initialize the critic network `Q(s, a)` and the actor network `Mu(s)` with weights `W_Q` and `W_Mu`  
- initialize target networks `Q'(s, a)` and `Mu'(s)` with weights `W_Q' = W_Q` and `W_Mu' = W_Mu`  
- initialize replay buffer `R`

- for iteration 1 to M:  
  - initialize a random process for **action exploration**
  - receive initial observation state `s1`
  
  - for t 1 to T:
      - select action `a_t = Mu(s_t) + N_t` according to the current policy and exploration noise
      - execute action `a_t`, then observe reward `r_t` and new state `s_{t+1}`
      - store transition (s_t, a_t, r_t, s_{t+1}) in `R`
      - sample a random minibatch of N transitions (s_i, a_i, r_i, s_{i+1}) from `R`
      - set `y_i = r_i + gamma * Q'(s_{i+1}, Mu'(s_{i+1}))`
      - update critic by minimizing the loss: `L = (1/N) * sum_i (y_i - Q(s_i, a_i))^2`
      - update the actor policy using the sampled policy gradient:  
      ![](policy_gradient.gif)  
      - update the target networks: `W_Q' = tau * W_Q + (1 - tau) * W_Q'` and `W_Mu' = tau * W_Mu + (1 - tau) * W_Mu'`
      
Source: Lillicrap et al., "Continuous control with deep reinforcement learning" (2015).

## Actor

In [6]:
import keras.backend as K
from keras.models import Model
from keras.layers import Dense, Input, Concatenate
from keras.optimizers import Adam
import tensorflow as tf
import numpy as np

class Actor:
    """Deterministic policy network ``Mu(s)`` plus its soft-updated target copy.

    The main network is trained with the sampled policy gradient: the
    gradient of the critic's Q-value with respect to the action is fed in
    through a placeholder and back-propagated through the policy weights.
    The target network is never trained directly; it tracks the main
    network through ``train_target_model``.

    Args:
        tf_session: TensorFlow 1.x session shared with the critic.
        state_size: Size of one observation, as an int or a shape tuple.
        action_size: Size of one action, as an int or a shape tuple. The
            policy output is flat, i.e. ``prod(action_size)`` units.
        hidden_layer_units: Width of each hidden layer, in order.
        learning_rate: Adam learning rate for the main network.
        batch_size: Stored for reference; not used by the graph.
        tau: Soft-update rate of the target network.
    """

    def __init__(self, tf_session, state_size, action_size, hidden_layer_units=(300, 600),
                 learning_rate=0.0001, batch_size=64, tau=0.001):

        self._tf_session = tf_session
        self._state_size = state_size      # size/shape of one observation
        self._action_size = action_size    # size/shape of one action
        self._hidden_layer_units = hidden_layer_units
        self._batch_size = batch_size
        self._tau = tau

        K.set_session(self._tf_session)

        self._main_model = self._generate_model()
        self._target_model = self._generate_model()
        # Training must feed the MAIN network's input. Taking it from the
        # model avoids it being overwritten when the target network is built.
        self._state = self._main_model.input

        action_units = int(np.prod(self._as_shape(self._action_size)))
        # dQ/da supplied by the critic, one row per sample in the batch.
        self._critic_gradients = tf.placeholder(tf.float32, shape=(None, action_units))

        trainable = self._main_model.trainable_weights
        # The optimizer minimises, so negate dQ/da to ascend the Q-value.
        self._actor_gradients = tf.gradients(self._main_model.output,
                                             trainable,
                                             -self._critic_gradients)
        # Materialise the pairs: a bare zip() is exhausted after one pass.
        self._gradients = list(zip(self._actor_gradients, trainable))

        optimizer = tf.train.AdamOptimizer(learning_rate)
        self._optimize = optimizer.apply_gradients(self._gradients)

        # Initialise only the variables owned by this object; a global
        # initialiser would re-randomise networks created by other objects
        # sharing the session.
        own_variables = (self._main_model.weights
                         + self._target_model.weights
                         + optimizer.variables())
        self._tf_session.run(tf.variables_initializer(own_variables))

        # DDPG starts the target network as an exact copy of the main one.
        self._target_model.set_weights(self._main_model.get_weights())

    @staticmethod
    def _as_shape(size):
        """Return ``size`` as a shape tuple (a scalar ``n`` becomes ``(n,)``)."""
        if np.ndim(size) == 0:
            return (int(size),)
        return tuple(int(dim) for dim in size)

    def _generate_model(self):
        """Build one policy network mapping a state batch to an action batch."""
        state_input = Input(shape=self._as_shape(self._state_size))
        hidden = state_input
        for units in self._hidden_layer_units:
            # ReLU keeps stacked layers from collapsing into one linear map
            # and matches the hidden layers used by the critic.
            hidden = Dense(units=units, activation='relu')(hidden)
        action_units = int(np.prod(self._as_shape(self._action_size)))
        # Sigmoid bounds every action component to (0, 1).
        output_layer = Dense(units=action_units, activation='sigmoid')(hidden)

        return Model(inputs=state_input, outputs=output_layer)

    def train_main_model(self, states, critic_gradients):
        """Apply one policy-gradient step to the main network.

        Args:
            states: Batch of states, one row per sample.
            critic_gradients: dQ/da for the same batch, shaped
                ``(batch, action_units)``.
        """
        self._tf_session.run(self._optimize, feed_dict={
            self._state: states,
            self._critic_gradients: critic_gradients
        })

    def train_target_model(self):
        """Soft-update the target: ``W' = tau * W + (1 - tau) * W'``."""
        weights_main = self._main_model.get_weights()
        weights_target = self._target_model.get_weights()
        new_target_weights = [self._tau * m + (1 - self._tau) * t
                              for m, t in zip(weights_main, weights_target)]
        self._target_model.set_weights(new_target_weights)
    
    

Notes:
- `apply_gradients()`: Apply gradients to variables. This is the second part of minimize(). It returns an Operation that applies gradients.

## Critic

In [5]:
class Critic:
    """Action-value network ``Q(s, a)`` plus its soft-updated target copy.

    The main network is regressed onto the TD targets ``y_i`` with a
    mean-squared-error loss. All training ops are built once in
    ``__init__`` so that repeated training calls do not grow the graph.

    Args:
        tf_session: TensorFlow 1.x session shared with the actor.
        state_size: Size of one observation, as an int or a shape tuple.
        action_size: Size of one action, as an int or a shape tuple.
        hidden_layer_units: Width of each hidden layer, in order.
        learning_rate: Adam learning rate for the main network.
        batch_size: Accepted for signature parity with ``Actor``; unused.
        tau: Soft-update rate of the target network.
    """

    def __init__(self, tf_session, state_size, action_size, hidden_layer_units=(300, 600),
                 learning_rate=0.0001, batch_size=64, tau=0.001):

        self._tf_session = tf_session
        self._state_size = state_size
        self._action_size = action_size
        self._hidden_layer_units = hidden_layer_units
        self._learning_rate = learning_rate
        self._tau = tau

        K.set_session(self._tf_session)

        self._main_model = self._generate_model()
        self._target_model = self._generate_model()
        # Input tensors of the MAIN network, used as feed_dict keys.
        self._state, self._action = self._main_model.inputs

        # TD targets y_i, one row per sample, matching the (batch, 1) output.
        self.Y = tf.placeholder(tf.float32, shape=(None, 1), name='Y')
        self._loss = tf.losses.mean_squared_error(self.Y, self._main_model.output)

        optimizer = tf.train.AdamOptimizer(self._learning_rate)
        # Restrict the update to the main network; the target is only ever
        # changed through train_target_model().
        self._gradients = optimizer.compute_gradients(
            self._loss, var_list=self._main_model.trainable_weights)
        self._optimize = optimizer.apply_gradients(self._gradients)

        # dQ/da, the quantity the actor needs for its policy gradient.
        self._action_gradients = tf.gradients(self._main_model.output, self._action)[0]

        # Initialise only the variables owned by this object; a global
        # initialiser would re-randomise networks created by other objects
        # sharing the session.
        own_variables = (self._main_model.weights
                         + self._target_model.weights
                         + optimizer.variables())
        self._tf_session.run(tf.variables_initializer(own_variables))

        # DDPG starts the target network as an exact copy of the main one.
        self._target_model.set_weights(self._main_model.get_weights())

    @staticmethod
    def _as_shape(size):
        """Return ``size`` as a shape tuple (a scalar ``n`` becomes ``(n,)``)."""
        if np.ndim(size) == 0:
            return (int(size),)
        return tuple(int(dim) for dim in size)

    def _generate_model(self):
        """Build one Q-network taking ``[state, action]`` and returning ``(batch, 1)``."""
        state_input = Input(shape=self._as_shape(self._state_size), name='state')
        action_input = Input(shape=self._as_shape(self._action_size), name='action')
        # Concatenating on axis 1 assumes flat (batch, dim) inputs.
        hidden = Concatenate(axis=1)([state_input, action_input])
        for units in self._hidden_layer_units:
            hidden = Dense(units=units, activation='relu')(hidden)
        # Linear output: a Q-value may be negative, so it must not be clipped.
        output = Dense(units=1, activation='linear')(hidden)
        return Model(inputs=[state_input, action_input], outputs=output)

    def train_main_model(self, states, actions, Yi):
        """Run one optimisation step on the main network.

        Args:
            states: Batch of states.
            actions: Batch of actions taken in those states.
            Yi: TD targets for the batch; reshaped to ``(batch, 1)``.

        Returns:
            The mean-squared-error loss evaluated for this batch. Use
            ``action_gradients`` to obtain dQ/da for the actor update.
        """
        targets = np.asarray(Yi, dtype=np.float32).reshape(-1, 1)
        _, loss = self._tf_session.run(
            [self._optimize, self._loss],
            feed_dict={self._state: states, self._action: actions, self.Y: targets})
        return loss

    def action_gradients(self, states, actions):
        """Return dQ/da of the main network, shaped like ``actions``.

        For the DDPG actor update, ``actions`` should be the actor's own
        output for ``states``.
        """
        return self._tf_session.run(
            self._action_gradients,
            feed_dict={self._state: states, self._action: actions})

    def train_target_model(self):
        """Soft-update the target: ``W' = tau * W + (1 - tau) * W'``."""
        weights_main = self._main_model.get_weights()
        weights_target = self._target_model.get_weights()
        new_weights = [self._tau * m + (1 - self._tau) * t
                       for m, t in zip(weights_main, weights_target)]
        self._target_model.set_weights(new_weights)

## DDPG

In [7]:
import random

class DDPG:
    """DDPG agent: owns the actor, the critic and the replay buffer.

    Args:
        state_size: Size of one observation, as an int or a shape tuple.
        action_size: Size of one action, as an int or a shape tuple.
        actor_hidden_units: Hidden layer widths of the actor.
        actor_learning_rate: Learning rate of the actor.
        critic_hidden_units: Hidden layer widths of the critic.
        critic_learning_rate: Learning rate of the critic.
        batch_size: Forwarded to the actor and the critic.
        reward_discount: Discount factor applied to the bootstrapped
            Q-value when ``gamma`` is not given.
        memory_size: Maximum number of transitions kept in the buffer.
        tau: Soft-update rate of both target networks.
        T: Number of pre-drawn exploration noise vectors.
        sampling_size: Number of transitions drawn per minibatch.
        gamma: Optional explicit discount factor. When ``None`` (the
            default) ``reward_discount`` is used instead.
    """

    def __init__(self, state_size, action_size, actor_hidden_units=(300, 600), actor_learning_rate=0.0001,
                 critic_hidden_units=(300, 600), critic_learning_rate=0.0001, batch_size=64,
                 reward_discount=0.99, memory_size=100000, tau=0.001, T=1000, sampling_size=64, gamma=None):

        self._t = 0
        self._T = T
        self._sampling_size = sampling_size
        # `reward_discount` used to be ignored while `gamma` defaulted to
        # 0.001, which all but removed the bootstrapped term from y_i.
        # An explicitly passed `gamma` still wins.
        self._gamma = reward_discount if gamma is None else gamma
        # Kept so the exploration noise can be shaped like an action.
        self._action_size = action_size

        tf_session = tf.Session()

        self._actor = Actor(tf_session=tf_session,
                            state_size=state_size,
                            action_size=action_size,
                            hidden_layer_units=actor_hidden_units,
                            learning_rate=actor_learning_rate,
                            batch_size=batch_size,
                            tau=tau)

        self._critic = Critic(tf_session=tf_session,
                              state_size=state_size,
                              action_size=action_size,
                              hidden_layer_units=critic_hidden_units,
                              learning_rate=critic_learning_rate,
                              batch_size=batch_size,
                              tau=tau)

        self._R = []
        self._R_size = 0
        self._memory_size = memory_size

        self.random_actions = self._get_random_actions()

    def _get_random_actions(self):
        """Pre-draw ``T`` noise vectors, each shaped like one action.

        NOTE(review): the noise is uniform on [0, 1), so it is not
        zero-mean -- confirm this is the intended exploration process.
        """
        if np.ndim(self._action_size) == 0:
            action_shape = (int(self._action_size),)
        else:
            action_shape = tuple(self._action_size)
        return np.random.rand(*((self._T,) + action_shape))

    def get_action(self, state):
        """Return the policy action for ``state`` plus exploration noise.

        Each call consumes the next pre-drawn noise vector, wrapping
        around after ``T`` calls.
        """
        noise = self.random_actions[self._t]
        self._t = (self._t + 1) % self._T
        return self._actor._main_model.predict(state) + noise

    def remember(self, s_t, a_t, r_t, s_tplusone):
        """Store one transition, evicting the oldest when the buffer is full."""
        self._R.append((s_t, a_t, r_t, s_tplusone))
        self._R_size += 1
        if self._memory_size < self._R_size:
            self._R.pop(0)
            self._R_size -= 1

    def sample_transitions(self):
        """Draw a random minibatch from the replay buffer.

        Returns:
            A 4-tuple ``(states, actions, rewards, next_states)`` of
            equal-length tuples. At most ``sampling_size`` transitions are
            drawn; fewer when the buffer holds less. An empty buffer
            yields an empty tuple.
        """
        count = min(self._sampling_size, len(self._R))
        samples = random.sample(self._R, count)
        return tuple(zip(*samples))

    def compute_Yi(self, samples):
        """Compute TD targets ``y_i = r_i + gamma * Q'(s_{i+1}, Mu'(s_{i+1}))``.

        Args:
            samples: The 4-tuple returned by ``sample_transitions``.

        Returns:
            ``(s_i, a_i, y_i)`` as arrays, with ``y_i`` shaped ``(N, 1)``.
            Terminal transitions are not treated specially because the
            buffer does not record a done flag.
        """
        s_i, a_i, r_i, s_iplusone = samples
        s_i = np.array(s_i)
        a_i = np.array(a_i)
        # Column vector so it adds element-wise to the (N, 1) Q-values
        # instead of broadcasting to (N, N).
        r_i = np.array(r_i).reshape(-1, 1)
        s_iplusone = np.array(s_iplusone)

        a_i_pred = self._actor._target_model.predict(s_iplusone)
        # A two-input Keras model takes its inputs as ONE list argument.
        q_next = self._critic._target_model.predict([s_iplusone, a_i_pred])
        y_i = r_i + q_next * self._gamma
        return s_i, a_i, y_i

    def train_and_get_q(self, state, action, y_i):
        """Train the critic on one batch and return whatever it reports."""
        return self._critic.train_main_model(state, action, y_i)

    def update_main_actor(self, states, critic_gradients):
        """Apply one policy-gradient step to the actor's main network."""
        self._actor.train_main_model(states, critic_gradients)

    def update_target_models(self):
        """Soft-update the critic and actor target networks."""
        self._critic.train_target_model()
        self._actor.train_target_model()