## DDPG algorithm

- randomly initialize the critic network `Q(s, a)` and the actor network `Mu(s)` with weights `W_Q` and `W_Mu`  
- initialize target networks `Q'(s, a)` and `Mu'(s)` with weights `W_Q' = W_Q` and `W_Mu' = W_Mu`  
- initialize replay buffer `R`

- for iteration 1 to M:  
  - initialize a random process for **action exploration**
  - receive initial observation state `s1`
  
  - for t = 1 to T:
      - select action `a_t = Mu(s_t) + N_t` according to the current policy and exploration noise
      - execute action `a_t`, then observe reward `r_t` and the new state `s_{t+1}`
      - store transition `(s_t, a_t, r_t, s_{t+1})` in `R`
      - sample a random minibatch of N transitions `(s_i, a_i, r_i, s_{i+1})` from `R`
      - set `y_i = r_i + gamma * Q'(s_{i+1}, Mu'(s_{i+1}))`
      - update the critic by minimizing the loss: `L = (1/N) * sum_i (y_i - Q(s_i, a_i))^2`
      - update the actor policy using the sampled policy gradient:  
      ![](policy_gradient.gif)  
      - update the target networks: `W_Q' = tau * W_Q + (1 - tau) * W_Q'` and `W_Mu' = tau * W_Mu + (1 - tau) * W_Mu'`
      
Source: Lillicrap et al., "Continuous control with deep reinforcement learning" (2015).

## Actor

In [None]:
import keras.backend as K
from keras.models import Model
from keras.layers import Dense, Input, Concatenate
from keras.optimizers import Adam
import tensorflow as tf
import numpy as np

class Actor:
    """Deterministic policy network ``Mu(s)`` with a slowly tracking target copy.

    Two Keras models with the same architecture are built: the main model,
    which is trained with the policy gradient supplied by the critic, and the
    target model, which is only moved towards the main model by
    ``train_target_model``.

    Args:
        tf_session: TensorFlow session used for every graph execution.
        state_size: Number of features in a state vector.
        action_size: Number of components in an action vector.
        hidden_layer_units: Width of each hidden layer, in order.
        learning_rate: Learning rate of the Adam optimizer.
        batch_size: Stored on the instance; not used by the graph itself.
        tau: Interpolation factor for the soft target update.
    """

    def __init__(self, tf_session, state_size, action_size, hidden_layer_units=(300, 600),
                 learning_rate=0.0001, batch_size=64, tau=0.001):

        self._tf_session = tf_session
        self._state_size = state_size      # dimensionality of a state vector
        self._action_size = action_size    # dimensionality of an action vector
        self._hidden_layer_units = hidden_layer_units
        self._batch_size = batch_size
        self._tau = tau

        K.set_session(self._tf_session)

        # Snapshot the variables that already exist so that only the ones
        # created below are initialized; a global initializer would also reset
        # the weights of any network that shares this session.
        preexisting = {v.name for v in tf.global_variables()}

        self._main_model = self._generate_model()
        self._target_model = self._generate_model()

        # dQ/da from the critic, one row per state in the batch.
        self._critic_gradients = tf.placeholder(shape=(None, action_size), dtype=tf.float32,
                                                name='critic_grad')

        # Chain rule: dQ/dW = dQ/da * dMu/dW. The sign is flipped because the
        # optimizer minimizes while the policy should maximize Q.
        trainable_weights = self._main_model.trainable_weights
        self._actor_gradients = tf.gradients(self._main_model.output,
                                             trainable_weights,
                                             -self._critic_gradients)

        # Pair every gradient with the exact variable it was computed for.
        self._optimize = tf.train.AdamOptimizer(learning_rate).apply_gradients(
            zip(self._actor_gradients, trainable_weights))

        created = [v for v in tf.global_variables() if v.name not in preexisting]
        self._tf_session.run(tf.variables_initializer(created))

        # The target network must start as an exact copy of the main network.
        self._target_model.set_weights(self._main_model.get_weights())

    def _generate_model(self):
        """Build one policy network mapping a state batch to an action batch.

        Returns:
            A Keras ``Model`` with a ``(state_size,)`` input and an
            ``(action_size,)`` sigmoid output.
        """
        input_ = Input(shape=(self._state_size, ), name='state')
        dense = input_
        for unit in self._hidden_layer_units:
            # Without a non-linearity the stacked layers collapse into a
            # single linear map; 'relu' matches the critic's hidden layers.
            dense = Dense(units=unit, activation='relu')(dense)
        output_layer = Dense(units=self._action_size, activation='sigmoid')(dense)

        return Model(inputs=input_, outputs=output_layer)

    def train_main_model(self, states, critic_gradients):
        """Apply one policy-gradient step to the main model.

        Args:
            states: Array of shape ``(batch, state_size)``.
            critic_gradients: Array of shape ``(batch, action_size)`` holding
                the critic's gradient of Q with respect to the actions.
        """
        # Feed the placeholder tensors themselves: bare layer names such as
        # 'state' do not identify a tensor in the graph.
        self._tf_session.run(self._optimize, feed_dict={
            self._main_model.input: states,
            self._critic_gradients: critic_gradients,
        })

    def train_target_model(self):
        """Soft-update the target weights: ``W' = tau * W + (1 - tau) * W'``."""
        weights_main = self._main_model.get_weights()
        weights_target = self._target_model.get_weights()
        new_target_weights = [self._tau * m + (1 - self._tau) * t
                              for m, t in zip(weights_main, weights_target)]
        self._target_model.set_weights(new_target_weights)

    def select_action(self, state):
        """Return the main model's action for a single state.

        Args:
            state: Array holding one state; it is flattened into one row.

        Returns:
            Array of shape ``(action_size,)``.
        """
        return self._main_model.predict(state.reshape(1, -1))[0]

In [None]:
# Smoke test: build a standalone actor with 5-dimensional states and actions.
sess = tf.Session()
actor = Actor(tf_session=sess, state_size=5, action_size=5)

Notes:
- `apply_gradients()`: applies gradients to variables. It is the second half of `minimize()` and returns an `Operation` that performs the update.

## Critic

In [None]:
class Critic:
    """Action-value network ``Q(s, a)`` with a slowly tracking target copy.

    Two Keras models with the same architecture are built: the main model,
    which is regressed onto the targets ``Y``, and the target model, which is
    only moved towards the main model by ``train_target_model``. All
    TensorFlow ops are created once in the constructor so that training does
    not grow the graph.

    Args:
        tf_session: TensorFlow session used for every graph execution.
        state_size: Number of features in a state vector.
        action_size: Number of components in an action vector.
        hidden_layer_units: Width of each hidden layer, in order.
        learning_rate: Learning rate of the Adam optimizer.
        batch_size: Stored on the instance; not used by the graph itself.
        tau: Interpolation factor for the soft target update.
    """

    def __init__(self, tf_session, state_size, action_size, hidden_layer_units=(300, 600),
                 learning_rate=0.0001, batch_size=64, tau=0.001):

        self._tf_session = tf_session
        self._state_size = state_size
        self._action_size = action_size
        self._hidden_layer_units = hidden_layer_units
        self._learning_rate = learning_rate
        self._batch_size = batch_size
        self._tau = tau

        K.set_session(self._tf_session)

        # Snapshot the variables that already exist so that only the ones
        # created below are initialized; a global initializer would also reset
        # the weights of any network that shares this session.
        preexisting = {v.name for v in tf.global_variables()}

        self._main_model, self._action_input = self._generate_model()
        self._target_model, _ = self._generate_model()
        self._state_input = self._main_model.inputs[0]

        # dQ/da, consumed by the actor's policy-gradient step.
        self._gradients_wrt_actions_op = tf.gradients(self._main_model.output,
                                                      self._action_input)

        # Regression targets y_i, one per transition in the batch.
        self._targets = tf.placeholder(shape=(None, 1), dtype=tf.float32, name='Y')
        self._loss = tf.losses.mean_squared_error(self._targets, self._main_model.output)
        self._optimize = tf.train.AdamOptimizer(self._learning_rate).minimize(
            self._loss, var_list=self._main_model.trainable_weights)

        created = [v for v in tf.global_variables() if v.name not in preexisting]
        self._tf_session.run(tf.variables_initializer(created))

        # The target network must start as an exact copy of the main network.
        self._target_model.set_weights(self._main_model.get_weights())

    def _generate_model(self):
        """Build one Q-network over the concatenated state and action.

        Returns:
            A ``(model, action_input)`` pair: the Keras ``Model`` and the
            action input tensor, which is needed to differentiate Q with
            respect to the action.
        """
        state_input = Input(shape=(self._state_size, ), name='state')
        action_input = Input(shape=(self._action_size, ), name='action')
        dense = Concatenate(axis=1)([state_input, action_input])
        for unit in self._hidden_layer_units:
            dense = Dense(units=unit, activation='relu')(dense)
        # Linear output: a 'relu' here would restrict Q to non-negative
        # values and stops learning once the unit saturates at zero.
        output = Dense(units=1, activation='linear')(dense)
        model = Model(inputs=[state_input, action_input], outputs=output)

        return model, action_input

    def compute_gradients_wrt_actions(self, states, actions):
        """Return dQ/da of the main model for a batch.

        Args:
            states: Array of shape ``(batch, state_size)``.
            actions: Array of shape ``(batch, action_size)``.

        Returns:
            Array of shape ``(batch, action_size)``.
        """
        feed = {self._state_input: states, self._action_input: actions}
        return self._tf_session.run(self._gradients_wrt_actions_op, feed_dict=feed)[0]

    def train_main_model(self, states, actions, Y):
        """Run one Adam step on the mean squared error between ``Q(s, a)`` and ``Y``.

        Args:
            states: Array of shape ``(batch, state_size)``.
            actions: Array of shape ``(batch, action_size)``.
            Y: Target values, one per transition; reshaped to ``(batch, 1)``.
        """
        feed = {
            self._state_input: states,
            self._action_input: actions,
            self._targets: np.reshape(Y, (-1, 1)),
        }
        self._tf_session.run(self._optimize, feed_dict=feed)

    def train_target_model(self):
        """Soft-update the target weights: ``W' = tau * W + (1 - tau) * W'``."""
        weights_main = self._main_model.get_weights()
        weights_target = self._target_model.get_weights()
        new_weights = [self._tau * m + (1 - self._tau) * t
                       for m, t in zip(weights_main, weights_target)]
        self._target_model.set_weights(new_weights)

    def get_next_state(self, state, action):
        """Return the main model's output for a single state-action pair.

        Despite its name, this evaluates the Q-network; it does not predict a
        successor state.

        Args:
            state: Array holding one state; it is flattened into one row.
            action: Array holding one action; it is flattened into one row.

        Returns:
            Array of shape ``(1,)`` with the estimated Q-value.
        """
        # A multi-input Keras model expects a list of arrays, not a zip object.
        return self._main_model.predict([state.reshape(1, -1), action.reshape(1, -1)])[0]

## Environment

In [None]:
class Environment:
    """Placeholder environment that produces random rewards and next states.

    Args:
        initial_state: State every episode starts from; stored unchanged.
    """

    def __init__(self, initial_state):
        self.initial_state = initial_state

    def get_reward_and_next_state(self, state, action):
        """Return a random reward and a random successor state.

        Args:
            state: Current state; only its shape is used.
            action: Action taken; ignored by this placeholder.

        Returns:
            A ``(reward, next_state)`` pair: ``reward`` is a float drawn
            uniformly from ``[0, 1)`` and ``next_state`` is a uniformly random
            array with the same shape as ``state``.
        """
        # The call parentheses matter: without them the reward would be the
        # function object instead of a number.
        reward = np.random.rand()
        next_state = np.random.rand(*state.shape)
        return reward, next_state

## DDPG

In [None]:
import random

class DDPG:
    """DDPG agent tying together an actor, a critic and a replay buffer.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of components in an action vector.
        actor_hidden_units: Hidden layer widths of the actor.
        actor_learning_rate: Learning rate of the actor.
        critic_hidden_units: Hidden layer widths of the critic.
        critic_learning_rate: Learning rate of the critic.
        batch_size: Forwarded to the actor and the critic.
        reward_discount: Currently unused.
        memory_size: Maximum number of transitions kept in the replay buffer.
        tau: Soft-update factor forwarded to the actor and the critic.
        T: Number of rows in the pre-generated exploration-noise table.
        minibatch_size: Number of transitions sampled per training step.
        gamma: Discount factor applied to the bootstrapped Q-value.

    NOTE(review): ``gamma`` defaults to 0.001, which looks like a copy of
    ``tau``, while ``reward_discount`` (0.99) is never read. The defaults are
    left as they are to keep the interface stable; pass ``gamma`` explicitly.
    """

    def __init__(self, state_size, action_size, actor_hidden_units=(300, 600), actor_learning_rate=0.0001,
                 critic_hidden_units=(300, 600), critic_learning_rate=0.0001, batch_size=64,
                 reward_discount=0.99, memory_size=100000, tau=0.001, T=1000, minibatch_size=64, gamma=0.001):

        self._action_size = action_size
        self._t = 0    # index of the next exploration-noise row
        self._T = T
        self._minibatch_size = minibatch_size
        self._gamma = gamma

        tf_session = tf.Session()

        self._actor = Actor(tf_session=tf_session,
                            state_size=state_size,
                            action_size=action_size,
                            hidden_layer_units=actor_hidden_units,
                            learning_rate=actor_learning_rate,
                            batch_size=batch_size,
                            tau=tau)

        self._critic = Critic(tf_session=tf_session,
                              state_size=state_size,
                              action_size=action_size,
                              hidden_layer_units=critic_hidden_units,
                              learning_rate=critic_learning_rate,
                              batch_size=batch_size,
                              tau=tau)

        self._R = []    # replay buffer, oldest transition first
        self._R_size = 0
        self._memory_size = memory_size

        self.random_actions = self._initialize_random_actions()

    def _initialize_random_actions(self):
        """Return a ``(T, action_size)`` table of uniform noise in ``[0, 1)``."""
        return np.random.rand(self._T, self._action_size)

    def select_action(self, state):
        """Return the actor's action for ``state`` plus exploration noise.

        Each call consumes the next row of the noise table and wraps around
        after ``T`` calls, so the index never runs past the table.
        """
        noise = self.random_actions[self._t]
        self._t = (self._t + 1) % self._T
        return self._actor.select_action(state) + noise

    def go_to_next_state(self, state, action):
        """Delegate to the critic's ``get_next_state`` for one state-action pair."""
        return self._critic.get_next_state(state, action)

    def store_transition(self, state, action, reward, state_next):
        """Append a transition and evict the oldest one when the buffer is full."""
        self._R.append((state, action, reward, state_next))
        self._R_size += 1
        if self._memory_size < self._R_size:
            self._R.pop(0)
            self._R_size -= 1

    def sample_minibatch_transition(self):
        """Sample transitions uniformly without replacement.

        At most ``minibatch_size`` transitions are drawn; while the buffer is
        still smaller than that, every stored transition is returned.

        Returns:
            An iterable of four tuples: states, actions, rewards, next states.

        Raises:
            ValueError: If the replay buffer is empty.
        """
        if not self._R:
            raise ValueError('cannot sample from an empty replay buffer')
        sample_size = min(self._minibatch_size, len(self._R))
        samples = random.sample(self._R, sample_size)
        return zip(*samples)

    def compute_Y(self, states, actions, rewards, state_nexts):
        """Compute the critic targets ``y_i = r_i + gamma * Q'(s_{i+1}, Mu'(s_{i+1}))``.

        Args:
            states: Sampled states; unused, kept for interface compatibility.
            actions: Sampled actions; unused, kept for interface compatibility.
            rewards: Sequence of sampled rewards.
            state_nexts: Sequence of sampled next states.

        Returns:
            Array with one target row per sampled transition.
        """
        rewards = np.vstack(rewards)
        state_nexts = np.vstack(state_nexts)

        # The targets use the target networks, evaluated on the whole batch.
        # Neither class exposes a batch prediction method for its target
        # model, so the models are queried directly.
        action_nexts = self._actor._target_model.predict(state_nexts)
        q_nexts = self._critic._target_model.predict([state_nexts, action_nexts])
        return rewards + self._gamma * q_nexts

    def train_critic(self, states, actions, Y):
        """Run one critic training step on a sampled minibatch."""
        self._critic.train_main_model(np.vstack(states), np.vstack(actions), Y)

    def compute_critic_gradients(self, states, actions):
        """Return the critic's gradient of Q with respect to the given actions."""
        return self._critic.compute_gradients_wrt_actions(np.vstack(states), np.vstack(actions))

    def train_actor(self, states, critic_gradients):
        """Run one actor training step using the critic's action gradients."""
        self._actor.train_main_model(np.vstack(states), critic_gradients)

    def update_target_models(self):
        """Soft-update the target networks of both the critic and the actor."""
        self._critic.train_target_model()
        self._actor.train_target_model()

In [None]:
# Training driver: M episodes of T steps each against the random environment.
M = 100
T = 100
env = Environment(np.random.rand(1, 3))
# T sizes the agent's exploration-noise table to match the episode length.
# gamma is passed explicitly because the class default of 0.001 would make
# the bootstrapped term of the critic target negligible.
agent = DDPG(state_size=3, action_size=5, T=T, gamma=0.99)

for _episode in range(M):
    state = env.initial_state
    for _step in range(T):
        action = agent.select_action(state)
        reward, next_state = env.get_reward_and_next_state(state, action)
        agent.store_transition(state, action, reward, next_state)

        states, actions, rewards, state_nexts = agent.sample_minibatch_transition()
        Y = agent.compute_Y(states, actions, rewards, state_nexts)
        agent.train_critic(states, actions, Y)
        critic_gradients = agent.compute_critic_gradients(states, actions)
        agent.train_actor(states, critic_gradients)
        agent.update_target_models()

        # Advance the episode; otherwise every step would replay the initial state.
        state = next_state