# DDPG Reinforcement Learning

In [81]:
# Import the dependencies for the agent

from keras.layers import Dense, Input, Concatenate
from keras.models import Model
from keras.optimizers import adam_v2
try: from ...environment.custom_environment import StackedBarsEnv
except: from RL.environment.custom_environment import StackedBarsEnv
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


In this notebook, we will train a DDPG agent to play the custom environment [StackedBarsEnv](nothing) using the [DDPG algorithm](https://arxiv.org/abs/1509.02971).

In [82]:
env = StackedBarsEnv()

num_states = env.observation_space.shape[0]

num_actions = env.action_space.shape[0]

upper_bound = env.action_space.high
lower_bound = env.action_space.low

print('The size of state space is: {}'.format(num_states))
print('The size of action space is: {}'.format(num_actions))
print('The max value of the action is: {}'.format(upper_bound))
print('The min value of the action is: {}'.format(lower_bound))


The size of state space is: 4
The size of action space is: 2
The max value of the action is: [0.1 0.1]
The min value of the action is: [0.01 0.01]


In order for making the actor a better explorer we are going to implement the **Ornstein-Uhlenbeck process** for generating noise.

In [83]:
class OUActionNoise(object):
    def __init__(self, mean, std_deviation, theta=.15, dt=1e-3, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_deviation = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()
    
    def __call__(self):
        # Formula taken from https://www.doc.ic.ac.uk/~erik/
        x = (
            self.x_prev +
            (self.theta * (self.mean - self.x_prev) * self.dt) +
            (self.std_deviation * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        )

        self.x_prev = x
        return x
    
    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

Now we will implement the DDPG algorithm. That is described in the following image:

![DPPG process](DDPG_ALG.png)))


In [84]:
class Buffer:
    def __init__(self, buffer_capacity=50000, batch_size=64):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size

        # Its tells us num of times record() was called.
        self.buffer_counter = 0

        # Instead of list of tuples as the exp.replay concept go
        # We use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    # Takes (s,a,r,s') obervation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity
        
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1][0]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
    # TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        # Training and updating Actor & Critic networks.
        # See Pseudo Code.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Get sampling range
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)


# This update target parameters slowly
# Based on rate `tau`, which is much less than one.
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

## Define the actor and critic networks


In [85]:

def get_actor():
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-0.001, maxval=0.001)

    inputs = Input(shape=(num_states,))
    # Add a hidden layer with relu activation
    x = Dense(units=64, activation="relu", kernel_initializer=last_init)(inputs)
    # Add a hidden layer with relu activation
    out = Dense(256, activation="relu")(x)
    out = Dense(256, activation="relu")(out)
    outputs = Dense(2, activation="tanh", kernel_initializer=last_init)(out)
    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = Model(inputs, outputs)
    return model


def get_critic():
    # State as input
    state_input = Input(shape=(num_states))
    state_out = Dense(32, activation="relu")(state_input)
    state_out = Dense(32, activation="relu")(state_out)
    state_out = Dense(64, activation="relu")(state_out)
    # add a hidden layer with relu activation
    
    # Action as input
    action_input = Input(shape=(num_actions))
    action_out = Dense(64, activation="relu")(action_input)

    # Both are passed through seperate layer before concatenating
    concat = Concatenate()([state_out, action_out])

    out = Dense(256, activation="relu")(concat)
    out = Dense(256, activation="relu")(out)
    outputs = Dense(1)(out)

    # Outputs single value for give state-action
    model = Model([state_input, action_input], outputs)

    return model


# Policy 


In [86]:
def policy(state, noise_object):
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_object()
    # Adding noise to action
    sampled_actions = sampled_actions.numpy() + noise

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)

    
    return [np.squeeze(legal_action)]




# Training hyperparameters

In [87]:
std_dev = 0.3
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

actor_model = get_actor()
critic_model = get_critic()

target_actor = get_actor()
target_critic = get_critic()

# Making the weights equal initially
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Learning rate for actor-critic models
critic_lr = 0.05
actor_lr = 0.004

critic_optimizer = adam_v2.Adam(learning_rate=critic_lr)
actor_optimizer = adam_v2.Adam(learning_rate=actor_lr)

total_episodes =  6000
# Discount factor for future rewards
gamma = 0.999
# Used to update target networks
tau = 0.05

SHOW_EVERY = 1000

buffer = Buffer(50000, 256)



In [88]:
## Render test
env.reset()
linspace_r = np.linspace(0.01,0.5, 100)
# invert the linspace
linspace_h = np.ones(100) * 0.1

for i in range(10):
    env.step(np.array([linspace_r[i], linspace_h[i]]))
env.render(mode='human',n_episode=6666)
## Render test

In [89]:
# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Takes about 4 min to train
for ep in range(total_episodes):

    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Uncomment this to see the Actor in action
        # But not in a python notebook.

        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        action = policy(tf_prev_state, ou_noise)
        # Recieve state and reward from environment.
        state, reward, done, info = env.step(action[0])

        print(action)
        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break
        

        prev_state = state

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    
    avg_reward = np.mean(ep_reward_list[-50:])
    if ep % SHOW_EVERY == 0:
        env.render(mode='human',n_episode=ep)
        print("Episode * {} * Avg Reward is ==> {}".format(ep, round(avg_reward,5)))
    avg_reward_list.append(avg_reward)


# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.title('Agent learning throughout the episodes')
plt.xlabel("Episode")
plt.ylabel("Avg. Episode Reward")
plt.show()
# save graph
plt.savefig('graph.png')
# save model
actor_model.save_weights("polebuilder_actor.h5")
critic_model.save_weights("polebuilder_critic.h5")

target_actor.save_weights("polebuilder_target_actor.h5")
target_critic.save_weights("polebuilder_target_critic.h5")

[array([0.01, 0.01])]
The observation tuple shape is: <class 'list'>


ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 1.