In [1]:
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda,concatenate

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import gym
from collections import deque
import random

In [2]:
class Actor(object):
    """
    Actor network for DDPG: a deterministic policy mapping states to actions.

    Keeps an online network (``model``) and a target network
    (``target_model``) built with the same architecture. The final tanh
    output is multiplied by ``action_bound`` so actions lie in
    [-action_bound, action_bound].

    Args:
        state_dim: size of the state vector fed to the network.
        action_dim: number of action components produced.
        action_bound: scale applied to the tanh output.
        tau: default mixing factor used by ``update_target_network``.
        learning_rate: step size of the Adam optimizer used in ``train``.
    """
    def __init__(self, state_dim, action_dim, action_bound, tau, learning_rate):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.tau = tau
        self.learning_rate = learning_rate

        self.model = self.build_network()
        self.target_model = self.build_network()

        self.actor_optimizer = tf.keras.optimizers.Adam(self.learning_rate)

    ## actor network
    def build_network(self):
        """Build the state -> action network and print its summary."""
        state_input = Input((self.state_dim,))
        h1 = Dense(64, activation='relu')(state_input)
        h2 = Dense(32, activation='relu')(h1)
        h3 = Dense(16, activation='relu')(h2)
        out = Dense(self.action_dim, activation='tanh')(h3)

        # Scale output to [-action_bound, action_bound]
        action_output = Lambda(lambda x: x*self.action_bound)(out)
        model = Model(state_input, action_output)
        model.summary()
        return model

    ## actor prediction
    def predict(self, state):
        """
        Return the online policy's action for ONE state.

        The state is reshaped to a batch of one and the batch axis is
        dropped again, so the result is a numpy array of shape
        (action_dim,).
        """
        batch = np.reshape(state, [1, self.state_dim]).astype(np.float32)
        # Call the model directly: Model.predict is built for large batched
        # inputs and is slow when invoked once per environment step.
        return self.model(batch, training=False).numpy()[0]

    ## target actor prediction
    def target_predict(self, state):
        """Return the target policy's actions for a batch of states as a numpy array."""
        batch = np.asarray(state, dtype=np.float32)
        return self.target_model(batch, training=False).numpy()

    ## transfer actor weights to target actor with rate tau
    def update_target_network(self, tau=None):
        """
        Blend the online weights into the target network:
        target <- tau * online + (1 - tau) * target.

        Args:
            tau: mixing factor; defaults to ``self.tau``. ``tau=1.0`` makes
                the target an exact copy of the online network.
        """
        tau = self.tau if tau is None else tau
        theta, target_theta = self.model.get_weights(), self.target_model.get_weights()
        blended = [tau * w + (1 - tau) * w_t for w, w_t in zip(theta, target_theta)]
        self.target_model.set_weights(blended)

    ## train the actor network
    def train(self, states, dq_das):
        """
        Apply one policy-gradient step.

        Args:
            states: batch of states fed to the online network.
            dq_das: gradient of the critic's Q-value with respect to the
                actions, one row per state.

        The parameter gradient is obtained by back-propagating ``-dq_das``
        through the network output; the sign is flipped because the
        optimizer minimizes while the policy should increase Q.
        """
        with tf.GradientTape() as tape:
            # only the forward pass needs to be recorded
            actions = self.model(states)
        upstream = -tf.cast(dq_das, actions.dtype)
        self.dj_dtheta = tape.gradient(actions, self.model.trainable_variables,
                                       output_gradients=upstream)
        grads = zip(self.dj_dtheta, self.model.trainable_variables)
        self.actor_optimizer.apply_gradients(grads)

In [3]:
class Critic(object):
    """
    Critic network for DDPG: Q-function approximator Q(state, action).

    Keeps an online network (``model``) and a target network
    (``target_model``) with the same architecture; both are compiled with
    Adam and a mean-squared-error loss.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        tau: default mixing factor used by ``update_target_network``.
        learning_rate: Adam step size.
    """
    def __init__(self, state_dim, action_dim, tau, learning_rate):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.tau = tau
        self.learning_rate = learning_rate

        # create critic and target critic network
        self.model = self.build_network()
        self.target_model = self.build_network()

        self.model.compile(optimizer=Adam(self.learning_rate), loss='mse')
        self.target_model.compile(optimizer=Adam(self.learning_rate), loss='mse')

    ## critic network
    def build_network(self):
        """Build the two-input (state, action) -> Q network and print its summary."""
        state_input = Input((self.state_dim,))
        action_input = Input((self.action_dim,))
        # state branch: 64 relu units, then a linear 32-unit embedding
        x1 = Dense(64, activation='relu')(state_input)
        x2 = Dense(32, activation='linear')(x1)
        # action branch: linear 32-unit embedding
        a1 = Dense(32, activation='linear')(action_input)
        # both embeddings are joined before the shared head
        h2 = concatenate([x2, a1], axis=-1)
        h3 = Dense(16, activation='relu')(h2)
        q_output = Dense(1, activation='linear')(h3)
        model = Model([state_input, action_input], q_output)
        model.summary()
        return model

    ## q-value prediction of target critic
    def target_predict(self, inp):
        """
        Return the target critic's Q-values as a numpy array.

        Args:
            inp: sequence ``[states, actions]`` of equally long batches.
        """
        batch = [np.asarray(x, dtype=np.float32) for x in inp]
        # Call the model directly: Model.predict is built for large batched
        # inputs and is slow when invoked once per training step.
        return self.target_model(batch, training=False).numpy()

    ## transfer critic weights to target critic with rate tau
    def update_target_network(self, tau=None):
        """
        Blend the online weights into the target network:
        target <- tau * online + (1 - tau) * target.

        Args:
            tau: mixing factor; defaults to ``self.tau``. ``tau=1.0`` makes
                the target an exact copy of the online network.
        """
        tau = self.tau if tau is None else tau
        phi = self.model.get_weights()
        target_phi = self.target_model.get_weights()
        blended = [tau * w + (1 - tau) * w_t for w, w_t in zip(phi, target_phi)]
        self.target_model.set_weights(blended)

    ## gradient of q-values wrt actions
    def dq_da(self, states, actions):
        """Return dQ/da of the online critic, evaluated at (states, actions)."""
        a = tf.convert_to_tensor(actions)
        with tf.GradientTape() as tape:
            # `a` is a plain tensor, not a variable, so it must be watched explicitly
            tape.watch(a)
            q = self.model([states, a])
            q = tf.squeeze(q)
        q_grads = tape.gradient(q, a)
        return q_grads

    ## single gradient update on a single batch data
    def train_on_batch(self, states, actions, td_targets):
        """Fit the online critic to ``td_targets`` with one gradient step and return the loss."""
        return self.model.train_on_batch([states, actions], td_targets)


In [4]:
class ReplayBuffer(object):
    """
    Fixed-capacity FIFO replay buffer of
    (state, action, reward, next_state, done) transitions.

    Attributes:
        buffer_size: maximum number of transitions kept (the capacity).
        buffer: deque holding the stored transitions, oldest first.
        count: number of transitions currently stored.

    The current occupancy is available through ``size()``, ``len(buffer)``
    or the ``count`` attribute. It cannot be exposed as a method called
    ``buffer_size`` because the capacity attribute of that name would
    shadow it on every instance.
    """
    def __init__(self, buffer_size):
        if buffer_size <= 0:
            raise ValueError("buffer_size must be positive, got %r" % (buffer_size,))
        self.buffer_size = buffer_size
        self.buffer = deque()
        self.count = 0

    ## save to buffer
    def add_buffer(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest one when the buffer is full."""
        transition = (state, action, reward, next_state, done)

        if self.count >= self.buffer_size:
            # full: drop the oldest transition, occupancy stays at capacity
            self.buffer.popleft()
        else:
            self.count += 1
        self.buffer.append(transition)

    ## sample a batch
    def sample_batch(self, batch_size):
        """
        Draw transitions uniformly at random without replacement.

        At most ``batch_size`` transitions are returned; fewer when the
        buffer holds fewer than that.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of numpy
            arrays whose first axis indexes the sampled transitions.
        """
        batch = random.sample(self.buffer, min(batch_size, self.count))
        states = np.asarray([t[0] for t in batch])
        actions = np.asarray([t[1] for t in batch])
        rewards = np.asarray([t[2] for t in batch])
        next_states = np.asarray([t[3] for t in batch])
        dones = np.asarray([t[4] for t in batch])
        return states, actions, rewards, next_states, dones

    ## Current buffer occupation
    def size(self):
        """Return the number of transitions currently stored."""
        return self.count

    def __len__(self):
        return self.count

    ## Clear buffer
    def clear_buffer(self):
        """Remove every stored transition; the capacity is unchanged."""
        self.buffer = deque()
        self.count = 0

In [5]:
class DDPGagent(object):
    """
    DDPG agent: actor, critic, replay buffer and the training loop.

    Args:
        env: gym-style environment with a continuous action space. It is
            used through ``observation_space.shape``, ``action_space.shape``,
            ``action_space.high``, ``reset()`` and a ``step()`` that returns
            ``(next_state, reward, done, info)``.
    """

    def __init__(self, env):

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001
        # learning starts once the buffer holds more than this many transitions
        self.TRAIN_START = 1000

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (the first component's upper limit is used for all components)
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.state_dim,
                           self.action_dim, self.action_bound, self.TAU, self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.state_dim, self.action_dim, self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # total (unscaled) reward of every finished episode
        self.save_epi_reward = []

    ## Ornstein Uhlenbeck Noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        """
        Advance Ornstein-Uhlenbeck noise by one step from the previous value ``x``.

        Computes x + rho*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1), drawing
        ``dim`` independent normal samples.
        """
        return x + rho*(mu - x)*dt + sigma*np.sqrt(dt)*np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma*Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        """
        Compute TD targets: ``r`` where the transition is done, otherwise
        ``r + GAMMA * q``.

        Args:
            rewards: one reward per transition.
            q_values: target-critic values, first axis indexing transitions.
            dones: one termination flag per transition.

        Returns:
            A new array with the shape and dtype of ``q_values``; the
            arguments are left unmodified.
        """
        q = np.asarray(q_values)
        # reshape per-transition vectors so they broadcast along q's trailing axes
        column = (q.shape[0],) + (1,) * (q.ndim - 1)
        r = np.asarray(rewards).reshape(column)
        terminal = np.asarray(dones, dtype=bool).reshape(column)
        y_k = np.where(terminal, r, r + self.GAMMA * q)
        return y_k.astype(q.dtype)

    ## one learning step from replayed transitions
    def _learn_step(self):
        """Sample a batch and update critic, actor and both target networks once."""
        # sample transitions from replay buffer
        states, actions, rewards, next_states, dones = self.buffer.sample_batch(self.BATCH_SIZE)
        # predict target Q-values
        target_qs = self.critic.target_predict([next_states, self.actor.target_predict(next_states)])
        # compute TD targets
        y_i = self.td_target(rewards, target_qs, dones)
        # train critic using sampled batch
        self.critic.train_on_batch(states, actions, y_i)
        # Q gradient wrt current policy
        # caution: use the raw model here, NOT self.actor.predict, which
        # reshapes its input to a single state and drops the batch axis
        s_actions = self.actor.model.predict(states)
        s_grads = self.critic.dq_da(states, s_actions)
        dq_das = np.array(s_grads).reshape((-1, self.action_dim))
        # train actor
        self.actor.train(states, dq_das)
        # update both target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    ## train the agent
    def train(self, max_episode_num):
        """
        Run ``max_episode_num`` episodes, learning from replay once the
        buffer holds more than ``TRAIN_START`` transitions.

        Each episode's total (unscaled) reward is appended to
        ``save_epi_reward`` and printed.
        """
        # Start the targets as exact copies of the online networks. A soft
        # update here would move them only a fraction TAU toward the online
        # weights and leave them at their own random initialization.
        self.actor.target_model.set_weights(self.actor.model.get_weights())
        self.critic.target_model.set_weights(self.critic.model.get_weights())

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            step, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()
            while not done:
                # pick an action: shape = (action_dim,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip continuous action to be within action_bound
                action = np.clip(action + noise, -self.action_bound, self.action_bound)
                # observe reward, new_state
                next_state, reward, done, _ = self.env.step(action)
                # rescale the reward before storing it; presumably tuned to the
                # reward range of the pendulum task -- TODO confirm
                # NOTE(review): if `done` is only ever set by an episode time
                # limit, storing it as terminal cuts off bootstrapping on
                # states that are not truly terminal -- TODO confirm
                train_reward = (reward + 8) / 8
                self.buffer.add_buffer(state, action, train_reward, next_state, done)

                # Compare the number of STORED transitions: the buffer's
                # `buffer_size` attribute is its capacity and would make this
                # check pass from the very first step.
                if self.buffer.count > self.TRAIN_START:
                    self._learn_step()

                # update current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                step += 1

            ## display rewards every episode
            print('Episode: ', ep+1, 'Time: ', step, 'Reward: ', episode_reward)

            self.save_epi_reward.append(episode_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards collected by train()
    def plot_result(self):
        """Plot total reward per episode."""
        plt.plot(self.save_epi_reward)
        plt.xlabel('Episode')
        plt.ylabel('Total reward')
        plt.show()

In [None]:
max_episode_num = 200
env = gym.make("Pendulum-v0")

# Seed every random source the run draws from so that a fresh
# Restart & Run All produces comparable learning curves.
SEED = 0
random.seed(SEED)         # replay-buffer sampling uses the `random` module
np.random.seed(SEED)      # exploration noise uses np.random
tf.random.set_seed(SEED)  # Keras weight initialization
env.seed(SEED)            # the environment's own random generator

agent = DDPGagent(env)

agent.train(max_episode_num)

agent.plot_result()



Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
dense (Dense)                (None, 64)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
_________________________________________________________________
lambda (Lambda)              (None, 1)                 0         
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
___________________________________________________________

Episode:  5 Time:  200 Reward:  -1587.1932377853261
Episode:  6 Time:  200 Reward:  -1626.4720190967291
Episode:  7 Time:  200 Reward:  -1584.0178315900923
Episode:  8 Time:  200 Reward:  -1582.8789971801123
Episode:  9 Time:  200 Reward:  -1539.4173548007973
Episode:  10 Time:  200 Reward:  -1521.884476283167
Episode:  11 Time:  200 Reward:  -1502.2266541680017
Episode:  12 Time:  200 Reward:  -1576.1319397421587
Episode:  13 Time:  200 Reward:  -1522.585600914639
Episode:  14 Time:  200 Reward:  -1431.1348188891204
Episode:  15 Time:  200 Reward:  -1564.1871729411055
Episode:  16 Time:  200 Reward:  -1493.0111305240432
Episode:  17 Time:  200 Reward:  -1519.554739423586
Episode:  18 Time:  200 Reward:  -1556.2142488555114
Episode:  19 Time:  200 Reward:  -1498.102078869386
Episode:  20 Time:  200 Reward:  -1453.945119481014
Episode:  21 Time:  200 Reward:  -1533.1597805871293
Episode:  22 Time:  200 Reward:  -1436.5016790465243
Episode:  23 Time:  200 Reward:  -1539.0100602240902
Epi