Prerequisites for training:<br>
- Create a `ddpg` folder inside a `tmp` folder (i.e. `tmp/ddpg`) in the working directory; the network weights are stored there.

In [None]:
import numpy as np
import os
import tensorflow as tf 
from tensorflow.keras import layers 
from tensorflow.keras.optimizers import Adam

In [13]:
class ReplayBuffer:
    """Fixed-capacity circular store of environment transitions.

    Each transition is a (state, action, reward, new_state, done) tuple kept
    in five parallel NumPy arrays. Once ``max_size`` transitions have been
    stored, new ones overwrite the oldest slots.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate zero-filled storage.

        Args:
            max_size: number of transitions the buffer can hold.
            input_shape: iterable giving the shape of a single state.
            n_actions: number of components in a single action.
        """
        capacity = max_size
        self.memory_size = capacity
        # Running count of every transition ever stored (not capped).
        self.memory_center = 0

        state_shape = (capacity, *input_shape)
        self.state_memory = np.zeros(state_shape)
        self.new_state_memory = np.zeros(state_shape)
        self.action_memory = np.zeros((capacity, n_actions))
        self.reward_memory = np.zeros(capacity)
        self.terminal_memory = np.zeros(capacity, dtype=bool)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping when full."""
        slot = self.memory_center % self.memory_size

        writes = (
            (self.state_memory, state),
            (self.new_state_memory, new_state),
            (self.action_memory, action),
            (self.reward_memory, reward),
            (self.terminal_memory, done),
        )
        for storage, value in writes:
            storage[slot] = value

        self.memory_center += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Only slots that have been written are eligible. Sampling is without
        replacement, so ``np.random.choice`` raises ``ValueError`` when
        ``batch_size`` exceeds the number of filled slots.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` of arrays
            indexed by the same random selection.
        """
        filled = min(self.memory_center, self.memory_size)
        picks = np.random.choice(filled, batch_size, replace=False)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

In [14]:

class CriticNetwork(tf.keras.Model):
    """Q-value network: scores a batch of (state, action) pairs.

    The state and action are concatenated along axis 1 and passed through
    two ReLU hidden layers followed by a single linear output unit.
    """

    def __init__(self, fc1_dims=512, fc2_dims=512,
                name='critic', chkpt_dir='tmp/ddpg'):
        """Create the layers and work out where the weights are saved.

        Args:
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            name: label used to build the checkpoint file name; it is kept
                in ``model_name`` and is not forwarded to the Keras base class.
            chkpt_dir: directory that holds the checkpoint file.
        """
        super().__init__()

        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        weights_filename = self.model_name+'_ddpg.h5'
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            weights_filename)

        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.fc1 = layers.Dense(self.fc1_dims, activation='relu')
        self.fc2 = layers.Dense(self.fc2_dims, activation='relu')
        # Linear head: a Q-value is an unbounded real number.
        self.q = layers.Dense(1, activation=None)

    def call(self, state, action):
        """Return the Q-value estimate, one output column per batch row."""
        joint_input = tf.concat([state, action], axis=1)
        hidden = self.fc2(self.fc1(joint_input))
        return self.q(hidden)

class ActorNetwork(tf.keras.Model):
    """Deterministic policy network: maps a batch of states to actions.

    Two ReLU hidden layers feed a tanh output layer with ``n_actions`` units,
    so every action component produced here lies in [-1, 1].
    """

    def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2, 
                name='actor', chkpt_dir='tmp/ddpg'):
        """Create the layers and work out where the weights are saved.

        Args:
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_actions: number of action components the network outputs.
            name: label used to build the checkpoint file name; it is kept
                in ``model_name`` and is not forwarded to the Keras base class.
            chkpt_dir: directory that holds the checkpoint file.
        """
        super().__init__()

        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        weights_filename = self.model_name+'_ddpg.h5'
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            weights_filename)

        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.fc1 = layers.Dense(self.fc1_dims, activation='relu')
        self.fc2 = layers.Dense(self.fc2_dims, activation='relu')
        # tanh keeps each action component inside [-1, 1]; any scaling to a
        # wider action range has to happen outside this network.
        self.mu = layers.Dense(self.n_actions, activation='tanh')

    def call(self, state):
        """Return the policy's action for each state in the batch."""
        hidden = self.fc2(self.fc1(state))
        return self.mu(hidden)

In [15]:



class Agent:
    """DDPG agent: actor/critic networks, their targets and a replay buffer.

    The agent owns an online actor and critic, a slowly-tracking target copy
    of each, and a ``ReplayBuffer`` of past transitions. ``choose_action``
    queries the actor (optionally adding Gaussian exploration noise),
    ``remember`` stores a transition, and ``learn`` performs one critic
    update, one actor update and one soft target update.
    """

    def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 fc1=400, fc2=300, batch_size=64, noise=0.1):
        """Build the networks, optimizers and replay buffer.

        Args:
            input_dims: iterable giving the shape of a single observation.
            alpha: learning rate of the actor's Adam optimizer.
            beta: learning rate of the critic's Adam optimizer.
            env: environment whose ``action_space.high``/``low`` give the
                clipping bounds for actions. Required despite the ``None``
                default.
            gamma: discount factor used in the critic target.
            n_actions: number of action components.
            max_size: replay buffer capacity.
            tau: soft-update coefficient for the target networks.
            fc1: width of the first hidden layer of every network.
            fc2: width of the second hidden layer of every network.
            batch_size: number of transitions sampled per ``learn`` call.
            noise: standard deviation of the exploration noise.

        Raises:
            ValueError: if ``env`` is not supplied.
        """
        if env is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError(
                'Agent needs an env exposing action_space.high and '
                'action_space.low')

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        # fc1/fc2 are forwarded so the requested layer widths are actually
        # used. NOTE: weights saved by a version that ignored them (and used
        # the networks' own defaults) will not match these shapes.
        self.actor = ActorNetwork(fc1_dims=fc1, fc2_dims=fc2,
                                  n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(fc1_dims=fc1, fc2_dims=fc2,
                                    name='critic')

        self.target_actor = ActorNetwork(fc1_dims=fc1, fc2_dims=fc2,
                                         n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(fc1_dims=fc1, fc2_dims=fc2,
                                           name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))

        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        # Subclassed Keras models only create their weights on the first
        # call. Without this step the hard copy below would iterate over
        # empty weight lists and leave the targets with their own random
        # initialisation.
        self._build_networks(input_dims)
        self.update_network_parameters(tau=1)

    def _build_networks(self, input_dims):
        """Run each network once on zeros so its weights are created."""
        dummy_state = tf.zeros((1, *input_dims), dtype=tf.float32)
        dummy_action = tf.zeros((1, self.n_actions), dtype=tf.float32)
        for actor in (self.actor, self.target_actor):
            actor(dummy_state)
        for critic in (self.critic, self.target_critic):
            critic(dummy_state, dummy_action)

    @staticmethod
    def _soft_update(online, target, tau):
        """Set ``target`` weights to ``tau*online + (1 - tau)*target``."""
        blended = [
            online_w * tau + target_w * (1 - tau)
            for online_w, target_w in zip(online.weights, target.weights)
        ]
        target.set_weights(blended)

    def update_network_parameters(self, tau=None):
        """Move both target networks towards their online counterparts.

        Args:
            tau: interpolation weight given to the online network; ``1``
                copies the online weights outright. Defaults to ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.actor, self.target_actor, tau)
        self._soft_update(self.critic, self.target_critic, tau)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def _networks(self):
        """Return the four networks in the order they are saved/loaded."""
        return (self.actor, self.target_actor,
                self.critic, self.target_critic)

    def save_models(self):
        """Write the weights of all four networks to their checkpoint files."""
        print('... saving models ...')
        for network in self._networks():
            # Create the checkpoint directory on demand instead of failing
            # when it has not been made by hand.
            os.makedirs(network.checkpoint_dir, exist_ok=True)
            network.save_weights(network.checkpoint_file)

    def load_models(self):
        """Load the weights of all four networks from their checkpoint files."""
        print('... loading models ...')
        for network in self._networks():
            network.load_weights(network.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        """Return the actor's action for a single observation.

        Args:
            observation: one environment observation (no batch dimension).
            evaluate: when true, no exploration noise is added.

        Returns:
            Tensor holding the action for ``observation``, clipped to the
            environment's action bounds.
        """
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=self.noise)
        # NOTE: the actor's tanh output lies in [-1, 1] and is not rescaled
        # here. For an env whose action bound exceeds 1 the output would
        # have to be multiplied by max_action somewhere; as written, the
        # clip only limits the added noise in that case.
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        # Drop the batch dimension added above.
        return actions[0]

    def learn(self):
        """Run one DDPG update from a sampled batch.

        Does nothing until at least ``batch_size`` transitions have been
        stored. Otherwise updates the critic towards the one-step bootstrapped
        target, updates the actor to increase the critic's value of its
        actions, then soft-updates the target networks.
        """
        if self.memory.memory_center < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)
        # Explicit float mask (1 for non-terminal, 0 for terminal) rather
        # than relying on implicit numpy-bool/tensor arithmetic.
        not_done = 1.0 - tf.cast(done, tf.float32)

        # The target depends only on the target networks, so it is computed
        # outside the tape.
        target_actions = self.target_actor(states_)
        # squeeze the trailing size-1 output dimension
        critic_value_ = tf.squeeze(
            self.target_critic(states_, target_actions), 1)
        target = rewards + self.gamma * critic_value_ * not_done

        with tf.GradientTape() as tape:
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(
            critic_network_gradient, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            # Actions the current policy would take in the sampled states;
            # minimising -Q maximises the critic's value of those actions.
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        # Gradient is taken with respect to the actor's parameters only.
        actor_network_gradient = tape.gradient(
            actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(
            actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()

In [17]:
import gym
import matplotlib.pyplot as plt

# helper function
def plot_learning_curve(x, scores, figure_file, window=100):
    """Plot the trailing running average of ``scores`` and save the figure.

    Point ``i`` of the curve is the mean of the last ``window`` scores up to
    and including ``scores[i]`` (fewer at the start, where not enough scores
    exist yet).

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-episode scores.
        figure_file: path the figure is saved to.
        window: number of scores averaged per point.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')

    # The slice starts at i - window + 1 so that it holds exactly `window`
    # scores; starting at i - window would average window + 1 of them.
    running_avg = np.array([
        np.mean(scores[max(0, i - window + 1):i + 1])
        for i in range(len(scores))
    ])
    plt.plot(x, running_avg)
    plt.title('Running average of previous %d scores' % window)
    plt.savefig(figure_file)


# Environment, agent and run configuration.
env = gym.make('Pendulum-v0')
agent = Agent(
    input_dims=env.observation_space.shape,
    env=env,
    n_actions=env.action_space.shape[0],
)
episodes = 250
figure_file = 'pendulum.png'

# Start from the lower bound of the env's reward range so the first average
# score can count as an improvement.
best_score = env.reward_range[0]
score_history = []
load_checkpoint = False

if load_checkpoint:
    # Fill the buffer with batch_size + 1 random transitions and run a single
    # learn() step before loading the saved weights (presumably so the
    # networks' weights exist before load_models() is called).
    n_steps = 0
    while n_steps <= agent.batch_size:
        observation = env.reset()  # observation = state
        action = env.action_space.sample()
        new_observation, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, new_observation, done)
        n_steps += 1
    agent.learn()
    agent.load_models()

# Loaded checkpoints are evaluated without noise; fresh agents explore.
evaluate = load_checkpoint

for i in range(episodes):
    observation, done, score = env.reset(), False, 0
    while not done:
        action = agent.choose_action(observation, evaluate)
        new_observation, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, new_observation, done)
        score += reward
        # Only train when not evaluating a loaded checkpoint.
        if not load_checkpoint:
            agent.learn()
        observation = new_observation

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    # Checkpoint whenever the 100-episode average reaches a new best.
    if avg_score > best_score:
        best_score = avg_score
        if not load_checkpoint:
            agent.save_models()

    print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)

if not load_checkpoint:
    x = list(range(1, episodes + 1))
    plot_learning_curve(x, score_history, figure_file)

... saving models ...
episode  0 score -1662.2 avg score -1662.2
... saving models ...
episode  1 score -1075.7 avg score -1368.9
... saving models ...
episode  2 score -1289.3 avg score -1342.4
episode  3 score -1589.8 avg score -1404.3
episode  4 score -1720.7 avg score -1467.6
episode  5 score -1021.2 avg score -1393.2
episode  6 score -1332.7 avg score -1384.5
episode  7 score -1181.6 avg score -1359.1
episode  8 score -1661.7 avg score -1392.8
episode  9 score -1264.6 avg score -1380.0
episode  10 score -998.0 avg score -1345.2
... saving models ...
episode  11 score -888.5 avg score -1307.2
... saving models ...
episode  12 score -636.3 avg score -1255.6
... saving models ...
episode  13 score -843.6 avg score -1226.1
... saving models ...
episode  14 score -867.3 avg score -1202.2
... saving models ...
episode  15 score -861.9 avg score -1180.9
... saving models ...
episode  16 score -644.1 avg score -1149.4
episode  17 score -1409.6 avg score -1163.8
episode  18 score -918.3 av