In [1]:
import numpy as np
import gym
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda
import matplotlib.pyplot as plt
import tensorflow as tf
import threading
import multiprocessing

In [2]:
def build_network1(state_dim, action_dim, action_bound):
    state_input = Input((state_dim,))
    h1 = Dense(64, activation='relu')(state_input)
    h2 = Dense(32, activation='relu')(h1)
    h3 = Dense(16, activation='relu')(h2)
    out_mu = Dense(action_dim, activation='tanh')(h3)
    std_output = Dense(action_dim, activation='softplus')(h3)

    # Scale output to [-action_bound, action_bound]
    mu_output = Lambda(lambda x: x * action_bound)(out_mu)
    model = Model(state_input, [mu_output, std_output])
    # model.summary()
    model._make_predict_function()  # class 안에서 def가 정의되면 필요없음
    return model

In [3]:
class Global_Actor(object):
    """
        Global Actor Network for A3C
    """
    def __init__(self, state_dim, action_dim, action_bound, learning_rate, entropy_beta):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.entropy_beta = entropy_beta

        self.std_bound = [1e-2, 1]  # std bound

        self.model = build_network1(self.state_dim,
                                                            self.action_dim,
                                                            self.action_bound)

        self.actor_optimizer = tf.keras.optimizers.Adam(self.learning_rate)



    ## log policy pdf
    def log_pdf(self, mu, std, action):
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std**2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(var * 2 * np.pi)
        entropy = 0.5 * (tf.math.log(2 * np.pi * std ** 2) + 1.0)
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True), tf.reduce_sum(entropy, 1, keepdims=True)


    ## train the actor network run by worker
    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            # policy pdf
            mu_a, std_a = self.model(states)
            log_policy_pdf, entropy = self.log_pdf(mu_a, std_a, actions)

            # loss function and its gradient
            loss_policy = log_policy_pdf * advantages
            loss = tf.reduce_sum(-loss_policy - self.entropy_beta * entropy)
        dj_dtheta = tape.gradient(loss, self.model.trainable_variables)

        # gradient clipping
        dj_dtheta, _ = tf.clip_by_global_norm(dj_dtheta, 40) #40

        # pretend gradient wrt global theta
        grads = zip(dj_dtheta, self.model.trainable_variables)
        self.actor_optimizer.apply_gradients(grads)

    ## actor prediction
    def predict(self, state):
        mu_a, _= self.model.predict(np.reshape(state, [1, self.state_dim]))
        return mu_a[0]

In [4]:
class Worker_Actor(object):
    """
        Worker Actor Network for A3C
    """
    def __init__(self, state_dim, action_dim, action_bound):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound

        self.std_bound = [1e-2, 1]  # std bound

        self.model = build_network1(self.state_dim, self.action_dim, self.action_bound)


    ## actor policy
    def get_action(self, state):
        # type of action in env is numpy array
        # np.reshape(state, [1, self.state_dim]) : shape (state_dim,) -> shape (1, state_dim)
        # why [0]?  shape (1, action_dim) -> (action_dim,)
        mu_a, std_a = self.model.predict(np.reshape(state, [1, self.state_dim]))
        mu_a = mu_a[0]
        std_a = std_a[0]
        std_a = np.clip(std_a, self.std_bound[0], self.std_bound[1])
        action = np.random.normal(mu_a, std_a, size=self.action_dim)
        return action

In [5]:
def build_network2(state_dim):
    state_input = Input((state_dim,))
    h1 = Dense(64, activation='relu')(state_input)
    h2 = Dense(32, activation='relu')(h1)
    h3 = Dense(16, activation='relu')(h2)
    v_output = Dense(1, activation='linear')(h3)
    model = Model(state_input, v_output)
    #model.summary()
    model._make_predict_function()  # class 안에서 def가 정의되면 필요없음
    return model

In [6]:
class Global_Critic(object):
    """
        Global Critic Network for A3C: V function approximator
    """
    def __init__(self, state_dim, action_dim, learning_rate):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate

        self.model = build_network2(state_dim)

        self.critic_optimizer = tf.keras.optimizers.Adam(self.learning_rate)


    ## train the critic network run by worker
    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            # loss function and its gradient
            v_values = self.model(states)
            loss = tf.reduce_sum(tf.square(td_targets-v_values))
        dj_dphi = tape.gradient(loss, self.model.trainable_variables)

        # gradient clipping
        dj_dphi, _ = tf.clip_by_global_norm(dj_dphi, 40) #40

        # gradients
        grads = zip(dj_dphi, self.model.trainable_variables)

        self.critic_optimizer.apply_gradients(grads)


In [7]:
class Worker_Critic(object):
    """
        Critic Network for A3C: V function approximator
    """
    def __init__(self, state_dim):

        self.model = build_network2(state_dim)

In [8]:
global_episode_count = 0
global_step = 0
global_episode_reward = []  # save the results

In [9]:
class A3Cagent(object):

    """
        Global network
    """
    def __init__(self, env_name):

        # training environment
        self.env_name = env_name
        self.WORKERS_NUM = multiprocessing.cpu_count() #4

        # hyperparameters
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01

        # get state dimension
        env = gym.make(self.env_name)
        state_dim = env.observation_space.shape[0]
        # get action dimension
        action_dim = env.action_space.shape[0]
        # get action bound
        action_bound = env.action_space.high[0]

        # create global actor and critic networks
        self.global_actor = Global_Actor(state_dim, action_dim, action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA)
        self.global_critic = Global_Critic(state_dim, action_dim, self.CRITIC_LEARNING_RATE)


    def train(self, max_episode_num):

        workers = []

        # create worker
        for i in range(self.WORKERS_NUM):
            worker_name = 'worker%i' % i
            workers.append(A3Cworker(worker_name, self.env_name, self.global_actor,
                                     self.global_critic, max_episode_num))


         # create worker (multi-agents) and do parallel training
        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()
            
        print(global_episode_reward)


    ## save them to file if done
    def plot_result(self):
        plt.plot(global_episode_reward)
        plt.show()


In [10]:
class A3Cworker(threading.Thread):

    """
        local agent network (worker)
    """
    def __init__(self, worker_name, env_name, global_actor, global_critic, max_episode_num):
        threading.Thread.__init__(self)

        #self.lock = threading.Lock()

        # hyperparameters
        self.GAMMA = 0.95
        self.t_MAX = 4 # t-step prediction

        self.max_episode_num = max_episode_num

        # environment
        self.env = gym.make(env_name)
        self.worker_name = worker_name

        # global network sharing
        self.global_actor = global_actor
        self.global_critic = global_critic


        # get state dimension
        self.state_dim = self.env.observation_space.shape[0]
        # get action dimension
        self.action_dim = self.env.action_space.shape[0]
        # get action bound
        self.action_bound = self.env.action_space.high[0]

        # create local actor and critic networks
        self.worker_actor = Worker_Actor(self.state_dim, self.action_dim, self.action_bound)
        self.worker_critic = Worker_Critic(self.state_dim)

        # initial transfer global network parameters to worker network parameters
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())


    ## computing Advantages and targets: y_k = r_k + gamma*V(s_k+1), A(s_k, a_k)= y_k - V(s_k)
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value

        for k in reversed(range(0, len(rewards))):
            cumulative = self.GAMMA * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets


    ## convert (list of np.array) to np.array
    def unpack_batch(self, batch):
        unpack = batch[0]
        for idx in range(len(batch) - 1):
            unpack = np.append(unpack, batch[idx + 1], axis=0)

        return unpack


    # train each worker
    def run(self):

        global global_episode_count, global_step
        global global_episode_reward  # total episode across all workers

        print(self.worker_name, "starts ---")

        while global_episode_count <= int(self.max_episode_num):

            # initialize batch
            batch_state, batch_action, batch_reward = [], [], []

            # reset episode
            step, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset() # shape of state from gym (3,)

            while not done:

                # visualize the environment
                #self.env.render()
                # pick an action (shape of gym action = (action_dim,) )
                action = self.worker_actor.get_action(state)
                # clip continuous action to be within action_bound
                action = np.clip(action, -self.action_bound, self.action_bound)
                # observe reward, new_state, shape of output of gym (state_dim,)
                next_state, reward, done, _ = self.env.step(action)

                # change shape (state_dim,) -> (1, state_dim), same to action, next_state
                state = np.reshape(state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                action = np.reshape(action, [1, self.action_dim])

                # append to the batch
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward+8)/8) # <-- normalization
                #batch_reward.append(reward)

                # update state and step
                state = next_state
                step += 1
                episode_reward += reward[0]

                # if batch is full or episode ends, start to train global on batch
                if len(batch_state) == self.t_MAX or done:

                    # extract states, actions, rewards from batch
                    states = self.unpack_batch(batch_state)
                    actions = self.unpack_batch(batch_action)
                    rewards = self.unpack_batch(batch_reward)

                    # clear the batch
                    batch_state, batch_action, batch_reward = [], [], []

                    # compute n-step TD target and advantage prediction with global network
                    next_state = np.reshape(next_state, [1, self.state_dim])
                    next_v_value = self.global_critic.model.predict(next_state)
                    n_step_td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    v_values = self.global_critic.model.predict(states)
                    advantages = n_step_td_targets - v_values


                    #with self.lock:
                    # update global critic
                    self.global_critic.train(states, n_step_td_targets)
                    # update global actor
                    self.global_actor.train(states, actions, advantages)

                    # transfer global network parameters to worker network parameters
                    self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                    self.worker_critic.model.set_weights(self.global_critic.model.get_weights())

                    # update global step
                    global_step += 1

                if done:
                    # update global episode count
                    global_episode_count += 1
                    ## display rewards every episode
                    print('Worker name:', self.worker_name, ', Episode: ', global_episode_count,
                          ', Step: ', step, ', Reward: ', episode_reward)

                    global_episode_reward.append(episode_reward)

In [None]:
max_episode_num = 1000
env_name = 'Pendulum-v0'
agent = A3Cagent(env_name)

agent.train(max_episode_num)

agent.plot_result()

worker0worker1 starts ---
 starts ---
worker2 starts ---
worker3 starts ---


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layer

Worker name: worker2 , Episode:  33 , Step:  200 , Reward:  [-1082.32920994]
Worker name: worker0 , Episode:  34 , Step:  200 , Reward:  [-1100.92773176]
Worker name: worker1 , Episode:  35 , Step:  200 , Reward:  [-1222.12587953]
Worker name: worker3 , Episode:  36 , Step:  200 , Reward:  [-1137.87574534]
Worker name: worker2 , Episode:  37 , Step:  200 , Reward:  [-1239.54764892]
Worker name: worker0 , Episode:  38 , Step:  200 , Reward:  [-1015.52391559]
Worker name: worker1 , Episode:  39 , Step:  200 , Reward:  [-1193.21393883]
Worker name: worker3 , Episode:  40 , Step:  200 , Reward:  [-1136.90006867]
Worker name: worker2 , Episode:  41 , Step:  200 , Reward:  [-989.25127792]
Worker name: worker0 , Episode:  42 , Step:  200 , Reward:  [-1227.89576087]
Worker name: worker1 , Episode:  43 , Step:  200 , Reward:  [-1156.11884423]
Worker name: worker3 , Episode:  44 , Step:  200 , Reward:  [-1034.91138264]
Worker name: worker2 , Episode:  45 , Step:  200 , Reward:  [-1240.77737245]


Worker name: worker3 , Episode:  140 , Step:  200 , Reward:  [-636.74917679]
Worker name: worker0 , Episode:  141 , Step:  200 , Reward:  [-1164.12407522]
Worker name: worker2 , Episode:  142 , Step:  200 , Reward:  [-1003.22126836]
Worker name: worker1 , Episode:  143 , Step:  200 , Reward:  [-859.12711028]
Worker name: worker3 , Episode:  144 , Step:  200 , Reward:  [-990.38045243]
Worker name: worker0 , Episode:  145 , Step:  200 , Reward:  [-805.88600949]
Worker name: worker2 , Episode:  146 , Step:  200 , Reward:  [-876.89654527]
Worker name: worker3 , Episode:  147 , Step:  200 , Reward:  [-981.26953896]
Worker name: worker1 , Episode:  148 , Step:  200 , Reward:  [-825.13568581]
Worker name: worker0 , Episode:  149 , Step:  200 , Reward:  [-1284.51644296]
Worker name: worker2 , Episode:  150 , Step:  200 , Reward:  [-759.83113923]
Worker name: worker3 , Episode:  151 , Step:  200 , Reward:  [-1002.41341761]
Worker name: worker1 , Episode:  152 , Step:  200 , Reward:  [-1015.3777

Worker name: worker2 , Episode:  247 , Step:  200 , Reward:  [-768.85174466]
Worker name: worker1 , Episode:  248 , Step:  200 , Reward:  [-743.54035515]
Worker name: worker0 , Episode:  249 , Step:  200 , Reward:  [-969.34068723]
Worker name: worker3 , Episode:  250 , Step:  200 , Reward:  [-678.08232711]
Worker name: worker2 , Episode:  251 , Step:  200 , Reward:  [-885.64272387]
Worker name: worker1 , Episode:  252 , Step:  200 , Reward:  [-765.8843438]
Worker name: worker0 , Episode:  253 , Step:  200 , Reward:  [-739.64767758]
Worker name: worker3 , Episode:  254 , Step:  200 , Reward:  [-743.6435837]
Worker name: worker2 , Episode:  255 , Step:  200 , Reward:  [-880.95014546]
Worker name: worker1 , Episode:  256 , Step:  200 , Reward:  [-740.92571042]
Worker name: worker0 , Episode:  257 , Step:  200 , Reward:  [-988.02749406]
Worker name: worker3 , Episode:  258 , Step:  200 , Reward:  [-783.06387183]
Worker name: worker2 , Episode:  259 , Step:  200 , Reward:  [-1048.24422334]


Worker name: worker3 , Episode:  354 , Step:  200 , Reward:  [-647.65350145]
Worker name: worker1 , Episode:  355 , Step:  200 , Reward:  [-258.20092543]
Worker name: worker2 , Episode:  356 , Step:  200 , Reward:  [-645.77199929]
Worker name: worker0 , Episode:  357 , Step:  200 , Reward:  [-876.33852675]
Worker name: worker3 , Episode:  358 , Step:  200 , Reward:  [-509.02994911]
Worker name: worker1 , Episode:  359 , Step:  200 , Reward:  [-255.90082977]
Worker name: worker2 , Episode:  360 , Step:  200 , Reward:  [-487.28876345]
Worker name: worker0 , Episode:  361 , Step:  200 , Reward:  [-369.31878329]
Worker name: worker3 , Episode:  362 , Step:  200 , Reward:  [-253.69100974]
Worker name: worker1 , Episode:  363 , Step:  200 , Reward:  [-389.36347919]
Worker name: worker2 , Episode:  364 , Step:  200 , Reward:  [-412.98451207]
Worker name: worker0 , Episode:  365 , Step:  200 , Reward:  [-398.84584708]
Worker name: worker3 , Episode:  366 , Step:  200 , Reward:  [-892.83162985]