In [1]:
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda

import gym
import tensorflow as tf
import matplotlib.pyplot as plt

import threading
import multiprocessing

In [25]:
def build_network1(state_dim,action_dim,action_bound):
    state_input=Input((state_dim,))
    h1=Dense(64,activation='relu')(state_input)
    h2=Dense(32,activation='relu')(h1)
    h3=Dense(16,activation='relu')(h2)
    out_mu=Dense(action_dim,activation='tanh')(h3)
    std_output=Dense(action_dim,activation='softplus')(h3)
    mu_output=Lambda(lambda x: x*action_bound)(out_mu)
    
    model=Model(state_input,[mu_output,std_output])
    #model.summary()
    model._make_predict_function()
    return model

In [14]:
class GlobalActor(object):
    def __init__(self,state_dim,action_dim,action_bound,learning_rate,entropy_beta):
        self.state_dim=state_dim
        self.action_dim=action_dim
        self.action_bound=action_bound
        self.learning_rate=learning_rate
        self.entropy_beta=entropy_beta
        self.std_bound=[1e-2,1]
        self.model=build_network1(self.state_dim,self.action_dim,self.action_bound)
        self.actor_optimizer=tf.keras.optimizers.Adam(self.learning_rate)
    #log policy pdf    
    def log_pdf(self,mu,std,action):
        std=tf.clip_by_value(std,self.std_bound[0],self.std_bound[1])
        var=std**2
        log_policy_pdf=-0.5*(action-mu)**2/var-0.5*tf.math.log(var*2*np.pi)
        entropy=0.5*(tf.math.log(2*np.pi*std**2)+1.0)
        return tf.reduce_sum(log_policy_pdf,1,keepdims=True),tf.reduce_sum(entropy,1,keepdims=True)
    
    def train(self,states,actions,advantages):
        with tf.GradientTape() as tape:
            mu_a,std_a=self.model(states)
            log_policy_pdf,entropy=self.log_pdf(mu_a,std_a,actions)
            loss_policy=log_policy_pdf*advantages
            loss=tf.reduce_sum(-loss_policy-self.entropy_beta*entropy)
        dj_dtheta=tape.gradient(loss,self.model.trainable_variables)
        dj_dtheta,_=tf.clip_by_global_norm(dj_dtheta,40)
        
        grads=zip(dj_dtheta,self.model.trainable_variables)
        self.actor_optimizer.apply_gradients(grads)
        
    def prdict(self,state):
        mu_a,_=self.model.predict(np.reshape(state,[1,self.state_dim]))
        return mu_a[0]

In [35]:
class WorkerActor(object):
    def __init__(self,state_dim,action_dim,action_bound):
        self.state_dim=state_dim
        self.action_dim=action_dim
        self.action_bound=action_bound
        self.std_bound=[1e-2,1]
        self.model=build_network1(self.state_dim,self.action_dim,self.action_bound)
        
    def get_action(self,state):
        mu_a,std_a=self.model.predict(np.reshape(state,[1,self.state_dim]))
        mu_a=mu_a[0]
        std_a=std_a[0]
        std_a=np.clip(std_a,self.std_bound[0],self.std_bound[1])
        action=np.random.normal(mu_a,std_a,size=self.action_dim)
        return action
    

In [36]:
def build_network2(state_dim):
    state_input=Input((state_dim,))
    h1=Dense(64,activation='relu')(state_input)
    h2=Dense(32,activation='relu')(h1)
    h3=Dense(16,activation='relu')(h2)
    v_output=Dense(1,activation='linear')(h3)
    model=Model(state_input,v_output)
    #model.summary()
    model._make_predict_function()
    return model

In [50]:
class GlobalCritic(object):
    
    def __init__(self,state_dim,action_dim,learning_rate):
        self.state_dim=state_dim
        self.action_dim=action_dim
        self.learning_rate=learning_rate
        self.model=build_network2(state_dim)
        self.critic_optimizer=tf.keras.optimizers.Adam(self.learning_rate)
        
    def train(self,states,td_targets):
        with tf.GradientTape() as tape:
            v_values=self.model(states)
            loss=tf.reduce_sum(tf.square(td_targets-v_values))
        dj_dphi=tape.gradient(loss,self.model.trainable_variables)
        dj_dphi,_=tf.clip_by_global_norm(dj_dphi,40)
        
        grads=zip(dj_dphi,self.model.trainable_variables)
        
        self.critic_optimizer.apply_gradients(grads)
    

In [51]:
class WorkerCritic(object):
    def __init__(self,state_dim):
        self.model=build_network2(state_dim)

In [52]:
global_episode_count = 0
global_step = 0
global_episode_reward = []  # save the results

In [53]:
class A3Cagent(object):
    def __init__(self,env_name):
        self.env_name=env_name
        self.WORKERS_NUM=multiprocessing.cpu_count() #4
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.ENTROPY_BETA = 0.01
        
        env = gym.make(self.env_name)
        state_dim = env.observation_space.shape[0]
        # get action dimension
        action_dim = env.action_space.shape[0]
        # get action bound
        action_bound = env.action_space.high[0]
        
        self.global_actor = GlobalActor(state_dim, action_dim, action_bound, self.ACTOR_LEARNING_RATE,
                                         self.ENTROPY_BETA)
        self.global_critic = GlobalCritic(state_dim, action_dim, self.CRITIC_LEARNING_RATE)

    def train(self,max_episode_num):
        workers=[]
        for i in range(self.WORKERS_NUM):
            worker_name='worker%i' %i
            workers.append(A3Cworker(worker_name,self.env_name,self.global_actor,
                                    self.global_critic,max_episode_num))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        print(global_episode_reward)
    def plot_result(self):
        plt.plot(global_episode_reward)
        plt.show()

In [54]:
class A3Cworker(threading.Thread):
    def __init__(self,worker_name,env_name,global_actor,global_critic,max_episode_num):
        threading.Thread.__init__(self)
        
        self.GAMMA=0.95
        self.t_MAX=4
        self.max_episode_num=max_episode_num
        
        self.env=gym.make(env_name)
        self.worker_name=worker_name
        
        self.global_actor = global_actor
        self.global_critic = global_critic
        # get state dimension
        self.state_dim = self.env.observation_space.shape[0]
        # get action dimension
        self.action_dim = self.env.action_space.shape[0]
        # get action bound
        self.action_bound = self.env.action_space.high[0]

        # create local actor and critic networks
        self.worker_actor = WorkerActor(self.state_dim, self.action_dim, self.action_bound)
        self.worker_critic = WorkerCritic(self.state_dim)
        
        self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
        self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
        
    def n_step_td_target(self,rewards,next_v_value,done):
        td_targets=np.zeros_like(rewards)
        cumulative=0
        if not done:
            cumulative=next_v_value
        for k in reversed(range(0,len(rewards))):
            cumulative=self.GAMMA*cumulative+rewards[k]
            td_targets[k]=cumulative
        return td_targets
    
    def unpack_batch(self,batch):
        unpack=batch[0]
        for idx in range(len(batch)-1):
            unpack=np.append(unpack,batch[idx+1],axis=0)
        return unpack
    
    def run(self):
        global global_episode_count, global_step
        global global_episode_reward  # total episode across all workers

        print(self.worker_name, "starts ---")
        
        while global_episode_count<=int(self.max_episode_num):
            batch_state,batch_action,batch_reward=[],[],[]
            step,episode_reward,done=0,0,False
            state=self.env.reset()
            
            while not done:
                #self.env.render()
                action=self.worker_actor.get_action(state)
                action=np.clip(action,-self.action_bound,self.action_bound)
                next_state,reward,done,_=self.env.step(action)
                state=np.reshape(state,[1,self.state_dim])
                reward=np.reshape(reward,[1,1])
                action=np.reshape(action,[1,self.action_dim])
                
                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward+8)/8)
                
                state=next_state
                step+=1
                episode_reward+=reward[0]
                
                if len(batch_state)==self.t_MAX or done:
                    states=self.unpack_batch(batch_state)
                    actions=self.unpack_batch(batch_action)
                    rewards=self.unpack_batch(batch_reward)
                    
                    batch_state, batch_action, batch_reward = [], [], []
                    
                    next_state=np.reshape(next_state,[1,self.state_dim])
                    next_v_value=self.global_critic.model.predict(next_state)
                    n_step_td_targets=self.n_step_td_target(rewards,next_v_value,done)
                    v_values=self.global_critic.model.predict(states)
                    advantages=n_step_td_targets-v_values
                    
                    self.global_critic.train(states,n_step_td_targets)
                    self.global_actor.train(states,actions,advantages)
                    
                    self.worker_actor.model.set_weights(self.global_actor.model.get_weights())
                    self.worker_critic.model.set_weights(self.global_critic.model.get_weights())
                    
                    global_step+=1
                    
                if done:
                    global_episode_count+=1
                    print('Worker name:', self.worker_name, ', Episode: ', global_episode_count,
                          ', Step: ', step, ', Reward: ', episode_reward)

                    global_episode_reward.append(episode_reward)

In [None]:
max_episode_num = 1000
env_name = 'Pendulum-v0'
agent = A3Cagent(env_name)

agent.train(max_episode_num)

agent.plot_result()



worker0worker1worker2   worker3 starts ---starts ---starts ---


starts ---


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layer

Worker name: worker0 , Episode:  33 , Step:  200 , Reward:  [-906.2568021]
Worker name: worker2 , Episode:  34 , Step:  200 , Reward:  [-1409.22323095]
Worker name: worker3 , Episode:  35 , Step:  200 , Reward:  [-1698.46008145]
Worker name: worker1 , Episode:  36 , Step:  200 , Reward:  [-1030.67479638]
Worker name: worker0 , Episode:  37 , Step:  200 , Reward:  [-1004.57159363]
Worker name: worker2 , Episode:  38 , Step:  200 , Reward:  [-1050.71747723]
Worker name: worker3 , Episode:  39 , Step:  200 , Reward:  [-1036.47069963]
Worker name: worker1 , Episode:  40 , Step:  200 , Reward:  [-1016.49180005]
Worker name: worker0 , Episode:  41 , Step:  200 , Reward:  [-1348.6196252]
Worker name: worker2 , Episode:  42 , Step:  200 , Reward:  [-967.73524847]
Worker name: worker3 , Episode:  43 , Step:  200 , Reward:  [-967.02725715]
Worker name: worker1 , Episode:  44 , Step:  200 , Reward:  [-1048.38728281]
Worker name: worker0 , Episode:  45 , Step:  200 , Reward:  [-1313.64106649]
Work

Worker name: worker0 , Episode:  140 , Step:  200 , Reward:  [-1137.45334303]
Worker name: worker1 , Episode:  141 , Step:  200 , Reward:  [-1050.542744]
Worker name: worker2 , Episode:  142 , Step:  200 , Reward:  [-1002.86834087]
Worker name: worker3 , Episode:  143 , Step:  200 , Reward:  [-1252.50075003]
Worker name: worker0 , Episode:  144 , Step:  200 , Reward:  [-893.00110259]
Worker name: worker1 , Episode:  145 , Step:  200 , Reward:  [-1023.83026876]
Worker name: worker2 , Episode:  146 , Step:  200 , Reward:  [-906.40322247]
Worker name: worker3 , Episode:  147 , Step:  200 , Reward:  [-895.29734904]
Worker name: worker0 , Episode:  148 , Step:  200 , Reward:  [-736.6230298]
Worker name: worker1 , Episode:  149 , Step:  200 , Reward:  [-880.96390143]
Worker name: worker2 , Episode:  150 , Step:  200 , Reward:  [-691.82653835]
Worker name: worker0 , Episode:  151 , Step:  200 , Reward:  [-1124.77033188]
Worker name: worker3 , Episode:  152 , Step:  200 , Reward:  [-764.975345

Worker name: worker0 , Episode:  247 , Step:  200 , Reward:  [-883.68174886]
Worker name: worker2 , Episode:  248 , Step:  200 , Reward:  [-819.66988999]
Worker name: worker3 , Episode:  249 , Step:  200 , Reward:  [-640.31910477]
Worker name: worker1 , Episode:  250 , Step:  200 , Reward:  [-829.17352427]
Worker name: worker0 , Episode:  251 , Step:  200 , Reward:  [-761.78172893]
Worker name: worker2 , Episode:  252 , Step:  200 , Reward:  [-1188.95938582]
Worker name: worker3 , Episode:  253 , Step:  200 , Reward:  [-1005.28169052]
Worker name: worker1 , Episode:  254 , Step:  200 , Reward:  [-762.09917703]
Worker name: worker0 , Episode:  255 , Step:  200 , Reward:  [-879.63092583]
Worker name: worker2 , Episode:  256 , Step:  200 , Reward:  [-1059.62039461]
Worker name: worker3 , Episode:  257 , Step:  200 , Reward:  [-1027.24437982]
Worker name: worker1 , Episode:  258 , Step:  200 , Reward:  [-1039.55237964]
Worker name: worker0 , Episode:  259 , Step:  200 , Reward:  [-1138.411