In [None]:
# Standard library
import random
from collections import deque

# Third-party
import gym
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
# Merge layers (Add, Multiply) are exported from tensorflow.keras.layers;
# there is no public tensorflow.keras.layers.merge module.
from tensorflow.keras.layers import Add, Dense, Dropout, Input, Multiply
# The package is `models` (plural); `tensorflow.keras.model` does not exist.
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

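The actor is updated with the chain rule: the gradient of the critic output $C$ with respect to the actor weights $\Theta_A$ equals the gradient of $C$ with respect to the action $A$ times the gradient of $A$ with respect to $\Theta_A$.

$$\frac{\partial C}{\partial \Theta_A} = \frac{\partial C}{\partial A} \times \frac{\partial A}{\partial \Theta_A}$$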

In [None]:
class A3C():
    def __init__(self, env, sess):
        """Build the actor and critic networks, their target copies, and the actor update op.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.shape``; both are used to size the networks.
            sess: TensorFlow (graph-mode) session. It is stored on the instance
                and used here to run the variable initializer.
        """
        self.env = env
        self.sess = sess

        # Hyperparameters. Only l_rate is consumed in the code visible here;
        # the others are presumably used by the training/acting methods.
        self.l_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.9995
        self.gamma = 0.95
        self.tau = 0.125
        # Bounded buffer of transitions appended by remember().
        self.memory = deque(maxlen=10000)

        # --- Actor and its target copy ---------------------------------------
        self.actor_state_input, self.actor_model = self.create_actor()
        _, self.target_actor_model = self.create_actor()

        # Placeholder for dC/dA: one row per sample, one column per action dim.
        self.actor_critic_grad = tf.placeholder(
            tf.float32, shape=[None, self.env.action_space.shape[0]]
        )

        actor_model_weights = self.actor_model.trainable_weights
        # dC/dTheta_A via the chain rule: the third argument of tf.gradients
        # (grad_ys) weights dA/dTheta_A by dC/dA. It is negated so that the
        # optimizer step below (a descent step) ascends the critic output.
        self.actor_grads = tf.gradients(
            self.actor_model.output, actor_model_weights, -self.actor_critic_grad
        )
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.l_rate).apply_gradients(grads)

        # --- Critic and its target copy --------------------------------------
        (
            self.critic_state_input,
            self.critic_action_input,
            self.critic_model,
        ) = self.create_critic()
        _, _, self.target_critic_model = self.create_critic()

        # dC/dA: gradient of the critic output w.r.t. its action input.
        self.critic_grads = tf.gradients(
            self.critic_model.output, self.critic_action_input
        )

        # tf.initialize_all_variables() is deprecated in favour of this call.
        self.sess.run(tf.global_variables_initializer())
        
    def create_actor(self):
        """Build and compile the actor network (state -> action).

        The network is a stack of three relu hidden layers (24, 48, 24 units)
        followed by an output layer with one unit per action dimension.

        Returns:
            tuple: ``(state_input, model)`` where ``state_input`` is the Keras
            input tensor and ``model`` is the compiled Keras ``Model``.
        """
        state_input = Input(shape=self.env.observation_space.shape)
        h1 = Dense(24, activation='relu')(state_input)
        h2 = Dense(48, activation='relu')(h1)
        h3 = Dense(24, activation='relu')(h2)
        # NOTE(review): relu restricts every action component to be >= 0;
        # confirm this matches the environment's action range.
        output = Dense(self.env.action_space.shape[0], activation='relu')(h3)

        # tf.keras.Model takes `inputs`/`outputs`; the singular keyword names
        # are not accepted.
        model = Model(inputs=state_input, outputs=output)
        optimizer = Adam(lr=self.l_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return state_input, model
    
    def create_critic(self):
        """Build and compile the critic network ((state, action) -> scalar).

        The state and action inputs are each projected to 48 units, summed
        with ``Add``, and passed through two single-unit relu layers.

        Returns:
            tuple: ``(state_input, action_input, model)`` where the first two
            are the Keras input tensors and ``model`` is the compiled Keras
            ``Model`` taking ``[state, action]``.
        """
        # State branch.
        state_input = Input(shape=self.env.observation_space.shape)
        state_h1 = Dense(24, activation='relu')(state_input)
        state_h2 = Dense(48)(state_h1)

        # Action branch; projected to the same width so the branches can be added.
        action_input = Input(shape=self.env.action_space.shape)
        action_h1 = Dense(48)(action_input)

        merged = Add()([state_h2, action_h1])
        # NOTE(review): a one-unit hidden layer is a severe bottleneck;
        # confirm this width is intended.
        merged_h1 = Dense(1, activation='relu')(merged)
        # NOTE(review): relu on the output means the critic can never predict
        # a negative value; confirm this suits the reward scale.
        output = Dense(1, activation='relu')(merged_h1)

        # tf.keras.Model takes `inputs`/`outputs`; the singular keyword names
        # are not accepted.
        model = Model(inputs=[state_input, action_input], outputs=output)
        optimizer = Adam(lr=self.l_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return state_input, action_input, model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append([state, action, reward, next_state, done])
        
    def train_actor(self, sample):
        