In [1]:
import gym
import numpy as np 
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Add, Multiply
#from tensorflow.keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K

import tensorflow as tf 

import random 
from collections import deque
import os

Using TensorFlow backend.


$$\frac{\delta C}{\delta \Theta_A} = \frac{\delta C}{\delta A} \times \frac{\delta A}{\delta \Theta_A}$$

In [2]:
batch_size = 50

In [3]:
class A3C():
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        
        self.l_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.0001
        self.epsilon_decay = 0.9995
        self.gamma = 0.95
        self.tau = 0.125
        self.memory = deque(maxlen = 10000)
        
        self.actor_state_input, self.actor_model = self.create_actor()
        _, self.target_actor_model = self.create_actor()
        
        self.actor_critic_grad = tf.placeholder(tf.float32, shape = [None, self.env.action_space.shape[0]]) #input 𝛿𝐶 / 𝛿𝐴
        
        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights, -self.actor_critic_grad) #𝛿𝐶 / 𝛿Θ_𝐴
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.l_rate).apply_gradients(grads)
        
        self.critic_state_input, self.critic_action_input, self.critic_model = self.create_critic()
        _, _, self.target_critic_model = self.create_critic()
        
        self.critic_grads = tf.gradients(self.critic_model.output, self.critic_action_input) #calculate 𝛿𝐶 / 𝛿𝐴
        
        self.sess.run(tf.initialize_all_variables())
        
        
    def create_actor(self):
        state_input = Input(shape = self.env.observation_space.shape)
        h1 = Dense(24, activation = 'relu')(state_input)
        h2 = Dense(48, activation = 'relu')(h1)
        h3 = Dense(24, activation = 'relu')(h2)
        output = Dense(self.env.action_space.shape[0], activation = 'relu')(h3)
        
        model = Model(inputs = state_input, outputs = output)
        optimizer = Adam(lr = self.l_rate)
        model.compile(loss = "mse", optimizer = optimizer)
        return state_input, model
    
    def create_critic(self):
        state_input = Input(shape = self.env.observation_space.shape)
        state_h1 = Dense(24, activation = 'relu')(state_input)
        state_h2 = Dense(48)(state_h1)
        
        action_input = Input(shape = self.env.action_space.shape)
        action_h1 = Dense(48)(action_input)
        
        merged = Add()([state_h2, action_h1])
        merged_h1 = Dense(24, activation = 'relu')(merged)
        output = Dense(1, activation = 'relu')(merged_h1)
        
        model = Model(inputs = [state_input, action_input], outputs = output)
        optimizer = Adam(lr = self.l_rate)
        model.compile(loss = "mse", optimizer = optimizer)
        return state_input, action_input, model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append([state, action, reward, next_state, done])
        
    def train_actor(self, sample):
        for state, action, _, _, _ in sample:
            pred_action = self.actor_model.predict(state)
            grads = self.sess.run(self.critic_grads, feed_dict = {self.critic_state_input : state, 
                                                                  self.critic_action_input : pred_action})[0]
            
            self.sess.run(self.optimize, feed_dict = {self.actor_state_input : state, self.actor_critic_grad : grads})
            
    def train_critic(self, sample):
        for state, action, reward, next_state, done in sample:
            value = reward
            if not done:
                target_action = self.target_actor_model.predict(next_state)
                future_reward = self.target_critic_model.predict([next_state, target_action])[0][0]
                value += self.gamma * future_reward
            self.critic_model.fit([state, action], value, verbose = 0)
            
    def train(self):
        if len(self.memory) < batch_size:
            return
        
        sample = random.sample(self.memory, batch_size)
        self.train_critic(sample)
        self.train_actor(sample)
        
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        actor_target_weights = self.target_actor_model.get_weights()
    
        for i in range(len(actor_model_weights)):
            actor_target_weights[i] = self.tau * actor_model_weights[i] + (1 - self.tau) * actor_target_weights[i]
        self.target_actor_model.set_weights(actor_target_weights)
        
    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        critic_target_weights = self.target_critic_model.get_weights()
        
        for i in range(len(critic_model_weights)):
            critic_target_weights[i] = self.tau * critic_model_weights[i] + (1 - self.tau) * critic_target_weights[i]
        self.target_critic_model.set_weights(critic_target_weights)
        
    def update_target(self):
        self.update_actor_target()
        self.update_critic_target()
        
    def act(self, state):
        self.epsilon = max(self.epsilon_decay * self.epsilon , self.epsilon_min)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self.actor_model.predict(state)
    
    def save_weights(self, path = './model_weights/', filename = '_'):
        
        if not os.path.exists(path):
            os.mkdir(path)
            
        self.actor_model.save_weights(path +'_actor_'+ filename)
        self.target_actor_model.save_weights(path +'_target_actor_'+ filename)
        self.critic_model.save_weights(path +'critic_'+ filename)
        self.target_critic_model.save_weights(path +'_target_critic_actor_'+ filename)
        
    def load_weights(self, path = './model_weights/', filename = '_'):
        
        self.actor_model.load_weights(path +'_actor_'+ filename)
        self.target_actor_model.load_weights(path +'_target_actor_'+ filename)
        self.critic_model.load_weights(path +'critic_'+ filename)
        self.target_critic_model.load_weights(path +'_target_critic_actor_'+ filename)
    

In [4]:
episode_infos = {'run' : [], 'total_reward' : [], 'avg_reward' : []}

In [5]:
def main():
       
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make('Pendulum-v0')
    actor_critic = A3C(env, sess)
    
    #max_episodes = 10000
    max_steps = 500
    run = 0
    avg_reward = 0
    while True:
        run_reward = 0
        run += 1
        
        state = env.reset()
        state = np.reshape(state, [1, env.observation_space.shape[0]])
        
        for step in range(max_steps):
            action = actor_critic.act(state)
            action = np.reshape(action, [1, env.action_space.shape[0]])
        
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, env.observation_space.shape[0]])
        
            actor_critic.remember(state, action, reward, next_state, done)
            actor_critic.train()
            actor_critic.update_target()
        
            state = next_state
            run_reward += reward
            
            #env.render()
        
        if run == 1:
            avg_reward = run_reward
        avg_reward = 0.98 * avg_reward + 0.02 * run_reward
        
        episode_infos['run'].append(run)
        episode_infos['total_reward'].append(run_reward)
        episode_infos['avg_reward'].append(avg_reward)
                
        print('epsilon : ' + str(actor_critic.epsilon))
        print('run : ' + str(run) + ' score : ' + str(run_reward) + ' avg_score : ' + str(avg_reward))

In [6]:
main()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Instructions for updating:
Use tf.cast instead.
epsilon : 0.7787520933134615
run : 1 score : [-2833.3909] avg_score : [-2833.3909]
epsilon : 0.606454822840097
run : 2 score : [-2253.334] avg_score : [-2821.7898]
epsilon : 0.4722779627867691
run : 3 score : [-2334.2202] avg_score : [-2812.0383]
epsilon : 0.3677874521460121
run : 4 score : [-4588.4077] avg_score : [-2847.5657]
epsilon : 0.28641524825313086
run : 5 score : [-4198.906] avg_score : [-2874.5925]
epsilon : 0.22304647413401948
run : 6 score : [-4608.947] avg_score : [-2909.2798]
epsilon : 0.17369790863805412
run : 7 score : [-4288.5117] avg_score : [-2936.8645]
epsilon : 0.13526760995605422
run : 8 score : [-4308.6074] avg_score : [-2964.2993]
epsilon : 0.10533993441078586
run : 9 score : [-3552.119] avg_score : [-2976.056]
epsilon : 0.0820336944319021
run : 10 score : [-4601.8477] a

KeyboardInterrupt: 

In [None]:
t_r = np.array(episode_infos['total_reward']).reshape([1,episode_infos['run'][-1]])[0]
a_r = np.array(episode_infos['avg_reward']).reshape([1,episode_infos['run'][-1]])[0]

In [None]:
import plotly.offline as py
import plotly.graph_objs as go

# Create random data with numpy
import numpy as np

py.init_notebook_mode()

random_x = np.array(range(episode_infos['run'][-1]))#episode_infos['run'][-1]np.linspace(0, 1, episode_infos['run'][-1])

# Create a trace
reward_total = go.Scatter(
    x = random_x,
    y = t_r,
    line = dict(
        color = ('rgb(255, 125, 33)'),
        width = 1,)
)

reward_100_avg_ = go.Scatter(
    x = random_x,
    y = a_r,
    line = dict(
        color = ('rgb(66, 134, 244)'),
        width = 1,
        dash = 'dash')
)

In [None]:
py.iplot([reward_total, reward_100_avg_], filename='basic-area')