In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque


env_to_use = 'Pendulum-v0'

# hyperparameters
gamma = 0.99				# reward discount factor
h1_actor = 8			# hidden layer 1 size for the actor
h2_actor = 8				# hidden layer 2 size for the actor
h3_actor = 8				# hidden layer 3 size for the actor
h1_critic = 8				# hidden layer 1 size for the critic
h2_critic = 8				# hidden layer 2 size for the critic
h3_critic = 8				# hidden layer 3 size for the critic
lr_actor = 1e-3				# learning rate for the actor
lr_critic = 1e-3			# learning rate for the critic
lr_decay = 1				# learning rate decay (per episode)
l2_reg_actor = 1e-6			# L2 regularization factor for the actor
l2_reg_critic = 1e-6		# L2 regularization factor for the critic
dropout_actor = 0			# dropout rate for actor (0 = no dropout)
dropout_critic = 0			# dropout rate for critic (0 = no dropout)
num_episodes = 150		# number of episodes
max_steps_ep = 10000	# default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2				# soft target update rate
train_every = 1			# number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5)	# capacity of experience replay memory
minibatch_size = 1024	# size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1	# scale of the exploration noise process (1.0 is the range of each action dimension)
noise_decay = 0.99		# decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0	# mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2	# sigma parameter for the exploration noise process: dXt = theta*(mu-Xt	)*dt + sigma*dWt

# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape)) 	# Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape))		# Assuming continuous action space

# set seeds to 0
env.seed(0)
np.random.seed(0)

# prepare monitorings
#outdir = '/tmp/ddpg-agent-results'
#env = wrappers.Monitor(env, outdir, force=True)


  result = entry_point.load(False)


In [2]:
replay_memory = deque(maxlen=replay_memory_capacity)			# used for O(1) popleft() operation

def add_to_memory(experience):
	replay_memory.append(experience)

def sample_from_memory(minibatch_size):
	return random.sample(replay_memory, minibatch_size)

In [3]:
import tensorflow as tf
class ANN():
    tf.reset_default_graph()
    # placeholders
    state_ph  =  tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
    action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
    reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
    next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
    is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
    #is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout

    # episode counter
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_inc_op = episodes.assign_add(1)
    
    def __init__(self):
        with tf.variable_scope('actor'):
            self.actor_net_value = ANN.generate_actor_network(self,trainable = True, reuse = False)

        # slow target actor network
        with tf.variable_scope('slow_target_actor', reuse=False):
            self.target_actor_net_value = tf.stop_gradient(ANN.generate_actor_network(self,trainable = False, reuse = False))

        with tf.variable_scope('critic') as scope:
            self.critic_net_value = ANN.generate_critic_network(self,trainable = True, reuse = False)
            self.q_value_for_actor_net = ANN.generate_critic_network(self,trainable = True, reuse = True,mode=2)

        # slow target critic
        with tf.variable_scope('slow_target_critic', reuse=False):
            self.target_critic_net_value = tf.stop_gradient(ANN.generate_critic_network(self,trainable = False, reuse = False,mode=3))
        # isolate vars for each network
        
        self.actor_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.target_actor_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
        self.critic_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
        self.target_critic_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')

         # actor network
    def predict_graph(self):
        return self.actor_net_value
    def generate_actor_network(self,trainable, reuse):
        hidden = tf.layers.dense(ANN.state_ph, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
        hidden_2 = tf.layers.dense(hidden, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
        hidden_3 = tf.layers.dense(hidden_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
        actions_unscaled = tf.layers.dense(hidden_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
        actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low) # bound the actions to the valid range
        return actions

   
    # will use this to initialize both the critic network its slowly-changing target network with same structure
    def generate_critic_network(self,trainable, reuse,mode=1):
        if mode==1:
            state_action = tf.concat([ANN.state_ph, ANN.action_ph], axis=1)
        if mode==2:
            state_action = tf.concat([ANN.state_ph,self.actor_net_value], axis=1)
        if mode==3:
            state_action = tf.concat([ANN.next_state_ph,self.target_actor_net_value], axis=1)
        hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
        hidden_2 = tf.layers.dense(hidden, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
        hidden_3 = tf.layers.dense(hidden_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
        q_values = tf.layers.dense(hidden_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
        return q_values
    def train_graph(self):
        # One step TD targets y_i for (s,a) from experience replay
        # = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
        # = r_i if s' terminal
        updated_q_values = tf.expand_dims(ANN.reward_ph, 1) + tf.expand_dims(ANN.is_not_terminal_ph, 1) * gamma * self.target_critic_net_value

        # 1-step temporal difference errors
        td_errors = updated_q_values - self.critic_net_value

        # critic loss function (mean-square value error with regularization)
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in self.critic_net_vars:
            if not 'bias' in var.name:
                critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)

        # critic optimizer
        critic_train_op = tf.train.AdamOptimizer(lr_critic).minimize(critic_loss)

        # actor loss function (mean Q-values under current policy with regularization)
        actor_loss = -1*tf.reduce_mean(self.q_value_for_actor_net)
        for var in self.actor_net_vars:
            if not 'bias' in var.name:
                actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)

        # actor optimizer
        # the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
        actor_train_op = tf.train.AdamOptimizer(lr_actor).minimize(actor_loss, var_list=self.actor_net_vars)
        return actor_train_op,critic_train_op
        
    def update_wts_graph(self):
            # update values for slowly-changing targets towards current actor and critic
            update_slow_target_ops = []
            for i, target_actor_var in enumerate(self.target_actor_net_vars):
                update_slow_target_actor_op = target_actor_var.assign(tau*self.actor_net_vars[i]+(1-tau)*target_actor_var)
                update_slow_target_ops.append(update_slow_target_actor_op)

            for i, slow_target_var in enumerate(self.target_critic_net_vars):
                update_slow_target_critic_op = slow_target_var.assign(tau*self.critic_net_vars[i]+(1-tau)*slow_target_var)
                update_slow_target_ops.append(update_slow_target_critic_op)

            update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
            return update_slow_targets_op 

In [4]:
Model=ANN()
actor_net_value=Model.predict_graph()
actor_train_op,critic_train_op=Model.train_graph()
update_wts_op=Model.update_wts_graph()

In [5]:



# initialize session
sess = tf.Session()	
sess.run(tf.global_variables_initializer())



In [6]:
#####################################################################################################
## Training

total_steps = 0
for ep in range(num_episodes):

	total_reward = 0
	steps_in_ep = 0

	# Initialize exploration noise process
	noise_process = np.zeros(action_dim)
	noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)

	# Initial state
	observation = env.reset()
	#if ep%10 == 0: env.render()
    
	for t in range(max_steps_ep):

		# choose action based on deterministic policy
		action_for_state, = sess.run(actor_net_value, 
			feed_dict = {Model.state_ph: observation[None]})

		# add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process)
		# print(action_for_state)
		noise_process = exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
		# print(noise_scale*noise_process)
		action_for_state += noise_scale*noise_process

		# take step
		next_observation, reward, done, _info = env.step(action_for_state)
		#if ep%10 == 0: env.render()
		total_reward += reward

		add_to_memory((observation, action_for_state, reward, next_observation, 
			# is next_observation a terminal state?
			# 0.0 if done and not env.env._past_limit() else 1.0))
			0.0 if done else 1.0))

		# update network weights to fit a minibatch of experience
		if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:

			# grab N (s,a,r,s') tuples from replay memory
			minibatch = sample_from_memory(minibatch_size)

			# update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
			_, _ = sess.run([critic_train_op, actor_train_op], 
				feed_dict = {
					Model.state_ph: np.asarray([elem[0] for elem in minibatch]),
					Model.action_ph: np.asarray([elem[1] for elem in minibatch]),
					Model.reward_ph: np.asarray([elem[2] for elem in minibatch]),
					Model.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
					Model.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
					})

			# update slow actor and critic targets towards current actor and critic
			_ = sess.run(update_wts_op)

		observation = next_observation
		total_steps += 1
		steps_in_ep += 1
		
		if done: 
			# Increment episode counter
			_ = sess.run(Model.episode_inc_op)
			break
		
	print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale))

# Finalize and upload results
env.close()
gym.upload(outdir)

Episode  0, Reward: -1647.251, Steps: 200, Final noise scale:   0.400
Episode  1, Reward: -957.547, Steps: 200, Final noise scale:   0.396
Episode  2, Reward: -1512.650, Steps: 200, Final noise scale:   0.392
Episode  3, Reward: -1469.922, Steps: 200, Final noise scale:   0.388
Episode  4, Reward: -1663.252, Steps: 200, Final noise scale:   0.384
Episode  5, Reward: -1586.491, Steps: 200, Final noise scale:   0.380
Episode  6, Reward: -1458.635, Steps: 200, Final noise scale:   0.377
Episode  7, Reward: -1390.783, Steps: 200, Final noise scale:   0.373
Episode  8, Reward: -1204.401, Steps: 200, Final noise scale:   0.369
Episode  9, Reward: -1532.131, Steps: 200, Final noise scale:   0.365
Episode 10, Reward: -1503.814, Steps: 200, Final noise scale:   0.362
Episode 11, Reward: -1177.988, Steps: 200, Final noise scale:   0.358
Episode 12, Reward: -1163.216, Steps: 200, Final noise scale:   0.355
Episode 13, Reward: -905.482, Steps: 200, Final noise scale:   0.351
Episode 14, Reward: -6

Episode 119, Reward: -1089.488, Steps: 200, Final noise scale:   0.121
Episode 120, Reward: -251.479, Steps: 200, Final noise scale:   0.120
Episode 121, Reward: -134.164, Steps: 200, Final noise scale:   0.119
Episode 122, Reward: -138.877, Steps: 200, Final noise scale:   0.117
Episode 123, Reward: -267.235, Steps: 200, Final noise scale:   0.116
Episode 124, Reward: -133.958, Steps: 200, Final noise scale:   0.115
Episode 125, Reward: -264.998, Steps: 200, Final noise scale:   0.114
Episode 126, Reward: -136.956, Steps: 200, Final noise scale:   0.113
Episode 127, Reward: -136.275, Steps: 200, Final noise scale:   0.112
Episode 128, Reward: -131.482, Steps: 200, Final noise scale:   0.111
Episode 129, Reward: -234.409, Steps: 200, Final noise scale:   0.109
Episode 130, Reward: -10.195, Steps: 200, Final noise scale:   0.108
Episode 131, Reward: -251.501, Steps: 200, Final noise scale:   0.107
Episode 132, Reward: -133.757, Steps: 200, Final noise scale:   0.106
Episode 133, Reward:

NameError: name 'outdir' is not defined

In [None]:
env.close()