In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import gym
import six
import matplotlib
import matplotlib.pyplot as plt

import tensorflow as tf

# Environment

In [2]:
env_name = 'Pendulum-v0'
env = gym.make(env_name)

# Observation Spec
print("Observation Shape={}".format(env.observation_space.shape))
print("Observation Range (Low)={}".format(env.observation_space.low))
print("Observation Range (High)={}".format(env.observation_space.high))
print("\r")

# Action Spec
print("Action Shape={}".format(env.action_space.shape))
print("Action Range (Low)={}".format(env.action_space.low))
print("Action Range (High)={}".format(env.action_space.high))

Observation Shape=(3,)
Observation Range (Low)=[-1. -1. -8.]
Observation Range (High)=[1. 1. 8.]

Action Shape=(1,)
Action Range (Low)=[-2.]
Action Range (High)=[2.]


Observation Spec
(3,)
[-1. -1. -8.]
[1. 1. 8.]

Action Spec
(1,)
[-2.]
[2.]

# Actor Network

In [3]:
class ActorNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Actor"):
        super(ActorNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        hidden_units = 64
        
        # Layers
        self._layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._output = tf.keras.layers.Dense(action_spec.shape[0])

    def call(self, observation):
        output = self._layer1(observation)
        output = self._layer2(output)
        output = self._output(output)
        return output
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1,self._observation_spec.shape[0]), 
                                             self._observation_spec.low, 
                                             self._observation_spec.high)
            action = self.__call__(random_observation)
            print(action)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [4]:
actor_network = ActorNetwork(env.observation_space, env.action_space)
actorVariables = actor_network.variables

tf.Tensor([[-0.46198443]], shape=(1, 1), dtype=float32)


# Critic Network

In [5]:
class CriticNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Critic"):
        super(CriticNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        hidden_units = 64
        
        # Observation Layers
        self._observation_layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._observation_layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Action Layers
        self._action_layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Merged Layers
        self._merge_layer1 = tf.keras.layers.Add()
        self._merge_layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Output Layer: Q-value for the action taken based on this observation.
        self._output = tf.keras.layers.Dense(1, activation='relu')

    def call(self, observation, action):
        obs_output = self._observation_layer1(observation)
        obs_output = self._observation_layer2(obs_output)
        
        action_output = self._action_layer1(action)
        
        output = self._merge_layer1([obs_output, action_output])
        output = self._merge_layer2(output)
        q = self._output(output)
        return q
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation and action to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1, self._observation_spec.shape[0]), 
                                             self._observation_spec.low, 
                                             self._observation_spec.high)
            random_action = tf.random.uniform((1, self._action_spec.shape[0]), 
                                              self._action_spec.low, 
                                              self._action_spec.high)

            q = self.__call__(random_observation, random_action)
            print(q)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [6]:
critic_network = CriticNetwork(env.observation_space, env.action_space)
criticVariables = critic_network.variables

tf.Tensor([[0.]], shape=(1, 1), dtype=float32)


# Agent

In [7]:
class ActorCriticAgent:
    def __init__(self, 
                 observation_spec, 
                 action_spec, 
                 actor_network,
                 critic_network, 
                 actor_learning_rate = 3e-4, 
                 critic_learning_rate = 3e-4, 
                 gamma = 0.99): 
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate)
        
        self._actor_network = actor_network
        self._target_actor_network = actor_network.copy("actor-target")
        
        self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate)
        self._critic_network = critic_network
        self._target_critic_network = critic_network.copy("critic-target")
        
        self._gamma = gamma
        
    def to_tensors(batch):
        no_state = np.zeros(self.observation_size)

        observations = np.array([ x[0] for x in batch ])
        actions = np.array([ x[1] for x in batch ])
        rewards = np.array([ x[2] for x in batch ])
        next_observations = np.array([ (no_state if x[4] is True else x[3]) for x in batch ])

    def train_test(self):
        batch_size = 32
        
        observations = tf.random.uniform((batch_size, self._observation_spec.shape[0]), 
                                         self._observation_spec.low, 
                                         self._observation_spec.high)
        
        actions = tf.random.uniform((batch_size, self._action_spec.shape[0]), 
                                    self._action_spec.low, 
                                    self._action_spec.high)
        
        next_observations = tf.random.uniform((batch_size, self._observation_spec.shape[0]), 
                                              self._observation_spec.low, 
                                              self._observation_spec.high)
        
        rewards = tf.constant(1, shape=(batch_size,1), dtype=tf.float32)
        
        loss = self.train_batch(observations, actions, rewards, next_observations)
        
        print(loss)
        
    def train_batch(self, observations, actions, rewards, next_observations): 
        
        # Compute Critic Loss and apply gradients.
        critic_variables = self._critic_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(critic_variables)
            critic_loss = self.critic_loss(observations, actions, rewards, next_observations)
        
        critic_grads = tape.gradient(critic_loss, critic_variables)
        critic_grads_and_vars = tuple(zip(critic_grads, critic_variables))
        self._critic_optimizer.apply_gradients(critic_grads_and_vars)
        
        # Compute Actor Loss and apply gradients
        actor_variables = self._actor_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(actor_variables)
            actor_loss = self.actor_loss(observations)
            
        actor_grads = tape.gradient(actor_loss, actor_variables)
        actor_grads_and_vars = tuple(zip(actor_grads, actor_variables))
        self._actor_optimizer.apply_gradients(actor_grads_and_vars)

        # Increment counter and conditionally target networks.
        #self.train_step_counter.assign_add(1)
        #self._update_target()
        
        total_loss = actor_loss + critic_loss
        return total_loss
        

    def critic_loss(self, observations, actions, rewards, next_observations):        
        with tf.name_scope('critic_loss'):
            target_actions = self._target_actor_network(observations)

            target_q_values = self._target_critic_network(next_observations, target_actions)

            td_targets = tf.stop_gradient(rewards + self._gamma * target_q_values)

            q_values = self._critic_network(observations, actions)

            critic_loss = tf.compat.v1.losses.mean_squared_error(td_targets, q_values)

            critic_loss = tf.reduce_mean(critic_loss)

            return critic_loss
                
    def actor_loss(self, observations):
        with tf.name_scope('actor_loss'):
            actions = self._actor_network(observations)
            
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(actions)
                q_values = self._critic_network(observations, actions)

            dqdas = tape.gradient([q_values], actions)

            actor_losses = []
            
            for dqda, action in zip(dqdas, actions):
                loss = tf.compat.v1.losses.mean_squared_error(tf.stop_gradient(dqda + action), 
                                                              action,
                                                             reduction=tf.compat.v1.losses.Reduction.NONE)
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)
            
            return actor_loss

In [8]:
agent = ActorCriticAgent(env.observation_space, 
                         env.action_space, 
                         actor_network, 
                         critic_network)

In [9]:
agent.train_test()

tf.Tensor(1.7272506, shape=(), dtype=float32)
