In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment

from tf_agents.networks import network

from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy

from tf_agents.agents.sac import sac_agent

from tf_agents.drivers import dynamic_step_driver

from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step


from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

from tf_agents.specs import tensor_spec
from tf_agents.specs import distribution_spec
from tf_agents.distributions import tanh_bijector_stable

tf.compat.v1.enable_v2_behavior()

In [2]:
# Gym environment id; loaded twice below (once for training, once for evaluation)
# and also used as the directory prefix for saved/loaded weights.
env_name = "Pendulum-v0" 
#env_name = "LunarLanderContinuous-v2" 

# Number of passes through the training loop; each pass runs the collect driver
# once and performs one agent.train call.
num_iterations = 20000 

# Steps gathered with a random policy to seed the replay buffer before training.
initial_collect_steps = 1000  
# Environment steps the collect driver takes on every training iteration.
collect_steps_per_iteration = 1 
# Passed to the replay buffer as max_length.
replay_buffer_max_length = 25000 

# Passed to replay_buffer.as_dataset as sample_batch_size.
batch_size = 256 

# Learning rates for the three Adam optimizers handed to SacAgent.
critic_learning_rate = 1e-3 
actor_learning_rate = 1e-3 
alpha_learning_rate = 1e-3 
# The remaining values in this group are forwarded to SacAgent unchanged.
target_update_tau = 0.005 
target_update_period = 1 
gamma = 0.99 
reward_scale_factor = 1.0 
gradient_clipping = None 

# NOTE(review): these two tuples are not referenced by any code visible in this
# notebook; the network classes are built with their own hidden_units default.
actor_fc_layer_params = (256,)
critic_joint_fc_layer_params = (256,)

# The training loss is printed whenever the environment step count is a
# multiple of this value.
log_interval = 200 

# Episodes averaged per evaluation, and the step interval between evaluations.
num_eval_episodes = 10 
eval_interval = 2000 
# NOTE(review): not referenced by any code visible in this notebook
# (suite_gym.load is called with the environment name only).
max_episode_steps = 1000

# When True, previously saved actor/critic weights are loaded after the agent is built.
load_weights = False

In [3]:
# Two independent instances of the same Gym environment: one for collecting
# training experience, one reserved for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# Wrap the Python environments so they can be driven with TensorFlow policies/drivers.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Specs taken from the training environment; used below to build the networks and agent.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

In [4]:
class SimpleActorDistributionNetwork(network.DistributionNetwork):
    """Actor network with one hidden layer that outputs a transformed Normal.

    The observation goes through a single ReLU ``Dense`` layer. Two linear
    heads on top of it produce the location and the (clipped, exponentiated)
    scale of a ``Normal`` distribution, which is then wrapped in a
    ``TransformedDistribution`` using a Shift/Scale/Tanh bijector chain built
    from the action spec.

    Args:
        input_tensor_spec: Spec of the observations fed to the network.
        output_tensor_spec: Spec of the actions the distribution is defined over.
        hidden_units: Width of the hidden ``Dense`` layer.
        name: Network name; also used as a prefix for the parameter specs.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 hidden_units = 256,
                 name="ActorNormalDistributionNetwork"):
        # The distribution spec is computed first and handed to the base class.
        # Its builder only touches self._bijector_chain when it is invoked,
        # which happens after the chain has been assigned further down.
        super(SimpleActorDistributionNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            output_spec=self._output_distribution_spec(output_tensor_spec, name),
            name=name)

        action_dims = output_tensor_spec.shape.num_elements()
        projection_init_scale = 0.1
        scale_head_bias = 0.0

        # Shared encoder for both projection heads.
        self._hidden_layer = tf.keras.layers.Dense(
            units=hidden_units,
            activation=tf.keras.activations.relu,
            kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
            input_shape=input_tensor_spec.shape)

        # Linear head producing the Normal's loc.
        self._means_projection_layer = tf.keras.layers.Dense(
            units=action_dims,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                scale=projection_init_scale),
            bias_initializer=tf.keras.initializers.Zeros(),
            name='means_projection_layer')

        # Linear head whose output is clipped and exponentiated in call() to
        # obtain the Normal's scale.
        self._stddev_projection_layer = tf.keras.layers.Dense(
            units=action_dims,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                scale=projection_init_scale),
            bias_initializer=tf.keras.initializers.Constant(
                value=scale_head_bias),
            name='stddev_projection_layer')

        # Bijectors that map samples into the action spec's range.
        # NOTE(review): Chain is expected to apply the list right-to-left
        # (Tanh, then Scale, then Shift) -- confirm against the TFP docs.
        spec_means, spec_magnitudes = common.spec_means_and_magnitudes(
            output_tensor_spec)
        self._bijector_chain = tfp.bijectors.Chain([
            tfp.bijectors.Shift(spec_means),
            tfp.bijectors.Scale(spec_magnitudes),
            tanh_bijector_stable.Tanh(),
        ])

    def _output_distribution_spec(self, sample_spec, network_name):
        """Build the DistributionSpec describing this network's output.

        Args:
            sample_spec: Action spec; supplies the shape and dtype of the
                distribution parameters.
            network_name: Prefix for the names of the parameter specs.

        Returns:
            A ``DistributionSpec`` whose builder creates a ``Normal`` wrapped
            in a ``TransformedDistribution`` with this network's bijector chain.
        """
        param_shapes = tfp.distributions.Normal.param_static_shapes(
            sample_spec.shape)

        param_specs = {}
        for param_name, param_shape in param_shapes.items():
            param_specs[param_name] = tensor_spec.TensorSpec(
                shape=param_shape,
                dtype=sample_spec.dtype,
                name=network_name + '_' + param_name)

        def distribution_builder(*args, **kwargs):
            base_normal = tfp.distributions.Normal(*args, **kwargs)
            return tfp.distributions.TransformedDistribution(
                distribution=base_normal,
                bijector=self._bijector_chain)

        return distribution_spec.DistributionSpec(
            distribution_builder, param_specs, sample_spec=sample_spec)

    def call(self, observations, step_type, network_state, training=False):
        """Map observations to an action distribution.

        Returns:
            A ``(distribution, network_state)`` tuple; ``network_state`` is
            passed through unchanged.
        """
        hidden = self._hidden_layer(observations, training=training)

        loc = self._means_projection_layer(hidden, training=training)

        # The head output is clipped to [-20, 2] before exponentiation, which
        # keeps the resulting scale strictly positive and bounded.
        raw_scale = self._stddev_projection_layer(hidden, training=training)
        scale = tf.exp(tf.clip_by_value(raw_scale, -20, 2))

        action_distribution = self.output_spec.builder(loc=loc, scale=scale)
        return action_distribution, network_state


In [5]:
class SimpleActorPolicy(tf_policy.Base):
    """Policy that returns the actor network's output distribution as-is.

    Args:
        time_step_spec: Spec of the time steps the policy receives.
        action_spec: Spec of the actions the policy emits.
        actor_network: Network called with (observation, step_type, state)
            that returns a (distribution, state) pair.
        training: Forwarded to the network as its ``training`` flag on every call.
    """

    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 training=False):
        # Ask the network to create its variables before the policy is set up.
        actor_network.create_variables()
        self._actor_network = actor_network
        self._training = training

        super(SimpleActorPolicy, self).__init__(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            policy_state_spec=actor_network.state_spec)

    def _variables(self):
        """Return the variables of the wrapped actor network."""
        return self._actor_network.variables

    def _distribution(self, time_step, policy_state):
        """Run the actor network on ``time_step`` and wrap the result in a PolicyStep."""
        network_output = self._actor_network(
            time_step.observation,
            time_step.step_type,
            policy_state,
            training=self._training)
        action_distribution, next_policy_state = network_output
        return policy_step.PolicyStep(action_distribution, next_policy_state)

In [18]:
# Critic
class SimpleCriticNetwork(network.Network):
    """Critic with one ReLU hidden layer over the concatenated observation and action.

    Args:
        input_tensor_spec: Spec of the network input; ``call`` unpacks the
            input as an ``(observations, actions)`` pair.
        hidden_units: Width of the hidden ``Dense`` layer.
        name: Network name.
    """

    def __init__(self,
                 input_tensor_spec,
                 hidden_units = 256,
                 name='SimpleCriticNetwork'):
        super(SimpleCriticNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name=name)

        make_glorot = tf.compat.v1.keras.initializers.glorot_uniform

        # Encoder applied to the joint observation/action vector.
        self._hidden_layer = tf.keras.layers.Dense(
            units=hidden_units,
            activation=tf.keras.activations.relu,
            kernel_initializer=make_glorot())

        # Single linear output unit.
        self._value = tf.keras.layers.Dense(
            units=1,
            activation=None,
            kernel_initializer=make_glorot(),
            name='value')

    def call(self, inputs, step_type=(), network_state=(), training=False):
        """Return one value per batch row together with the unchanged network state."""
        observations, actions = inputs
        # Observations and actions are joined along axis 1.
        joint_input = tf.concat([observations, actions], axis=1)
        hidden = self._hidden_layer(joint_input, training=training)
        q_values = self._value(hidden, training=training)
        # Flatten the trailing unit dimension so the output is rank 1.
        return tf.reshape(q_values, [-1]), network_state

In [19]:
# Both networks are built with their default hidden_units; the critic's input
# spec is the (observation, action) pair that its call() unpacks.
actor_net = SimpleActorDistributionNetwork(observation_spec, action_spec)
critic_net = SimpleCriticNetwork((observation_spec, action_spec))

In [20]:
# Agent
# Build the SAC agent from the custom actor/critic networks defined above.
# The global step doubles as the agent's train step counter.
global_step = tf.compat.v1.train.get_or_create_global_step()
agent = sac_agent.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    # Separate Adam optimizers (and learning rates) for actor, critic and alpha.
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    # Use the custom policy class so the agent's policies wrap actor_net directly.
    actor_policy_ctor=SimpleActorPolicy,
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
agent.initialize()

In [21]:
# Load Weights
def load_model_weights():
    """Restore the actor and both critics from checkpoints under ``./<env_name>/``.

    The paths mirror the ones the training loop saves to:
    ``actor/saved_actor``, ``critic1/saved_critic`` and ``critic2/saved_critic``.

    After the online networks are restored, ``agent.initialize()`` is called
    again so the target critics are re-synchronised with the restored critics.
    Without this the targets would keep the values they were given when the
    agent was first initialised, i.e. copies of the untrained critics.

    NOTE(review): only the actor and critic weights are restored here; the
    entropy coefficient and optimizer state are not part of these checkpoints.
    """
    agent._actor_network.load_weights("./{}/actor/saved_actor".format(env_name))
    agent._critic_network_1.load_weights("./{}/critic1/saved_critic".format(env_name))
    agent._critic_network_2.load_weights("./{}/critic2/saved_critic".format(env_name))
    # Copy the restored critic weights into the target critics.
    agent.initialize()

if load_weights:
    load_model_weights()
    # Report success only after the load has actually completed.
    print("Loaded Weights")

In [22]:
# Create the replay buffer for training.
# Its data spec comes from the agent and its batch size from the training
# environment, so it can accept what the drivers below emit.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Collect some random samples to start.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# One-off driver: runs the random policy for initial_collect_steps steps and
# writes each step into the replay buffer. No metrics are attached here, so
# these steps are not counted by env_steps / num_episodes below.
dynamic_step_driver.DynamicStepDriver(
    train_env, 
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps).run()

# Create collection driver
# Metrics observe the same trajectories that are written to the replay buffer.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

# Despite the name, collect_op is a driver object; the training loop calls
# collect_op.run() once per iteration.
collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env, 
    agent.collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration)

# Create a data set for the training loop
# NOTE(review): num_steps=2 presumably makes each sampled item a pair of
# adjacent steps (current and next) for the agent's update -- confirm against
# the as_dataset documentation.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

# The training loop pulls one batch per iteration from this iterator.
iterator = iter(dataset)

In [23]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Roll out ``policy`` for ``num_episodes`` episodes and average the returns.

    Args:
        environment: Object exposing ``reset()`` and ``step(action)``, both
            returning a time step with ``is_last()`` and ``reward``.
        policy: Object whose ``action(time_step)`` result has an ``action`` field.
        num_episodes: Number of episodes to run.

    Returns:
        The first element of the averaged return after ``.numpy()`` conversion.
    """
    def run_single_episode():
        # Sum the rewards of one episode, from reset until the final time step.
        current_step = environment.reset()
        collected = 0.0
        while not current_step.is_last():
            chosen = policy.action(current_step)
            current_step = environment.step(chosen.action)
            collected += current_step.reward
        return collected

    summed_returns = 0.0
    for _episode in range(num_episodes):
        summed_returns += run_single_episode()

    return (summed_returns / num_episodes).numpy()[0]

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
# NOTE(review): re-running this cell wraps agent.train again on top of the
# previous wrapper; restart the kernel before re-running.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
# This baseline is the first entry of `returns`, so weights are only saved once
# an evaluation beats the untrained policy.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_op.run()
    
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Logging and evaluation are keyed on environment steps taken by the
    # collect driver, not on the loop index.
    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
        # Checkpoint only when this evaluation is strictly better than every
        # previous one (the comparison happens before the new value is appended).
        if (avg_return > max(returns)):
            # Save Weights
            print("Save Weights: avg_return={}; max_return={}".format(avg_return, np.amax(returns)))
            agent._actor_network.save_weights("./{}/actor/saved_actor".format(env_name))
            agent._critic_network_1.save_weights("./{}/critic1/saved_critic".format(env_name))
            agent._critic_network_2.save_weights("./{}/critic2/saved_critic".format(env_name))
        returns.append(avg_return)


avg_return=-1351.9837646484375; max_return=-1351.9837646484375
step = 200: loss = 10.101609230041504
step = 400: loss = 14.692626953125
step = 600: loss = 18.98578453063965
step = 800: loss = 20.074172973632812
step = 1000: loss = 27.197154998779297
step = 1200: loss = 31.086673736572266
step = 1400: loss = 45.417564392089844
step = 1600: loss = 56.09059143066406
step = 1800: loss = 70.34383392333984
step = 2000: loss = 88.80768585205078
step = 2000: episodes=10: Average Return = -979.494140625
Save Weights: avg_return=-979.494140625; max_return=-1351.9837646484375
step = 2200: loss = 76.24503326416016
step = 2400: loss = 101.06133270263672
step = 2600: loss = 68.80683135986328
step = 2800: loss = 114.81466674804688
step = 3000: loss = 84.49185943603516
step = 3200: loss = 87.3978500366211
step = 3400: loss = 87.2748031616211
step = 3600: loss = 131.154296875
step = 3800: loss = 94.44224548339844
step = 4000: loss = 153.72279357910156
step = 4000: episodes=20: Average Return = -275.071

In [None]:
# Run 10 evaluation episodes with the agent's policy, rendering every step and
# printing each episode's summed reward.
for i in range (10):
    rewards = 0.0
    time_step = eval_env.reset()
    while not time_step.is_last():
        action_step = agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        rewards += time_step.reward
        # Render through the underlying Python environment that eval_env wraps.
        eval_py_env.render()
    # `rewards` is a tensor once a reward has been added, so this prints its tensor repr.
    print(rewards)

tf.Tensor([-2.9839423], shape=(1,), dtype=float32)
tf.Tensor([-125.76819], shape=(1,), dtype=float32)
tf.Tensor([-3.0254204], shape=(1,), dtype=float32)
tf.Tensor([-246.59395], shape=(1,), dtype=float32)
tf.Tensor([-355.1019], shape=(1,), dtype=float32)
