In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.policies import actor_policy
from tf_agents.environments import suite_gym
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# Turn on TensorFlow 2.x behavior through the compat.v1 switch. It sits right
# after the imports so it runs before any of the cells below build TF objects.
tf.compat.v1.enable_v2_behavior()

In [2]:
# Gym environment id. It is used to build the train/eval environments and is
# also interpolated into the "./<env_name>/..." weight checkpoint paths.
env_name = "Pendulum-v0" 
# Alternative continuous-control task; uncomment to switch environments.
#env_name = "LunarLanderContinuous-v2" 

# Number of collect-then-train iterations executed by the training loop.
num_iterations = 20000 # @param {type:"integer"}

# Data collection: steps gathered with the random policy before training,
# steps gathered per training iteration, and replay buffer capacity.
initial_collect_steps = 1000 # @param {type:"integer"} 
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_max_length = 25000 # @param {type:"integer"}

# Number of items sampled from the replay buffer for each training step.
batch_size = 256 # @param {type:"integer"}

# Values forwarded to the SacAgent constructor: one learning rate per Adam
# optimizer, followed by the agent's own keyword arguments of the same names.
critic_learning_rate = 1e-3 # @param {type:"number"}
actor_learning_rate = 1e-3 # @param {type:"number"}
alpha_learning_rate = 1e-3 # @param {type:"number"}
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}
gradient_clipping = None # @param

# NOTE(review): actor_fc_layer_params is not referenced by the visible cells;
# the custom actor network is built with its own hidden_units default.
actor_fc_layer_params = (256,)
# Joint fully connected layer sizes passed to the critic network.
critic_joint_fc_layer_params = (256,)

# Print the training loss whenever the environment-step count is a multiple of this.
log_interval = 200 # @param {type:"integer"}

# Evaluation: episodes averaged per evaluation, and the environment-step
# interval between evaluations.
num_eval_episodes = 10 # @param {type:"integer"}
eval_interval = 2000 # @param {type:"integer"}
# NOTE(review): max_episode_steps is not referenced by the visible cells.
max_episode_steps = 1000

# When True, previously saved actor/critic weights are restored before training.
load_weights = False

In [3]:
# Two independent Gym instances of the same task: one is stepped while
# collecting training data, the other is used for evaluation rollouts and
# rendering. Each is wrapped so it can be driven with TF policies.
train_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

eval_py_env = suite_gym.load(env_name)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Specs are taken from the training environment and reused by later cells.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

# Label and value on separate lines, one print call per spec.
print('Observation Spec:', observation_spec, sep='\n')
print('Action Spec:', action_spec, sep='\n')

Observation Spec:
BoundedTensorSpec(shape=(3,), dtype=tf.float32, name='observation', minimum=array([-1., -1., -8.], dtype=float32), maximum=array([1., 1., 8.], dtype=float32))
Action Spec:
BoundedTensorSpec(shape=(1,), dtype=tf.float32, name='action', minimum=array(-2., dtype=float32), maximum=array(2., dtype=float32))


In [4]:
# Critic
# Takes the (observation, action) spec pair as its input spec. Both per-input
# layer lists are None, so the only layer sizes configured here are the joint
# ones from critic_joint_fc_layer_params.
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec),
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params)

In [5]:
from tf_agents.networks import network
from tf_agents.specs import tensor_spec
from tf_agents.specs import distribution_spec
from tf_agents.distributions import utils as distribution_utils

def std_clip_transform(stddevs, log_std_min=-20, log_std_max=2):
    """Turn raw log-standard-deviation outputs into positive standard deviations.

    Every tensor in `stddevs` is clipped to [log_std_min, log_std_max] and then
    exponentiated, so each result lies in [exp(log_std_min), exp(log_std_max)].

    Args:
        stddevs: A tensor, or a nest (list/tuple/dict) of tensors, holding
            unbounded log-standard-deviation values.
        log_std_min: Lower clip bound applied before exponentiation.
        log_std_max: Upper clip bound applied before exponentiation.

    Returns:
        The transformed values in the same structure as `stddevs`.
    """
    def _clip_and_exp(log_std):
        return tf.exp(tf.clip_by_value(log_std, log_std_min, log_std_max))

    # Exponentiate leaf by leaf. Calling tf.exp on the whole nest would either
    # fail (dict) or collapse the structure into one stacked tensor (list).
    return tf.nest.map_structure(_clip_and_exp, stddevs)


class ActorNormalDistributionNetwork(network.DistributionNetwork):
    """Actor network with one hidden layer that emits a Normal action distribution.

    Observations pass through a single ReLU dense layer. Two linear heads then
    produce the mean and the (clipped, exponentiated) standard deviation of a
    Normal distribution, which is rescaled to the action spec by
    `scale_distribution_to_spec`.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 hidden_units=256,
                 name="ActorNormalDistributionNetwork"):
        """Creates the hidden layer and the two projection heads.

        Args:
            input_tensor_spec: Spec of the observations fed to the network.
            output_tensor_spec: Spec of the actions the distribution must cover.
            hidden_units: Width of the single hidden dense layer.
            name: Network name; also used as a prefix for the parameter specs.
        """
        # The distribution spec is built first because the base-class
        # constructor needs it as `output_spec`.
        super().__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            output_spec=self._output_distribution_spec(output_tensor_spec, name),
            name=name)

        action_dims = output_tensor_spec.shape.num_elements()
        projection_init_scale = 0.1
        stddev_bias_init = 0.0

        def projection_initializer():
            # Each head gets its own initializer object.
            return tf.compat.v1.keras.initializers.VarianceScaling(
                scale=projection_init_scale)

        # Shared encoder for both heads.
        self._hidden_layer = tf.keras.layers.Dense(
            units=hidden_units,
            activation=tf.keras.activations.relu,
            kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
            input_shape=input_tensor_spec.shape)

        # Linear head for the distribution means.
        self._means_projection_layer = tf.keras.layers.Dense(
            units=action_dims,
            activation=None,
            kernel_initializer=projection_initializer(),
            bias_initializer=tf.keras.initializers.Zeros(),
            name='means_projection_layer')

        # Linear head for the pre-transform standard deviations.
        self._stddev_projection_layer = tf.keras.layers.Dense(
            units=action_dims,
            activation=None,
            kernel_initializer=projection_initializer(),
            bias_initializer=tf.keras.initializers.Constant(
                value=stddev_bias_init),
            name='stddev_projection_layer')

    def _output_distribution_spec(self, sample_spec, network_name):
        """Builds the DistributionSpec describing this network's output.

        Args:
            sample_spec: Action spec the produced distribution is scaled to.
            network_name: Prefix for the names of the parameter tensor specs.

        Returns:
            A DistributionSpec whose builder creates a Normal distribution and
            passes it through `scale_distribution_to_spec`.
        """
        param_shapes = tfp.distributions.Normal.param_static_shapes(
            sample_spec.shape)

        # One TensorSpec per Normal parameter, named "<network>_<param>".
        input_param_spec = {}
        for param_name, param_shape in param_shapes.items():
            input_param_spec[param_name] = tensor_spec.TensorSpec(
                shape=param_shape,
                dtype=sample_spec.dtype,
                name=network_name + '_' + param_name)

        def distribution_builder(*args, **kwargs):
            return distribution_utils.scale_distribution_to_spec(
                tfp.distributions.Normal(*args, **kwargs), sample_spec)

        return distribution_spec.DistributionSpec(
            distribution_builder, input_param_spec, sample_spec=sample_spec)

    def call(self, observations, step_type, network_state, training=False):
        """Maps observations to an action distribution.

        Args:
            observations: Observation tensor fed to the hidden layer.
            step_type: Accepted for interface compatibility; not used here.
            network_state: Returned unchanged.
            training: Forwarded to every dense layer.

        Returns:
            A tuple `(action_distribution, network_state)`.
        """
        hidden = self._hidden_layer(observations, training=training)

        loc = self._means_projection_layer(hidden, training=training)

        # The head output is clipped and exponentiated to get a positive scale.
        scale = std_clip_transform(
            self._stddev_projection_layer(hidden, training=training))

        action_distribution = self.output_spec.build_distribution(
            loc=loc, scale=scale)
        return action_distribution, network_state

In [6]:
# Build the custom actor network from the environment specs; this same
# instance is later handed to the SAC agent.
actor_net = ActorNormalDistributionNetwork(observation_spec,
                                               action_spec)

# Smoke test only: wrap the network in a standalone policy to confirm it can
# produce a distribution and a sample. test_policy is not used by later cells.
test_policy = actor_policy.ActorPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    actor_network=actor_net,
    training=True)


time_step = train_env.reset()
# distribution() returns a PolicyStep whose `action` field is a distribution
# object (see the printed output), not a sampled action.
policy_step = test_policy.distribution(time_step)
print(policy_step)
action = policy_step.action
print(action)
# Draw one concrete action from that distribution.
sample = action.sample()
print(sample)

Instructions for updating:
`AffineScalar` bijector is deprecated; please use `tfb.Shift(loc)(tfb.Scale(...))` instead.
PolicyStep(action=<tfp.distributions.SquashToSpecNormal 'ActorNormalDistributionNetwork_SquashToSpecNormal' batch_shape=? event_shape=? dtype=float32>, state=(), info=())
tfp.distributions.SquashToSpecNormal("ActorNormalDistributionNetwork_SquashToSpecNormal", dtype=float32)
tf.Tensor([[1.8942659]], shape=(1, 1), dtype=float32)


In [7]:
# Agent
# Soft Actor-Critic agent built from the custom actor network and the critic
# defined above, with one Adam optimizer each for actor, critic and alpha.
global_step = tf.compat.v1.train.get_or_create_global_step()
agent = sac_agent.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    # Per-example squared TD error. The agent averages over the batch itself,
    # so the loss function must not reduce to a scalar first (as
    # tf.compat.v1.losses.mean_squared_error did). The resulting mean loss is
    # the same, but per-sample weighting now works as intended.
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
agent.initialize()

In [8]:
# Load Weights
def load_model_weights():
    """Restore the actor and both critic networks from their saved weights.

    Paths are rooted at "./<env_name>/" and match the locations the training
    loop writes to when it saves a new best model.
    """
    checkpoints = (
        (agent._actor_network, "./{}/actor/saved_actor"),
        (agent._critic_network_1, "./{}/critic1/saved_critic"),
        (agent._critic_network_2, "./{}/critic2/saved_critic"),
    )
    for net, path_template in checkpoints:
        net.load_weights(path_template.format(env_name))

# Restore saved weights only when requested in the configuration cell.
if load_weights:
    load_model_weights()
    # Report success only after every restore call has returned, so the
    # message cannot appear ahead of a failed load.
    print("Loaded Weights")

In [9]:
# Create the replay buffer for training
# It stores items matching the agent's collect_data_spec, uses the training
# environment's batch size, and holds at most replay_buffer_max_length entries.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Collect some random samples to start.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# One-off driver, run immediately for initial_collect_steps steps. Its only
# observer is the buffer, so the step/episode metrics created below do not
# count this warm-up data.
dynamic_step_driver.DynamicStepDriver(
    train_env, 
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps).run()

# Create collection driver
# The metrics are observers alongside the buffer; the training loop reads them
# to decide when to log and evaluate.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

# Not run here: the training loop calls collect_op.run() once per iteration to
# gather collect_steps_per_iteration steps with the agent's collect policy.
collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env, 
    agent.collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration)

# Create a data set for the training loop
# Each element holds batch_size samples of num_steps=2 steps; presumably these
# are adjacent steps forming one transition each -- confirm against the replay
# buffer documentation.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

# The training loop pulls one batch per iteration from this iterator.
iterator = iter(dataset)

In [None]:
def measure_distribution(timestep, policy, num_samples = 1000):
    """Estimate the mean and standard deviation of a policy's actions at one time step.

    The policy is queried `num_samples` times on the same `timestep`; every
    returned action is flattened and the statistics are taken across draws.

    Args:
        timestep: Time step passed unchanged to `policy.action` on every draw.
        policy: Object whose `action(timestep).action.numpy()` yields the
            sampled action as an array-like value.
        num_samples: Number of draws; must be at least 1.

    Returns:
        `(mean, std)`. For actions with a single component both are scalars.
        For actions with several components both are 1-D arrays holding one
        value per flattened action component.

    Raises:
        ValueError: If `num_samples` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(
            "num_samples must be at least 1, got {}".format(num_samples))

    # One flattened row per draw, so actions with more than one component fit
    # (writing a (1, k) array into a scalar slot only works for k == 1).
    draws = np.asarray(
        [np.ravel(policy.action(timestep).action.numpy())
         for _ in range(num_samples)],
        dtype=np.float64)

    if draws.shape[1] == 1:
        # Single-component actions keep the scalar return values.
        scalar_draws = draws[:, 0]
        return np.mean(scalar_draws), np.std(scalar_draws)
    return draws.mean(axis=0), draws.std(axis=0)

def compute_avg_return(environment, policy, num_episodes=10):
    """Average undiscounted episode return of `policy` on `environment`.

    Args:
        environment: Environment exposing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: Policy whose `action(time_step).action` is fed to the environment.
        num_episodes: Number of complete episodes to roll out.

    Returns:
        Element 0 of the averaged return after conversion with `.numpy()`.
    """
    def run_episode():
        # Play one episode from reset to the terminal step, summing rewards.
        step = environment.reset()
        collected = 0.0
        while True:
            if step.is_last():
                return collected
            step = environment.step(policy.action(step).action)
            collected += step.reward

    return_sum = 0.0
    for _ in range(num_episodes):
        return_sum += run_episode()

    return (return_sum / num_episodes).numpy()[0]

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
# This untrained result seeds `returns`, so the first checkpoint is written
# only once an evaluation beats it.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

# Main loop: one collection call followed by one agent update per iteration.
for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_op.run()
    
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Logging and evaluation are keyed on the environment-step metric fed by
    # the collect driver, not on agent.train_step_counter.
    # NOTE(review): if collect_steps_per_iteration does not divide
    # log_interval / eval_interval, the modulo checks below can skip intervals.
    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
        # Checkpoint only on a strict improvement over every earlier evaluation.
        if (avg_return > max(returns)):
            # Save Weights
            # These are the same paths load_model_weights() reads from. The
            # networks are reached through private attributes of the agent.
            print("Save Weights: avg_return={}; max_return={}".format(avg_return, np.amax(returns)))
            agent._actor_network.save_weights("./{}/actor/saved_actor".format(env_name))
            agent._critic_network_1.save_weights("./{}/critic1/saved_critic".format(env_name))
            agent._critic_network_2.save_weights("./{}/critic2/saved_critic".format(env_name))
        # Recorded after the comparison so the new value never competes with itself.
        returns.append(avg_return)


avg_return=-1174.628662109375; max_return=-1174.628662109375
step = 200: loss = 11.34770393371582
step = 400: loss = 11.721802711486816
step = 600: loss = 25.475940704345703
step = 800: loss = 26.16969871520996
step = 1000: loss = 24.063261032104492
step = 1200: loss = 39.975547790527344
step = 1400: loss = 70.51896667480469
step = 1600: loss = 57.419036865234375
step = 1800: loss = 57.44429016113281
step = 2000: loss = 53.252220153808594
step = 2000: episodes=10: Average Return = -999.21728515625
Save Weights: avg_return=-999.21728515625; max_return=-1174.628662109375
step = 2200: loss = 106.83100891113281
step = 2400: loss = 60.8735237121582
step = 2600: loss = 92.16670227050781
step = 2800: loss = 77.19509887695312
step = 3000: loss = 78.80830383300781
step = 3200: loss = 125.9784164428711
step = 3400: loss = 114.60777282714844
step = 3600: loss = 130.6441192626953
step = 3800: loss = 120.84144592285156
step = 4000: loss = 96.01883697509766
step = 4000: episodes=20: Average Return =

In [None]:
# Visual check: play 10 episodes with agent.policy on the evaluation
# environment, rendering every step and printing each episode's summed reward.
for i in range (10):
    rewards = 0.0
    time_step = eval_env.reset()
    while not time_step.is_last():
        action_step = agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        # Accumulates the per-step reward object returned by the TF environment.
        rewards += time_step.reward
        # Rendering goes through the underlying Python environment that eval_env wraps.
        eval_py_env.render()
    print(rewards)