In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.policies import actor_policy
from tf_agents.environments import suite_gym
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# Switch TensorFlow to its 2.x behaviour before any environment or network is
# built; later cells call `.numpy()` directly on the tensors they get back.
tf.compat.v1.enable_v2_behavior()

In [2]:
# Gym environment id handed to suite_gym.load().
env_name = "Pendulum-v0" 

# Number of collect-then-train iterations executed by the training loop.
num_iterations = 20000 # @param {type:"integer"}

# Steps gathered with a random policy before training starts.
initial_collect_steps = 1000 # @param {type:"integer"} 
# Steps gathered with the agent's collect policy on every iteration.
collect_steps_per_iteration = 1 # @param {type:"integer"}
# Passed to the replay buffer as max_length.
replay_buffer_max_length = 25000 # @param {type:"integer"}

# Passed to the replay-buffer dataset as sample_batch_size.
batch_size = 256 # @param {type:"integer"}

# Optimiser learning rates and the remaining SacAgent constructor settings.
critic_learning_rate = 1e-3 # @param {type:"number"}
actor_learning_rate = 1e-3 # @param {type:"number"}
alpha_learning_rate = 1e-3 # @param {type:"number"}
target_update_tau = 0.05 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}
gradient_clipping = None # @param

# Fully connected layer sizes for the actor and for the critic's joint layers.
actor_fc_layer_params = (256,)
critic_joint_fc_layer_params = (256,)

# The training loss is printed whenever the environment step count is a
# multiple of this value.
log_interval = 200 # @param {type:"integer"}

# Evaluation averages `num_eval_episodes` episodes and runs whenever the
# environment step count is a multiple of `eval_interval`.
num_eval_episodes = 10 # @param {type:"integer"}
eval_interval = 2000 # @param {type:"integer"}
# NOTE(review): not referenced by any cell visible in this notebook.
max_episode_steps = 1000

# When True, saved actor/critic weights are restored once the agent exists.
load_weights = False

In [3]:
# Two independent Gym instances -- one for collecting experience, one for
# evaluation -- each wrapped in a TFPyEnvironment.
train_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

eval_py_env = suite_gym.load(env_name)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# The specs used to build the networks are read from the training environment.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

# Label and value on separate lines, exactly as two consecutive prints would.
print('Observation Spec:', observation_spec, sep='\n')
print('Action Spec:', action_spec, sep='\n')

Observation Spec:
BoundedTensorSpec(shape=(3,), dtype=tf.float32, name='observation', minimum=array([-1., -1., -8.], dtype=float32), maximum=array([1., 1., 8.], dtype=float32))
Action Spec:
BoundedTensorSpec(shape=(1,), dtype=tf.float32, name='action', minimum=array(-2., dtype=float32), maximum=array(2., dtype=float32))


In [4]:
# Critic
# Built from the (observation, action) spec pair. The per-input layer settings
# are left as None, so only the joint fully connected layers are configured.
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec),
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params)

In [7]:
from tf_agents.networks import categorical_projection_network
from tf_agents.networks import encoding_network
from tf_agents.networks import network
from tf_agents.specs import tensor_spec
from tf_agents.utils import nest_utils

from tf_agents.distributions import utils as distribution_utils
from tf_agents.networks import bias_layer
from tf_agents.networks import utils as network_utils
from tf_agents.specs import distribution_spec
from tf_agents.specs import tensor_spec


def tanh_squash_to_spec(inputs, spec):
    """Squash unbounded `inputs` into `[spec.minimum, spec.maximum]` via `tanh`.

    Args:
        inputs: values to squash; passed to `tf.tanh`.
        spec: object exposing `minimum` and `maximum` bounds.

    Returns:
        `center + half_range * tanh(inputs)`, where `center` is the midpoint of
        the bounds and `half_range` is half of their difference.
    """
    upper = spec.maximum
    lower = spec.minimum
    center = (upper + lower) / 2.0
    half_range = (upper - lower) / 2.0
    squashed = tf.tanh(inputs)
    return center + half_range * squashed


class NormalProjectionNetwork(network.DistributionNetwork):
    """Projects its inputs onto the parameters of a `tfp.distributions.Normal`.

    The mean always comes from a dense layer applied to the inputs. The
    standard deviation comes either from a second dense layer (state dependent)
    or from a bias layer applied to zeros, depending on `state_dependent_std`.
    """
    def __init__(self,
        sample_spec,
        activation_fn=None,
        init_means_output_factor=0.1,
        std_bias_initializer_value=0.0,
        mean_transform=tanh_squash_to_spec,
        std_transform=tf.nn.softplus,
        state_dependent_std=False,
        scale_distribution=False,
        name='NormalProjectionNetwork'):
        """Creates the projection layers and the output distribution spec.

        Args:
            sample_spec: a single spec describing the samples; a nest with more
                than one element is rejected.
            activation_fn: activation used by the dense projection layer(s).
            init_means_output_factor: `scale` of the VarianceScaling kernel
                initializer used by the dense projection layer(s).
            std_bias_initializer_value: constant used to initialise the bias of
                whichever layer produces the standard deviation.
            mean_transform: optional callable `(means, sample_spec)`; `call`
                applies it only when `scale_distribution` is False.
            std_transform: optional callable applied to the raw stds in `call`.
            state_dependent_std: if True the stds come from a dense layer over
                the inputs, otherwise from a bias layer applied to zeros.
            scale_distribution: if True the built Normal is passed through
                `distribution_utils.scale_distribution_to_spec`.
            name: network name, also used as the prefix of parameter spec names.

        Raises:
            ValueError: if `sample_spec` flattens to anything but one spec.
        """
        # Debug trace printed every time an instance is constructed.
        print("Using Normal")
        if len(tf.nest.flatten(sample_spec)) != 1:
            raise ValueError('Normal Projection network only supports single spec '
                       'samples.')
        # Read later by the distribution builder closure that
        # _output_distribution_spec creates.
        self._scale_distribution = scale_distribution
        output_spec = self._output_distribution_spec(sample_spec, name)
        super(NormalProjectionNetwork, self).__init__(
            # We don't need these, but base class requires them.
            input_tensor_spec=None,
            state_spec=(),
            output_spec=output_spec,
            name=name)

        self._sample_spec = sample_spec
        self._mean_transform = mean_transform
        self._std_transform = std_transform
        self._state_dependent_std = state_dependent_std

        # One output unit per element of the sample spec.
        self._means_projection_layer = tf.keras.layers.Dense(
            sample_spec.shape.num_elements(),
            activation=activation_fn,
            kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                scale=init_means_output_factor),
            bias_initializer=tf.keras.initializers.Zeros(),
            name='means_projection_layer')

        # Exactly one of _stddev_projection_layer / _bias ends up in use.
        self._stddev_projection_layer = None
        if self._state_dependent_std:
            self._stddev_projection_layer = tf.keras.layers.Dense(
                sample_spec.shape.num_elements(),
                activation=activation_fn,
                kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                  scale=init_means_output_factor),
                bias_initializer=tf.keras.initializers.Constant(
                  value=std_bias_initializer_value),
                name='stddev_projection_layer')
        else:
            self._bias = bias_layer.BiasLayer(
                bias_initializer=tf.keras.initializers.Constant(
                    value=std_bias_initializer_value))

    def _output_distribution_spec(self, sample_spec, network_name):
        """Builds the `DistributionSpec` describing this network's output.

        The parameter specs are derived from
        `tfp.distributions.Normal.param_static_shapes(sample_spec.shape)`, use
        `sample_spec.dtype`, and are named `<network_name>_<parameter name>`.
        The builder constructs a Normal and, when `scale_distribution` was
        requested, passes it through `scale_distribution_to_spec`.
        """
        input_param_shapes = tfp.distributions.Normal.param_static_shapes(
            sample_spec.shape)

        input_param_spec = {
            name: tensor_spec.TensorSpec(  # pylint: disable=g-complex-comprehension
                shape=shape,
                dtype=sample_spec.dtype,
                name=network_name + '_' + name)
            for name, shape in input_param_shapes.items()
        }

        def distribution_builder(*args, **kwargs):
            distribution = tfp.distributions.Normal(*args, **kwargs)
            if self._scale_distribution:
                return distribution_utils.scale_distribution_to_spec(
                    distribution, sample_spec)
            return distribution

        return distribution_spec.DistributionSpec(
            distribution_builder, input_param_spec, sample_spec=sample_spec)

    def call(self, inputs, outer_rank, training=False):
        """Computes means and stds from `inputs` and builds the distribution.

        Args:
            inputs: tensor whose dtype must equal `sample_spec.dtype`.
            outer_rank: handed to `network_utils.BatchSquash` to flatten the
                leading dimensions before projecting and restore them after.
            training: forwarded to the projection layers.

        Returns:
            The result of `self.output_spec.build_distribution(loc=..., scale=...)`.

        Raises:
            ValueError: if `inputs.dtype` differs from `sample_spec.dtype`.
        """
        if inputs.dtype != self._sample_spec.dtype:
            raise ValueError(
            'Inputs to NormalProjectionNetwork must match the sample_spec.dtype.')
        # outer_rank is needed because the projection is not done on the raw
        # observations so getting the outer rank is hard as there is no spec to
        # compare to.
        batch_squash = network_utils.BatchSquash(outer_rank)
        inputs = batch_squash.flatten(inputs)

        means = self._means_projection_layer(inputs, training=training)
        means = tf.reshape(means, [-1] + self._sample_spec.shape.as_list())

        # If scaling the distribution later, use a normalized mean.
        if not self._scale_distribution and self._mean_transform is not None:
            means = self._mean_transform(means, self._sample_spec)
        means = tf.cast(means, self._sample_spec.dtype)

        if self._state_dependent_std:
            # NOTE(review): unlike the branch below, these stds are not reshaped
            # to the sample_spec shape -- confirm this is intended.
            stds = self._stddev_projection_layer(inputs, training=training)
        else:
            # The bias layer is fed zeros, so the stds do not depend on `inputs`.
            stds = self._bias(tf.zeros_like(means), training=training)
            stds = tf.reshape(stds, [-1] + self._sample_spec.shape.as_list())

        if self._std_transform is not None:
            stds = self._std_transform(stds)
        stds = tf.cast(stds, self._sample_spec.dtype)

        means = batch_squash.unflatten(means)
        stds = batch_squash.unflatten(stds)

        return self.output_spec.build_distribution(loc=means, scale=stds)


def _categorical_projection_net(action_spec, logits_init_output_factor=0.1):
    """Default factory for the projection network of a discrete action spec.

    Returns a `CategoricalProjectionNetwork` for `action_spec`, forwarding
    `logits_init_output_factor` unchanged.
    """
    projection_cls = categorical_projection_network.CategoricalProjectionNetwork
    return projection_cls(
        action_spec,
        logits_init_output_factor=logits_init_output_factor)


def _normal_projection_net(action_spec,init_means_output_factor=0.1):
    """Default factory for the projection network of a continuous action spec.

    Returns a `NormalProjectionNetwork` with no mean transform, a
    state-dependent standard deviation passed through
    `sac_agent.std_clip_transform`, and distribution scaling switched on.
    """
    projection_settings = dict(
        mean_transform=None,
        state_dependent_std=True,
        init_means_output_factor=init_means_output_factor,
        std_transform=sac_agent.std_clip_transform,
        scale_distribution=True,
    )
    return NormalProjectionNetwork(action_spec, **projection_settings)


class ActorDistributionNetwork(network.DistributionNetwork):
    """Actor network: an encoder followed by one projection network per action spec.

    Observations go through an `EncodingNetwork`; the encoding is then handed
    to every projection network, each of which produces the output for its
    entry in `output_tensor_spec`.
    """
    def __init__(self,
        input_tensor_spec,
        output_tensor_spec,
        preprocessing_layers=None,
        preprocessing_combiner=None,
        conv_layer_params=None,
        fc_layer_params=(200, 100),
        dropout_layer_params=None,
        activation_fn=tf.keras.activations.relu,
        batch_squash=True,
        dtype=tf.float32,
        discrete_projection_net=_categorical_projection_net,
        continuous_projection_net=_normal_projection_net,
        name='ActorDistributionNetwork'):
        """Builds the encoder and the projection networks.

        Args:
            input_tensor_spec: spec of the observations fed to the encoder.
            output_tensor_spec: nest of action specs; one projection network is
                created per entry.
            preprocessing_layers, preprocessing_combiner, conv_layer_params,
                fc_layer_params, dropout_layer_params, activation_fn,
                batch_squash, dtype: forwarded unchanged to `EncodingNetwork`.
            discrete_projection_net: factory called with a spec for which
                `tensor_spec.is_discrete` is true.
            continuous_projection_net: factory called with every other spec.
            name: network name.
        """
        # The encoder's kernel initializer is fixed to Glorot uniform here.
        kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform()

        encoder = encoding_network.EncodingNetwork(
            input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            dropout_layer_params=dropout_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=kernel_initializer,
            batch_squash=batch_squash,
            dtype=dtype)

        def map_proj(spec):
            # Pick the projection factory according to the kind of action spec.
            if tensor_spec.is_discrete(spec):
                return discrete_projection_net(spec)
            else:
                return continuous_projection_net(spec)

        projection_networks = tf.nest.map_structure(map_proj, output_tensor_spec)
        # This network's output spec mirrors the projection networks' specs.
        output_spec = tf.nest.map_structure(lambda proj_net: proj_net.output_spec,
                                            projection_networks)

        super(ActorDistributionNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            output_spec=output_spec,
            name=name)

        self._encoder = encoder
        self._projection_networks = projection_networks
        self._output_tensor_spec = output_tensor_spec

    @property
    def output_tensor_spec(self):
        """The `output_tensor_spec` this network was constructed with."""
        return self._output_tensor_spec

    def call(self, observations, step_type, network_state, training=False):
        """Encodes `observations` and projects the encoding for every action spec.

        Args:
            observations: input matching `input_tensor_spec`.
            step_type: forwarded to the encoder.
            network_state: forwarded to the encoder.
            training: forwarded to the encoder.

        Returns:
            `(output_actions, network_state)`: the nest of projection network
            outputs and the network state returned by the encoder.
        """
        state, network_state = self._encoder(
            observations,
            step_type=step_type,
            network_state=network_state,
            training=training)
        # The outer rank is taken from the raw observations and passed on to
        # each projection network together with the encoding.
        outer_rank = nest_utils.get_outer_rank(observations, self.input_tensor_spec)
        output_actions = tf.nest.map_structure(
            lambda proj_net: proj_net(state, outer_rank), self._projection_networks)
        return output_actions, network_state

In [8]:
# Actor
# Uses the notebook's own ActorDistributionNetwork; only the fully connected
# layer sizes are overridden, every other argument keeps its default.
actor_net = ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params)

Using Normal


In [9]:
# Smoke test of the untrained actor: wrap it in a policy, ask for the action
# distribution at the first time step and draw one sample from it.
train_policy = actor_policy.ActorPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    actor_network=actor_net,
    training=True)

time_step = train_env.reset()
# distribution() returns a policy step whose `action` field is a distribution
# object, which is why `.sample()` is called on it below.
policy_step = train_policy.distribution(time_step)
print(policy_step)
action = policy_step.action
print(action)
sample = action.sample()
print(sample)

Instructions for updating:
`AffineScalar` bijector is deprecated; please use `tfb.Shift(loc)(tfb.Scale(...))` instead.
PolicyStep(action=<tfp.distributions.SquashToSpecNormal 'ActorDistributionNetwork_NormalProjectionNetwork_SquashToSpecNormal' batch_shape=? event_shape=? dtype=float32>, state=(), info=())
tfp.distributions.SquashToSpecNormal("ActorDistributionNetwork_NormalProjectionNetwork_SquashToSpecNormal", dtype=float32)
tf.Tensor([[1.2909049]], shape=(1, 1), dtype=float32)


In [10]:
# Agent
# The global step doubles as the agent's train step counter.
global_step = tf.compat.v1.train.get_or_create_global_step()
# Actor, critic and alpha each get their own Adam optimiser; every other
# setting is taken from the hyperparameter cell.
agent = sac_agent.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
agent.initialize()

In [11]:
# Load Weights
def load_model_weights():
    """Restore the actor and both critic networks from saved weights.

    Reads the module-level `agent` and `env_name`. The checkpoint prefixes
    (`./<env_name>/actor/saved_actor`, `./<env_name>/critic1/saved_critic`,
    `./<env_name>/critic2/saved_critic`) are the ones the training loop passes
    to `save_weights`.
    """
    agent._actor_network.load_weights("./{}/actor/saved_actor".format(env_name))
    agent._critic_network_1.load_weights("./{}/critic1/saved_critic".format(env_name))
    agent._critic_network_2.load_weights("./{}/critic2/saved_critic".format(env_name))

# Report success only after the weights have actually been restored, so a
# failed load is not preceded by a misleading "Loaded Weights" line.
if load_weights:
    load_model_weights()
    print("Loaded Weights")

In [12]:
# Create the replay buffer for training
# The data spec comes from the agent and the batch size from the training
# environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Collect some random samples to start.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# One-off driver: runs immediately and is not kept around afterwards.
dynamic_step_driver.DynamicStepDriver(
    train_env, 
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps).run()

# Create collection driver
# Besides filling the replay buffer, this driver updates the episode and
# environment-step metrics that the training loop reads for logging.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env, 
    agent.collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration)

# Create a data set for the training loop
# NOTE(review): num_steps=2 presumably makes every sampled item span two
# adjacent steps so the agent sees a transition -- confirm against the
# replay buffer documentation.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

# The training loop pulls one batch per iteration from this iterator.
iterator = iter(dataset)

In [13]:
def measure_distribution(timestep, policy, num_samples = 1000):
    """Estimate the mean and standard deviation of a policy's sampled action.

    Args:
        timestep: time step passed unchanged to `policy.action` on every draw.
        policy: object whose `action(timestep).action.numpy()` yields an array
            holding exactly one element.
        num_samples: number of actions to draw.

    Returns:
        Tuple `(mean, std)` of the drawn values, computed with NumPy.

    Raises:
        ValueError: if a sampled action holds more than one element.
    """
    samples = np.zeros(num_samples)
    for i in range(num_samples):
        sampled_action = policy.action(timestep).action.numpy()
        # Pull out the single element explicitly. Storing the (1, 1) array
        # itself relies on NumPy's implicit size-1-array-to-scalar conversion,
        # which is deprecated since NumPy 1.25.
        samples[i] = np.asarray(sampled_action).item()
    return np.mean(samples), np.std(samples)

def compute_avg_return(environment, policy, num_episodes=10):
    """Average the undiscounted episode return of `policy` over several episodes.

    Args:
        environment: object with `reset()` and `step(action)`, both returning a
            time step that exposes `is_last()` and `reward`.
        policy: object whose `action(time_step)` result has an `action` field.
        num_episodes: number of episodes to run.

    Returns:
        First element of `.numpy()` called on the averaged return.
    """
    return_sum = 0.0
    for _ in range(num_episodes):
        current_step = environment.reset()
        episode_sum = 0.0

        while True:
            if current_step.is_last():
                break
            chosen = policy.action(current_step)
            current_step = environment.step(chosen.action)
            episode_sum += current_step.reward

        return_sum += episode_sum

    mean_return = return_sum / num_episodes
    return mean_return.numpy()[0]

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Start the train step counter from zero.
agent.train_step_counter.assign(0)

# Baseline evaluation before any training; it seeds the history of returns
# that later evaluations are compared against.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

for _ in range(num_iterations):

    # Gather fresh experience with the collect policy; the driver's observers
    # push it into the replay buffer.
    collect_op.run()

    # One training update on a batch sampled from the replay buffer.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Logging and evaluation are keyed on environment steps and episode count.
    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    # Everything below belongs to the periodic evaluation.
    if step % eval_interval != 0:
        continue

    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
    if max(returns) < avg_return:
        # New best evaluation so far: save the actor and both critics.
        print("Save Weights: avg_return={}; max_return={}".format(avg_return, np.amax(returns)))
        agent._actor_network.save_weights("./{}/actor/saved_actor".format(env_name))
        agent._critic_network_1.save_weights("./{}/critic1/saved_critic".format(env_name))
        agent._critic_network_2.save_weights("./{}/critic2/saved_critic".format(env_name))
    returns.append(avg_return)


avg_return=-1307.593505859375; max_return=-1307.593505859375
step = 200: loss = 67.79924774169922
step = 400: loss = 115.33133697509766
step = 600: loss = 177.2347412109375
step = 800: loss = 225.99684143066406
step = 1000: loss = 270.737548828125
step = 1200: loss = 293.6075744628906
step = 1400: loss = 352.45703125
step = 1600: loss = 378.1627197265625
step = 1800: loss = 370.14508056640625
step = 2000: loss = 399.9150695800781
step = 2000: episodes=10: Average Return = -1172.70654296875
Save Weights: avg_return=-1172.70654296875; max_return=-1307.593505859375
step = 2200: loss = 395.9053649902344
step = 2400: loss = 423.9303283691406
step = 2600: loss = 419.1785583496094
step = 2800: loss = 412.9429931640625
step = 3000: loss = 431.1529235839844
step = 3200: loss = 424.0841064453125
step = 3400: loss = 435.0840148925781
step = 3600: loss = 443.4576416015625
step = 3800: loss = 434.5008239746094
step = 4000: loss = 492.8074645996094
step = 4000: episodes=20: Average Return = -212.836

KeyboardInterrupt: 

In [None]:
# Play ten episodes with the agent's policy on the evaluation environment,
# rendering after every step and printing each episode's summed reward.
for i in range(10):
    time_step = eval_env.reset()
    rewards = 0.0
    while True:
        if time_step.is_last():
            break
        action_step = agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        rewards += time_step.reward
        eval_py_env.render()
    print(rewards)

tf.Tensor([-128.00604], shape=(1,), dtype=float32)
tf.Tensor([-1.6130695], shape=(1,), dtype=float32)
