In [1]:
import collections
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.specs import tensor_spec
from tf_agents.specs import distribution_spec
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step
from tf_agents.drivers import dynamic_step_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.networks import network
from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy
from tf_agents.agents import tf_agent
from tf_agents.metrics import tf_metrics
from tf_agents.utils import nest_utils
from tf_agents.utils import common

import matplotlib.pyplot as plt

tf.compat.v1.enable_v2_behavior()

In [2]:
#env_name = "Pendulum-v0"
env_name = "LunarLanderContinuous-v2" 

num_iterations = 100000 

initial_collect_steps = 10000  
collect_steps_per_iteration = 1 
replay_buffer_max_length = num_iterations 

batch_size = 256 

value_learning_rate = 3e-4 
softq_learning_rate = 3e-4 
policy_learning_rate = 3e-4 
target_update_tau = 0.005 
target_update_period = 1 
gamma = 0.99 

value_fc_layer_params = (256, 256)
softq_fc_layer_params = (256, 256)
policy_fc_layer_params = (256, 256)

log_interval = 200#5000 

num_eval_episodes = 10 
eval_interval = 5000#10000 
max_episode_steps = 1000

In [3]:
# Environments
train_py_env = suite_gym.load(env_name)#, max_episode_steps=max_episode_steps)
eval_py_env = suite_gym.load(env_name)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

time_step_spec = train_env.time_step_spec()
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

In [4]:
# Helper Functions and Networks
class ValueNetwork(network.Network):
    def __init__(self,
                 observation_spec,
                 fc_layer_params,
                 init_w = 3e-3,
                 name='ValueNetwork'):
        
        super(ValueNetwork, self).__init__(
            input_tensor_spec=observation_spec,
            state_spec=(),
            name=name)
          
        self._encoding_layers = []
        for num_units in fc_layer_params:
            self._encoding_layers.append(tf.keras.layers.Dense(
                num_units,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
                name='%s/dense' % name))
        
        self._value = tf.keras.layers.Dense(
            1,
            activation=None,
            kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
            name='value')


    def call(self, inputs, step_type=(), network_state=(), training=False):
        encoding = inputs
        
        for layer in self._encoding_layers:
            encoding = layer(encoding, training=training)

        value = self._value(encoding, training=training)
        return tf.reshape(value, [-1]), network_state

class SoftQNetwork(network.Network):
    def __init__(self,
                 observation_and_action_spec,
                 fc_layer_params,
                 init_w = 3e-3,
                 name='SoftQNetwork'):
        
        super(SoftQNetwork, self).__init__(
            input_tensor_spec=observation_and_action_spec,
            state_spec=(),
            name=name)
          
        self._encoding_layers = []
        for num_units in fc_layer_params:
            self._encoding_layers.append(tf.keras.layers.Dense(
                num_units,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
                name='%s/dense' % name))
        
        self._value = tf.keras.layers.Dense(
            1,
            activation=None,
            kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
            name='value')


    def call(self, inputs, step_type=(), network_state=(), training=False):
        observations, actions = inputs
        encoding = tf.concat([observations, actions], 1)
        
        for layer in self._encoding_layers:
            encoding = layer(encoding, training=training)

        value = self._value(encoding, training=training)
        return tf.reshape(value, [-1]), network_state

def spec_means_and_magnitudes(action_spec):
    action_means = tf.nest.map_structure(
        lambda spec: (spec.maximum + spec.minimum) / 2.0, action_spec)
    action_magnitudes = tf.nest.map_structure(
        lambda spec: (spec.maximum - spec.minimum) / 2.0, action_spec)
    return tf.cast(action_means, dtype=tf.float32), tf.cast(action_magnitudes, dtype=tf.float32) 
    
class PolicyNetwork(network.DistributionNetwork):
    def __init__(self,
                 observation_spec,
                 action_spec,
                 fc_layer_params,
                 init_w=3e-3, 
                 log_std_min=-20, 
                 log_std_max=2,
                 name="ActorNormalDistributionNetwork"):
        
        action_dist_spec = self._build_distribution_spec(action_spec, name) 
        
        super(PolicyNetwork, self).__init__(
            input_tensor_spec=observation_spec,
            state_spec=(),
            output_spec=action_dist_spec,
            name=name)
        
        self._log_std_min = log_std_min
        self._log_std_max = log_std_max

        num_actions = action_spec.shape.num_elements()
          
        self._encoding_layers = []
        for num_units in fc_layer_params:
            self._encoding_layers.append(tf.keras.layers.Dense(
                num_units,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
                name='%s/dense' % name))
        
        self._means_linear = tf.keras.layers.Dense(
            num_actions,
            activation=None,
            kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
            name='means_linear')

        # standard dev layer for distribution
        self._log_std_linear = tf.keras.layers.Dense(
            num_actions,
            activation=None,
            kernel_initializer=tf.keras.initializers.RandomUniform(minval=-init_w, maxval=init_w),
            name='log_std_linear')
        
        action_means, action_magnitudes = spec_means_and_magnitudes(action_spec)
        bijectors = [tfp.bijectors.Shift(action_means),
                     tfp.bijectors.Scale(action_magnitudes),
                     tfp.bijectors.Tanh()]

        self._bijector_chain = tfp.bijectors.Chain(bijectors)
        
        
    def _build_distribution_spec(self, sample_spec, network_name):
        input_param_shapes = tfp.distributions.Normal.param_static_shapes(sample_spec.shape)

        input_param_spec = {
            name: tensor_spec.TensorSpec(  
                shape=shape,
                dtype=sample_spec.dtype,
                name=network_name + '_' + name)
            for name, shape in input_param_shapes.items()
        }

        return distribution_spec.DistributionSpec(None, input_param_spec, sample_spec=sample_spec)

    
    def call(self, observations, step_type, network_state, training=False):  
        # Feed through fc layers.
        encoding = observations
        
        for layer in self._encoding_layers:
            encoding = layer(encoding, training=training)
        
        # Compute means.
        means = self._means_linear(encoding, training=training)

        # Compute stds:  (take log of std, clip, and exponentiate to get std.)
        log_stds = self._log_std_linear(encoding, training=training)
        log_stds = tf.clip_by_value(log_stds, self._log_std_min, self._log_std_max)
        stds = tf.exp(log_stds)
        
        # Build a distribution using the means and stds.
        distribution = tfp.distributions.Normal(loc=means, scale=stds)
        
        # Take the TanH and shift and scale to fit action spec.
        distribution = tfp.distributions.TransformedDistribution(distribution=distribution, bijector=self._bijector_chain)
        
        return distribution, network_state
    
class ActorPolicy(tf_policy.Base):
    def __init__(self,
        time_step_spec,
        action_spec,
        policy_network,
        training=False):

        policy_network.create_variables()
        self._policy_network = policy_network
        self._training = training

        super(ActorPolicy, self).__init__(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            policy_state_spec=policy_network.state_spec)

    def _variables(self):
        return self._policy_network.variables

    def _distribution(self, time_step, policy_state):
        distributions, policy_state = self._policy_network(time_step.observation,
                                                           time_step.step_type,
                                                           policy_state,
                                                           training=self._training)

        return policy_step.PolicyStep(distributions, policy_state)
    
def actions_and_logprobs(policy, time_steps, epsilon=1e-6):
    action_distribution = policy.distribution(time_steps).action
    actions = action_distribution.sample()
    log_probs = action_distribution.log_prob(actions) - tf.math.log(1-tf.math.pow(actions, 2) + epsilon)
    log_probs = tf.reduce_sum(input_tensor=log_probs, axis=1)
    return actions, log_probs, action_distribution.distribution.loc, action_distribution.distribution.scale

def experience_to_transitions(experience):
    transitions = trajectory.to_transition(experience)
    time_steps, policy_steps, next_time_steps = transitions
    actions = policy_steps.action
    time_steps, actions, next_time_steps = tf.nest.map_structure(
        lambda t: tf.squeeze(t, axis=1),
        (time_steps, actions, next_time_steps))
    return time_steps, actions, next_time_steps

def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

In [5]:
# Create networks
value_network = ValueNetwork(observation_spec, value_fc_layer_params)
value_network.create_variables()

target_value_network = value_network.copy(name='TargetValueNetwork')
target_value_network.create_variables()

softq_network = SoftQNetwork((observation_spec, action_spec), softq_fc_layer_params)
softq_network.create_variables()

policy_network = PolicyNetwork(observation_spec, action_spec, policy_fc_layer_params)
policy_network.create_variables()

collect_policy = ActorPolicy(time_step_spec, action_spec, policy_network, training=False)
train_policy = ActorPolicy(time_step_spec, action_spec, policy_network, training=True)

# Full copy of network variables.
common.soft_variables_update(
    value_network.variables,
    target_value_network.variables,
    tau=1.0)

# Optimizers
value_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=value_learning_rate)
softq_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=softq_learning_rate)
policy_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=policy_learning_rate)

# Loss Objects
value_loss_fn  = tf.compat.v1.losses.mean_squared_error
softq_loss_fn = tf.compat.v1.losses.mean_squared_error

# Create the replay buffer for training
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_policy.trajectory_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Collect some random samples to start.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

dynamic_step_driver.DynamicStepDriver(
    train_env, 
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps).run()

# Create collection driver
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env, 
    collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration)

# Create a data set for the training loop
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

iterator = iter(dataset)

In [6]:
# Loss Functions
def compute_softq_loss(time_steps, 
                       actions, 
                       next_time_steps,
                       gamma=0.99):
    expected_q_values,_ = softq_network((time_steps.observation, actions))    
    expected_values,_ = value_network(time_steps.observation)
    target_values,_ = target_value_network(next_time_steps.observation)
    next_q_values = tf.stop_gradient(next_time_steps.reward + 
                                     tf.where(next_time_steps.is_last(), 0.0, gamma) * target_values)
    softq_loss = softq_loss_fn(expected_q_values, next_q_values)
    return softq_loss



def compute_value_loss(time_steps):    
    expected_values,_ = value_network(time_steps.observation)    
    new_actions, log_probs,_,_ = actions_and_logprobs(train_policy, time_steps)    
    expected_new_q_values,_ = softq_network((time_steps.observation, new_actions))    
    next_values = tf.stop_gradient(expected_new_q_values - log_probs)    
    value_loss = value_loss_fn(expected_values, next_values)
    
    return value_loss

def compute_policy_loss(time_steps,
                        mean_lambda=1e-3,
                        std_lambda=1e-3):
    
    new_actions, log_probs, means, stds = actions_and_logprobs(train_policy, time_steps)
    expected_new_q_values,_ = softq_network((time_steps.observation, new_actions))
    expected_values,_ = value_network(time_steps.observation)
    
    log_prob_targets = expected_new_q_values - expected_values
    policy_loss = tf.reduce_mean(log_probs * (log_probs - log_prob_targets))
    
    mean_loss = tf.reduce_mean(mean_lambda * tf.math.pow(means, 2))
    std_loss = tf.reduce_mean(std_lambda * tf.math.pow(tf.math.log(stds), 2))
    
    policy_loss = policy_loss + mean_loss + std_loss
    
    return policy_loss
    
def train(experience):
    time_steps, actions, next_time_steps = experience_to_transitions(experience)    
    
    # Soft Q Network Update
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(softq_network.trainable_variables)
        softq_loss = compute_softq_loss(time_steps, actions, next_time_steps)

    tf.debugging.check_numerics(softq_loss, 'softq_loss is inf or nan.')
    softq_grads = tape.gradient(softq_loss, softq_network.trainable_variables)
    softq_optimizer.apply_gradients(list(zip(softq_grads, softq_network.trainable_variables)))
    
    # Value Network Update
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(value_network.trainable_variables)
        value_loss = compute_value_loss(time_steps)

    tf.debugging.check_numerics(value_loss, 'value_loss is inf or nan.')
    value_grads = tape.gradient(value_loss, value_network.trainable_variables)
    value_optimizer.apply_gradients(list(zip(value_grads, value_network.trainable_variables)))
    
    # Policy Network Update
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(policy_network.trainable_variables)
        policy_loss = compute_policy_loss(time_steps)

    tf.debugging.check_numerics(policy_loss, 'value_loss is inf or nan.')
    policy_grads = tape.gradient(policy_loss, policy_network.trainable_variables)
    policy_optimizer.apply_gradients(list(zip(policy_grads, policy_network.trainable_variables)))
    
    loss = softq_loss + value_loss + policy_loss
    
    common.soft_variables_update(
        value_network.variables,
        target_value_network.variables,
        tau=target_update_tau)

    return loss

In [7]:
# Training Loop
train = common.function(train)

avg_return = compute_avg_return(eval_env, collect_policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_op.run()
    
    # Sample a batch of data from the buffer and update the agent's network.
    experience, _ = next(iterator)
    train_loss = train(experience)

    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, collect_policy, num_eval_episodes)
        print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
        returns.append(avg_return)

avg_return=-255.2966766357422; max_return=-255.2966766357422
step = 200: loss = 73.11649322509766
step = 400: loss = 80.35567474365234
step = 600: loss = 177.26473999023438
step = 800: loss = 66.3226089477539
step = 1000: loss = 26.098495483398438
step = 1200: loss = 23.025917053222656
step = 1400: loss = 59.6632080078125
step = 1600: loss = 31.872257232666016
step = 1800: loss = 91.0958251953125
step = 2000: loss = 81.86097717285156
step = 2200: loss = 71.95896911621094
step = 2400: loss = 136.72573852539062
step = 2600: loss = 21.989234924316406
step = 2800: loss = 47.65901565551758
step = 3000: loss = 18.632272720336914
step = 3200: loss = 36.99702072143555
step = 3400: loss = 23.883214950561523
step = 3600: loss = 19.61241340637207
step = 3800: loss = 54.72982406616211
step = 4000: loss = 12.680683135986328
step = 4200: loss = 23.739927291870117
step = 4400: loss = 22.95187759399414
step = 4600: loss = 33.02753448486328
step = 4800: loss = 85.91046142578125
step = 5000: loss = 21.0

step = 39600: loss = 27.06711196899414
step = 39800: loss = 43.92499542236328
step = 40000: loss = 18.906190872192383
step = 40000: episodes=332: Average Return = -117.76213073730469
step = 40200: loss = 13.8683500289917
step = 40400: loss = 37.60437774658203
step = 40600: loss = 19.521469116210938
step = 40800: loss = 48.09539031982422
step = 41000: loss = 30.523204803466797
step = 41200: loss = 23.770299911499023
step = 41400: loss = 8.290160179138184
step = 41600: loss = 36.392330169677734
step = 41800: loss = 22.972782135009766
step = 42000: loss = 53.22207260131836
step = 42200: loss = 61.2640380859375
step = 42400: loss = 57.871978759765625
step = 42600: loss = 10.37488079071045
step = 42800: loss = 16.660268783569336
step = 43000: loss = 7.201270580291748
step = 43200: loss = 16.15927505493164
step = 43400: loss = 29.45286750793457
step = 43600: loss = 9.896378517150879
step = 43800: loss = 25.404382705688477
step = 44000: loss = 46.80332946777344
step = 44200: loss = 40.2837486

KeyboardInterrupt: 

In [8]:
for i in range(10):
    rewards = 0.0
    time_step = eval_env.reset()
    while not time_step.is_last():
        action_step = collect_policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        rewards += time_step.reward
        eval_py_env.render()
    print(rewards)

tf.Tensor([-55.089905], shape=(1,), dtype=float32)
tf.Tensor([-20.39988], shape=(1,), dtype=float32)
tf.Tensor([-65.65068], shape=(1,), dtype=float32)
tf.Tensor([-29.178307], shape=(1,), dtype=float32)
tf.Tensor([-61.93116], shape=(1,), dtype=float32)
tf.Tensor([-22.067902], shape=(1,), dtype=float32)
tf.Tensor([-74.69744], shape=(1,), dtype=float32)
tf.Tensor([-39.224876], shape=(1,), dtype=float32)
tf.Tensor([-26.235672], shape=(1,), dtype=float32)
tf.Tensor([-71.9828], shape=(1,), dtype=float32)
