In [1]:
import collections
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.specs import tensor_spec
from tf_agents.specs import distribution_spec
#from tf_agents.environments import suite_gym
from tf_agents.environments import gym_wrapper
from tf_agents.environments import tf_py_environment
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step
from tf_agents.drivers import dynamic_step_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.networks import network
from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy
from tf_agents.agents import tf_agent
from tf_agents.metrics import tf_metrics
from tf_agents.utils import nest_utils
from tf_agents.utils import common

import matplotlib.pyplot as plt

tf.compat.v1.enable_v2_behavior()

In [2]:
# Gym environment id used to build both the training and the evaluation environment.
env_name = "Pendulum-v0"
#env_name = "LunarLanderContinuous-v2" 

# Number of collect-then-train iterations of the main loop (longer alternative: 100000).
num_iterations = 40000#100000 

# Steps gathered with the random policy before training starts (alternative: 10000).
initial_collect_steps = 1000#10000  
# Environment steps taken by the collect driver on each training iteration.
collect_steps_per_iteration = 1 
# Replay buffer capacity; tied to the iteration count.
replay_buffer_max_length = num_iterations 

# Number of trajectories sampled from the replay buffer per training step.
batch_size = 256 

# Adam learning rates for the value, soft-Q and policy networks respectively.
value_learning_rate = 3e-4 
softq_learning_rate = 3e-4 
policy_learning_rate = 3e-4 
# Mixing factor passed to common.soft_variables_update for the target value network.
target_update_tau = 0.01 
# NOTE(review): no visible code reads target_update_period; train() refreshes the
# target network on every call. Confirm whether a period was meant to be honoured.
target_update_period = 1 
# Discount factor; compute_softq_loss declares the same value as its own default.
gamma = 0.99 

# Hidden-layer widths of the value, soft-Q and policy networks.
value_fc_layer_params = (256, 256)
softq_fc_layer_params = (256, 256)
policy_fc_layer_params = (256, 256)

# Print the training loss whenever the env-step counter is a multiple of this (alternative: 5000).
log_interval = 200#5000 

# Episodes averaged per evaluation, and the env-step interval between evaluations
# (alternative interval: 10000).
num_eval_episodes = 10 
eval_interval = 1000#10000 
#max_episode_steps = 1000

In [3]:
# Environments
import gym
class NormalizedActions(gym.ActionWrapper):
    """Action wrapper that lets the agent act in [-1, 1] on every action dimension.

    `_action` rescales a normalised action linearly onto the wrapped
    environment's native `[low, high]` bounds; `_reverse_action` is the inverse
    mapping.

    NOTE(review): the wrapper does not redefine `action_space`, so code that
    derives specs from the wrapped environment presumably still sees the native
    bounds rather than [-1, 1] -- confirm this is what callers expect.
    """

    def _action(self, action):
        """Map a normalised action in [-1, 1] onto the native action range."""
        low = self.action_space.low
        high = self.action_space.high

        action = low + (action + 1.0) * 0.5 * (high - low)
        # Guard against inputs slightly outside [-1, 1].
        return np.clip(action, low, high)

    def _reverse_action(self, action):
        """Map a native action in [low, high] back onto the normalised [-1, 1] range."""
        low = self.action_space.low
        high = self.action_space.high

        action = 2 * (action - low) / (high - low) - 1
        # The result lives in the normalised space, so clip to [-1, 1] rather than
        # to the native bounds. (The original also returned the undefined name
        # `actions`, which raised NameError.)
        return np.clip(action, -1.0, 1.0)

    # Newer gym releases dispatch to `action` / `reverse_action` (no leading
    # underscore). Providing both spellings keeps the wrapper working regardless
    # of which hook the installed gym version calls.
    def action(self, action):
        """Non-underscore spelling of `_action`."""
        return self._action(action)

    def reverse_action(self, action):
        """Non-underscore spelling of `_reverse_action`."""
        return self._reverse_action(action)


train_py_env = gym_wrapper.GymWrapper(NormalizedActions(gym.make(env_name)))
eval_py_env = gym_wrapper.GymWrapper(NormalizedActions(gym.make(env_name)))
    
#train_py_env = suite_gym.load(env_name)#, max_episode_steps=max_episode_steps)
#eval_py_env = suite_gym.load(env_name)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

time_step_spec = train_env.time_step_spec()
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

In [4]:
# Helper Functions and Networks
class ValueNetwork(network.Network):
    """State-value network: a stack of ReLU dense layers followed by one linear unit.

    The network is stateless (`state_spec=()`), and `call` returns the
    per-sample values flattened to a rank-1 tensor.
    """

    def __init__(self,
                 observation_spec,
                 fc_layer_params,
                 init_w = 3e-3,
                 name='ValueNetwork'):
        """Build the hidden stack and the scalar output layer.

        Args:
            observation_spec: spec of the observations fed to the network.
            fc_layer_params: iterable of hidden-layer widths, one Dense layer each.
            init_w: accepted for interface compatibility; not used by this class.
            name: network name; also used as the prefix of the hidden-layer names.
        """
        super(ValueNetwork, self).__init__(
            input_tensor_spec=observation_spec,
            state_spec=(),
            name=name)

        # Every hidden layer is requested with the same '<name>/dense' name.
        hidden_layer_name = '%s/dense' % name
        self._encoding_layers = [
            tf.keras.layers.Dense(
                width,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
                name=hidden_layer_name)
            for width in fc_layer_params
        ]

        self._value = tf.keras.layers.Dense(
            1,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
            name='value')

    def call(self, observations, step_type, network_state, training=False):
        """Return `(values, network_state)`; `step_type` is not used."""
        hidden = observations
        for dense in self._encoding_layers:
            hidden = dense(hidden, training=training)

        # Flatten the single output unit per sample into a rank-1 tensor.
        flat_value = tf.reshape(self._value(hidden, training=training), [-1])
        return flat_value, network_state

    
class SoftQNetwork(network.Network):
    """Soft Q-network: concatenates observation and action, then applies an MLP.

    The input is an `(observations, actions)` pair; the output is one value per
    sample, flattened to a rank-1 tensor. The network is stateless.
    """

    def __init__(self,
                 observation_and_action_spec,
                 fc_layer_params,
                 init_w = 3e-3,
                 name='SoftQNetwork'):
        """Build the hidden stack and the scalar output layer.

        Args:
            observation_and_action_spec: `(observation_spec, action_spec)` input spec.
            fc_layer_params: iterable of hidden-layer widths, one Dense layer each.
            init_w: accepted for interface compatibility; not used by this class.
            name: network name; also used as the prefix of the hidden-layer names.
        """
        super(SoftQNetwork, self).__init__(
            input_tensor_spec=observation_and_action_spec,
            state_spec=(),
            name=name)

        # Every hidden layer is requested with the same '<name>/dense' name.
        hidden_layer_name = '%s/dense' % name
        self._encoding_layers = [
            tf.keras.layers.Dense(
                width,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
                name=hidden_layer_name)
            for width in fc_layer_params
        ]

        self._value = tf.keras.layers.Dense(
            1,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
            name='value')

    def call(self, observations_and_actions, step_type, network_state, training=False):
        """Return `(q_values, network_state)`; `step_type` is not used."""
        observations, actions = observations_and_actions
        # Join observation and action features along axis 1.
        hidden = tf.concat([observations, actions], axis=1)

        for dense in self._encoding_layers:
            hidden = dense(hidden, training=training)

        flat_q = tf.reshape(self._value(hidden, training=training), [-1])
        return flat_q, network_state

def spec_means_and_magnitudes(action_spec):
    """Return the midpoint and half-width of the bounds in `action_spec` as float32.

    Both quantities are computed per spec with `tf.nest.map_structure` from the
    spec's `minimum` and `maximum`, then cast to `tf.float32`.
    """
    def _midpoint(spec):
        return (spec.maximum + spec.minimum) / 2.0

    def _half_width(spec):
        return (spec.maximum - spec.minimum) / 2.0

    centers = tf.nest.map_structure(_midpoint, action_spec)
    half_widths = tf.nest.map_structure(_half_width, action_spec)
    return tf.cast(centers, dtype=tf.float32), tf.cast(half_widths, dtype=tf.float32)
    
class PolicyNetwork(network.Network):
    """Gaussian policy head: maps observations to per-action means and log-stds.

    `call` returns `((means, log_stds), network_state)`. The log-stds are
    clipped to `[log_std_min, log_std_max]` and are NOT exponentiated here;
    callers apply `tf.exp` themselves.
    """

    def __init__(self,
                 observation_spec,
                 action_spec,
                 fc_layer_params,
                 init_w=3e-3, 
                 log_std_min=-20, 
                 log_std_max=2,
                 name="ActorNormalDistributionNetwork"):
        """Build the shared hidden stack and the two linear output heads.

        Args:
            observation_spec: spec of the observations fed to the network.
            action_spec: action spec; its element count sets the head width.
            fc_layer_params: iterable of hidden-layer widths, one Dense layer each.
            init_w: accepted for interface compatibility; not used by this class.
            log_std_min: lower clipping bound applied to the log-std output.
            log_std_max: upper clipping bound applied to the log-std output.
            name: network name; also used as the prefix of the hidden-layer names.
        """
        super(PolicyNetwork, self).__init__(
            input_tensor_spec=observation_spec,
            state_spec=(),
            name=name)

        self._log_std_min = log_std_min
        self._log_std_max = log_std_max

        action_dims = action_spec.shape.num_elements()

        # Every hidden layer is requested with the same '<name>/dense' name.
        hidden_layer_name = '%s/dense' % name
        self._encoding_layers = [
            tf.keras.layers.Dense(
                width,
                activation=tf.keras.activations.relu,
                kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(),
                name=hidden_layer_name)
            for width in fc_layer_params
        ]

        # Both heads use the same small variance-scaling factor for their kernels.
        head_kernel_scale = 0.1
        log_std_bias_init = 0.0

        # Linear head producing the distribution means.
        self._means_linear = tf.keras.layers.Dense(
            action_dims,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                scale=head_kernel_scale),
            bias_initializer=tf.keras.initializers.Zeros(),
            name='means_linear')

        # Linear head producing the (unclipped) log standard deviations.
        self._log_std_linear = tf.keras.layers.Dense(
            action_dims,
            activation=None,
            kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
                scale=head_kernel_scale),
            bias_initializer=tf.keras.initializers.Constant(
                value=log_std_bias_init),
            name='log_std_linear')

    def call(self, observations, step_type, network_state, training=False):
        """Return `((means, clipped_log_stds), network_state)`; `step_type` is not used."""
        hidden = observations
        for dense in self._encoding_layers:
            hidden = dense(hidden, training=training)

        means = self._means_linear(hidden, training=training)

        # Clip the raw log-stds to the configured range; exponentiation is left to callers.
        raw_log_stds = self._log_std_linear(hidden, training=training)
        log_stds = tf.clip_by_value(raw_log_stds, self._log_std_min, self._log_std_max)

        return (means, log_stds), network_state
    
class ActorPolicy(tf_policy.Base):
    """Stochastic policy that samples tanh-squashed Gaussian actions.

    The wrapped `policy_network` must return `((mean, log_std), state)`; an
    action is `tanh(z)` with `z ~ Normal(mean, exp(log_std))`, so every action
    component lies in (-1, 1).
    """

    def __init__(self,
        time_step_spec,
        action_spec,
        policy_network,
        training=False):
        """Wrap `policy_network` as a TF-Agents policy.

        Args:
            time_step_spec: spec of the time steps the policy receives.
            action_spec: spec of the actions the policy emits.
            policy_network: network returning `((mean, log_std), state)`.
            training: value forwarded as `training=` when calling the network.
        """
        # Make sure the network variables exist before the base class is set up.
        policy_network.create_variables()
        self._policy_network = policy_network
        self._training = training

        super(ActorPolicy, self).__init__(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            policy_state_spec=policy_network.state_spec)

    def _variables(self):
        """Return the variables of the wrapped policy network."""
        return self._policy_network.variables

    def _action(self, time_step, policy_state, seed):
        """Sample one squashed action per batch entry.

        Args:
            time_step: current time step; its observation and step type feed the network.
            policy_state: network state passed through to the network.
            seed: sampling seed forwarded to the Gaussian sampler (may be None).

        Returns:
            A `PolicyStep` holding the sampled action and the new policy state.
        """
        (mean, log_std), policy_state = self._policy_network(
            time_step.observation,
            time_step.step_type,
            policy_state,
            training=self._training)

        normal = tfp.distributions.Normal(loc=mean, scale=tf.exp(log_std))
        # Forward the caller's seed; it was previously ignored, which made
        # seeded `policy.action(..., seed=...)` calls non-reproducible.
        z = normal.sample(seed=seed)
        action = tf.math.tanh(z)

        return policy_step.PolicyStep(action, policy_state)
    
def actions_and_logprobs(policy_network, time_steps, training, epsilon=1e-6):
    """Sample tanh-squashed actions and compute their log-probabilities.

    Args:
        policy_network: network returning `((mean, log_std), state)`.
        time_steps: batch of time steps; observation and step type feed the network.
        training: forwarded as `training=` to the network call.
        epsilon: small constant added inside the log of the tanh correction term.

    Returns:
        `(action, log_prob, z, mean, log_std)`, where `z` is the pre-squash
        Gaussian sample, `action = tanh(z)`, and `log_prob` is summed over axis 1.
    """
    (mean, log_std), _ = policy_network(
        time_steps.observation,
        time_steps.step_type,
        network_state=(),
        training=training)

    gaussian = tfp.distributions.Normal(loc=mean, scale=tf.exp(log_std))
    z = gaussian.sample()
    action = tf.math.tanh(z)

    # Change-of-variables correction for the tanh squashing.
    gaussian_log_prob = gaussian.log_prob(z)
    squash_correction = tf.math.log(1 - tf.math.pow(action, 2) + epsilon)
    per_dim_log_prob = gaussian_log_prob - squash_correction
    log_prob = tf.reduce_sum(input_tensor=per_dim_log_prob, axis=1)

    return action, log_prob, z, mean, log_std


def experience_to_transitions(experience):
    """Convert sampled trajectories into `(time_steps, actions, next_time_steps)`.

    The trajectories are turned into transitions with `trajectory.to_transition`
    and axis 1 is squeezed out of every tensor in the result.
    """
    current_steps, policy_steps, following_steps = trajectory.to_transition(experience)

    def _drop_axis_one(tensor):
        return tf.squeeze(tensor, axis=1)

    time_steps, actions, next_time_steps = tf.nest.map_structure(
        _drop_axis_one,
        (current_steps, policy_steps.action, following_steps))
    return time_steps, actions, next_time_steps

def compute_avg_return(environment, policy, num_episodes=10):
    """Run `policy` for `num_episodes` episodes and return the mean episode return.

    Each episode starts with `environment.reset()` and runs until the time step
    reports `is_last()`. The averaged return is converted with `.numpy()` and
    its first element is returned.
    """
    episode_returns = []
    for _ in range(num_episodes):
        step = environment.reset()
        running_return = 0.0

        while not step.is_last():
            chosen = policy.action(step)
            step = environment.step(chosen.action)
            running_return = running_return + step.reward

        episode_returns.append(running_return)

    # Accumulate from 0.0 in episode order, then average.
    mean_return = sum(episode_returns, 0.0) / num_episodes
    return mean_return.numpy()[0]

In [5]:
# Create networks
# Each network's variables are created eagerly so they can be copied and optimised below.
value_network = ValueNetwork(observation_spec, value_fc_layer_params)
value_network.create_variables()

# The target value network is a separately named copy of the value network.
target_value_network = value_network.copy(name='TargetValueNetwork')
target_value_network.create_variables()

softq_network = SoftQNetwork((observation_spec, action_spec), softq_fc_layer_params)
softq_network.create_variables()

policy_network = PolicyNetwork(observation_spec, action_spec, policy_fc_layer_params)
policy_network.create_variables()

# Policy used for data collection and, later in the notebook, for evaluation.
collect_policy = ActorPolicy(time_step_spec, action_spec, policy_network, training=False)

# Full copy of network variables.
# tau=1.0 makes the target network start out identical to the value network.
common.soft_variables_update(
    value_network.variables,
    target_value_network.variables,
    tau=1.0)

# Optimizers
# One Adam optimizer per network, each with its own learning rate.
value_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=value_learning_rate)
softq_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=softq_learning_rate)
policy_optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=policy_learning_rate)

# Loss Objects
# Both critics are trained with a mean-squared-error loss.
value_loss_fn  = tf.compat.v1.losses.mean_squared_error
softq_loss_fn = tf.compat.v1.losses.mean_squared_error

# Create the replay buffer for training
# The buffer stores trajectories matching collect_policy's trajectory spec.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_policy.trajectory_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Collect some random samples to start.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# This driver runs immediately: it seeds the buffer with initial_collect_steps steps.
dynamic_step_driver.DynamicStepDriver(
    train_env, 
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps).run()

# Create collection driver
# The metrics only observe this driver, so they do not count the random warm-up steps above.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

# Built here but only run from the training loop (collect_op.run()).
collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env, 
    collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration)

# Create a data set for the training loop
# num_steps=2 yields pairs of adjacent steps, which experience_to_transitions
# converts into (time_step, action, next_time_step) transitions.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

iterator = iter(dataset)



In [6]:
# Loss Functions
def compute_softq_loss(time_steps, 
                       actions, 
                       next_time_steps,
                       gamma=0.99):
    """Mean-squared error between Q(s, a) and the bootstrapped soft-Q target.

    The target is `reward + mask * gamma * V_target(s')`, where `mask` is 0 for
    next steps flagged `is_last()` and 1 otherwise. Gradients are stopped
    through the target.
    """
    # Current estimate Q(s, a) from the soft-Q network.
    predicted_q, _ = softq_network(
        (time_steps.observation, actions),
        time_steps.step_type,
        network_state=(),
        training=True)

    # Bootstrap from the target value network at the next state.
    bootstrap_value, _ = target_value_network(
        next_time_steps.observation,
        next_time_steps.step_type,
        network_state=(),
        training=False)

    continue_mask = tf.where(next_time_steps.is_last(), 0.0, 1.0)
    q_target = next_time_steps.reward + continue_mask * gamma * bootstrap_value
    q_target = tf.stop_gradient(q_target)

    return softq_loss_fn(predicted_q, q_target)

def compute_value_loss(time_steps):
    """Mean-squared error between V(s) and the target `Q(s, a) - log_prob(a)`.

    The action `a` is freshly sampled from the policy network, and gradients
    are stopped through the target.
    """
    # Current estimate V(s) from the value network.
    predicted_value, _ = value_network(
        time_steps.observation,
        time_steps.step_type,
        network_state=(),
        training=True)

    # Evaluate a fresh policy sample with the soft-Q network.
    sampled_action, log_prob, _, _, _ = actions_and_logprobs(
        policy_network, time_steps, training=False)
    sampled_q, _ = softq_network(
        (time_steps.observation, sampled_action),
        time_steps.step_type,
        network_state=(),
        training=False)

    value_target = tf.stop_gradient(sampled_q - log_prob)
    return value_loss_fn(predicted_value, value_target)

def compute_policy_loss(time_steps,
                        mean_lambda=1e-3,
                        std_lambda=1e-3,
                        z_lambda=0.0):
    """Policy objective plus L2 regularisers on the mean, log-std and pre-squash sample.

    The main term is `mean(log_prob * (log_prob - (Q(s, a) - V(s))))` for an
    action `a` sampled from the policy network. The three regularisers are the
    mean squares of `mean`, `log_std` and `z`, weighted by the `*_lambda`
    arguments.

    NOTE(review): no `tf.stop_gradient` is applied to the
    `(log_prob - log_prob_target)` factor, so that factor is differentiated as
    well -- confirm this is the intended estimator.
    """
    # Sample actions from the policy being optimised.
    action, log_prob, z, mean, log_std = actions_and_logprobs(
        policy_network, time_steps, training=True)

    # Baseline V(s) and critic estimate Q(s, a) for the sampled action.
    baseline_value, _ = value_network(
        time_steps.observation,
        time_steps.step_type,
        network_state=(),
        training=False)
    sampled_q, _ = softq_network(
        (time_steps.observation, action),
        time_steps.step_type,
        network_state=(),
        training=False)

    log_prob_target = sampled_q - baseline_value
    policy_loss = tf.reduce_mean(log_prob * (log_prob - log_prob_target))

    # Regularisation terms.
    mean_loss = mean_lambda * tf.reduce_mean(tf.math.pow(mean, 2))
    std_loss = std_lambda * tf.reduce_mean(tf.math.pow(log_std, 2))
    z_loss = z_lambda * tf.reduce_mean(tf.math.pow(z, 2))

    return policy_loss + mean_loss + std_loss + z_loss
    
def train(experience):
    """Run one training step on a sampled batch and return the summed loss.

    The soft-Q, value and policy networks are updated in that order, each with
    its own optimizer and a gradient tape that watches only that network's
    trainable variables. The target value network is then soft-updated with
    `target_update_tau`.

    Args:
        experience: batch of two-step trajectories sampled from the replay buffer.

    Returns:
        `softq_loss + value_loss + policy_loss` for this batch.
    """
    time_steps, actions, next_time_steps = experience_to_transitions(experience)

    # Soft Q Network Update
    # watch_accessed_variables=False keeps gradients restricted to the network being trained.
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(softq_network.trainable_variables)
        # Pass the configured discount explicitly rather than relying on the
        # function's own default value.
        softq_loss = compute_softq_loss(time_steps, actions, next_time_steps, gamma=gamma)

    tf.debugging.check_numerics(softq_loss, 'softq_loss is inf or nan.')
    softq_grads = tape.gradient(softq_loss, softq_network.trainable_variables)
    softq_optimizer.apply_gradients(list(zip(softq_grads, softq_network.trainable_variables)))

    # Value Network Update
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(value_network.trainable_variables)
        value_loss = compute_value_loss(time_steps)

    tf.debugging.check_numerics(value_loss, 'value_loss is inf or nan.')
    value_grads = tape.gradient(value_loss, value_network.trainable_variables)
    value_optimizer.apply_gradients(list(zip(value_grads, value_network.trainable_variables)))

    # Policy Network Update
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(policy_network.trainable_variables)
        policy_loss = compute_policy_loss(time_steps)

    # The message previously named value_loss (copy-paste slip), which
    # misattributed policy-loss numerics failures.
    tf.debugging.check_numerics(policy_loss, 'policy_loss is inf or nan.')
    policy_grads = tape.gradient(policy_loss, policy_network.trainable_variables)
    policy_optimizer.apply_gradients(list(zip(policy_grads, policy_network.trainable_variables)))

    loss = softq_loss + value_loss + policy_loss

    # Move the target value network towards the freshly updated value network.
    common.soft_variables_update(
        value_network.variables,
        target_value_network.variables,
        tau=target_update_tau)

    return loss

In [7]:
# Training Loop
# Wrap the training step with common.function. Note this rebinds the name
# `train`, so re-running this cell wraps the already wrapped function again.
train = common.function(train)

# Baseline evaluation of the untrained policy before any updates.
avg_return = compute_avg_return(eval_env, collect_policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_op.run()
    
    # Sample a batch of data from the buffer and update the agent's network.
    # The second element yielded by the dataset is discarded (this rebinds `_`).
    experience, _ = next(iterator)
    train_loss = train(experience)

    # Counters maintained by the collect driver's metric observers.
    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        # Evaluation reuses collect_policy on the separate eval environment.
        avg_return = compute_avg_return(eval_env, collect_policy, num_eval_episodes)
        print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
        returns.append(avg_return)

avg_return=-1195.135498046875; max_return=-1195.135498046875
step = 200: loss = 1.1680642366409302
step = 400: loss = 4.698259353637695
step = 600: loss = 1.518215537071228
step = 800: loss = 19.684005737304688
step = 1000: loss = 41.48929214477539
step = 1000: episodes=5: Average Return = -1279.041015625
step = 1200: loss = 43.52537155151367
step = 1400: loss = 5.897590637207031
step = 1600: loss = 45.54631805419922
step = 1800: loss = 29.014049530029297
step = 2000: loss = 60.02605438232422
step = 2000: episodes=10: Average Return = -1058.091796875
step = 2200: loss = 44.53179931640625
step = 2400: loss = 87.1839828491211
step = 2600: loss = 33.8336296081543
step = 2800: loss = 150.0789337158203
step = 3000: loss = 85.1163558959961
step = 3000: episodes=15: Average Return = -1164.81640625
step = 3200: loss = 63.95691680908203
step = 3400: loss = 9.742003440856934
step = 3600: loss = 51.67510986328125
step = 3800: loss = 52.92380905151367
step = 4000: loss = 278.74298095703125
step = 

KeyboardInterrupt: 

In [None]:
# Roll out ten rendered episodes with the collect policy and print each episode's return.
for i in range(10):
    time_step = eval_env.reset()
    rewards = 0.0
    while not time_step.is_last():
        action_step = collect_policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        rewards = rewards + time_step.reward
        # Render the underlying Python environment after every step.
        eval_py_env.render()
    print(rewards)

tf.Tensor([-1040.5687], shape=(1,), dtype=float32)
tf.Tensor([-583.11], shape=(1,), dtype=float32)
tf.Tensor([-2.2975605], shape=(1,), dtype=float32)
tf.Tensor([-358.5087], shape=(1,), dtype=float32)
tf.Tensor([-669.21893], shape=(1,), dtype=float32)
