In [1]:
import collections
import numpy as np
import tensorflow as tf

from tf_agents.utils import common
from tf_agents.utils import composite
from tf_agents.utils import training as training_lib
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.policies import tf_policy
from tf_agents.networks import network
from tf_agents.agents import tf_agent
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver

# Switch TensorFlow to its 2.x behavior through the compat.v1 API before any
# TF objects (environments, networks, variables) are created below.
tf.compat.v1.enable_v2_behavior()

In [2]:
# --- Training schedule -------------------------------------------------------
# Number of collect/train iterations executed by the training loop.
num_iterations = 750000
# Interval (in environment steps) between loss printouts.
log_interval = 5000
# Interval (in environment steps) between policy evaluations.
eval_interval = 25000
# Episodes averaged for each evaluation.
num_eval_episodes = 10

# --- Data collection ---------------------------------------------------------
# Steps gathered with a random policy before training starts.
initial_collect_steps = 1000
# Environment steps gathered by the collect policy on every iteration.
collect_steps_per_iteration = 1
# Replay buffer capacity; sized to hold one entry per training iteration.
replay_buffer_max_length = num_iterations
# Step cap passed to the training environment.
max_episode_steps = 1000

# --- Optimisation ------------------------------------------------------------
# Number of 2-step sequences sampled from the replay buffer per update.
batch_size = 256
# Adam learning rate.
learning_rate = 3e-4

# --- Agent -------------------------------------------------------------------
# Discount factor applied to bootstrapped values.
gamma = 0.99
# Softmax temperature used by the policy and by the TD target.
temp = 0.001

In [3]:
env_name = 'LunarLander-v2'

# Python-side gym environments. Only the training environment is given an
# explicit step cap; the evaluation environment is loaded with the defaults.
train_py_env = suite_gym.load(environment_name=env_name,
                              max_episode_steps=max_episode_steps)
eval_py_env = suite_gym.load(environment_name=env_name)

# TensorFlow wrappers used by the drivers, policies and the agent.
train_env = tf_py_environment.TFPyEnvironment(environment=train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(environment=eval_py_env)

# Show the observation and action specs the networks must conform to.
print(train_env.observation_spec(), train_env.action_spec(), sep='\n')

BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32))
BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(3))


In [4]:
class ActionValueNetwork(network.Network):
    """Stateless Q-network: one ReLU hidden layer followed by a linear head.

    The head has one output per discrete action, where the number of actions
    is derived from the bounds of the first (flattened) action spec.
    """

    def __init__(self, input_tensor_spec, action_spec, hidden_units, name=None):
        """Builds the two dense layers.

        Args:
            input_tensor_spec: spec of the observations fed to the network.
            action_spec: (possibly nested) bounded spec of the discrete action;
                only its first flattened element is used.
            hidden_units: width of the hidden layer.
            name: optional network name.
        """
        super().__init__(input_tensor_spec=input_tensor_spec,
                         state_spec=(),
                         name=name)

        flat_action_spec = tf.nest.flatten(action_spec)[0]
        # Inclusive bounds, hence the +1.
        num_actions = flat_action_spec.maximum - flat_action_spec.minimum + 1

        # NOTE: the attribute names below are kept as-is on purpose; saved
        # weights are restored by attribute path.
        self._hidden = tf.keras.layers.Dense(
            units=hidden_units,
            activation=tf.keras.activations.relu,
            kernel_initializer='orthogonal',
            input_shape=input_tensor_spec.shape)

        self._q_value_layer = tf.keras.layers.Dense(
            units=num_actions,
            activation=None,
            kernel_initializer='orthogonal')

    def call(self, observation, step_type=None, network_state=(), training=False):
        """Returns `(q_values, network_state)` for a batch of observations.

        `step_type` is accepted for interface compatibility and is not used;
        `network_state` is passed through unchanged.
        """
        hidden = self._hidden(observation, training=training)
        q_values = self._q_value_layer(hidden, training=training)
        return q_values, network_state

In [5]:
q_net = ActionValueNetwork(train_env.observation_spec(),
                           train_env.action_spec(),
                           256)

# Resume from a previous run when a checkpoint exists. The training cell writes
# to this same prefix, so on a fresh checkout there is nothing to load yet and
# an unconditional load would abort "Restart & Run All".
weights_path = "./lunar_expected/saved"
# TensorFlow-format checkpoints are stored as `<prefix>.index` plus data shards.
if tf.io.gfile.exists(weights_path + ".index"):
    q_net.load_weights(weights_path)
else:
    print("No checkpoint found at {}; starting from freshly initialised weights.".format(weights_path))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x10ab652d0>

In [6]:
def softmax_with_temp(q_values, temp):
    """Numerically stable softmax of `q_values / temp` over the last axis.

    Args:
        q_values: tensor of action values with the actions on the last axis
            (for the rank-2 `[batch, num_actions]` case this is axis 1).
        temp: temperature the values are divided by; smaller values make the
            distribution greedier. Must be non-zero.

    Returns:
        Tensor with the same shape as `q_values` whose last axis sums to 1.
    """
    preferences = tf.divide(q_values, temp)
    # Subtract the per-row maximum so the largest exponent is exp(0) = 1,
    # which avoids overflow when `temp` is small.
    max_preference = tf.math.reduce_max(preferences, axis=-1, keepdims=True)
    exp_preferences = tf.exp(preferences - max_preference)
    # Reuse the exponentials instead of computing them a second time.
    sum_exp_preferences = tf.reduce_sum(exp_preferences, axis=-1, keepdims=True)
    return tf.divide(exp_preferences, sum_exp_preferences)

class SoftMaxPolicyWithTemp(tf_policy.Base):
    """Stochastic policy that samples actions from a temperature softmax of Q-values."""

    def __init__(self, time_step_spec, action_spec, q_network, temp, name=None):
        """Creates the policy.

        Args:
            time_step_spec: spec of the time steps the policy receives.
            action_spec: spec of the actions the policy emits.
            q_network: network called as `q_network(observation, step_type,
                policy_state)` and returning `(q_values, state)`.
            temp: softmax temperature applied to the Q-values.
            name: optional policy name.
        """
        super(SoftMaxPolicyWithTemp, self).__init__(
            time_step_spec,
            action_spec,
            policy_state_spec=q_network.state_spec,
            name=name)

        self._temp = temp
        self._q_network = q_network

    def _variables(self):
        """Returns the variables of the wrapped Q-network."""
        return self._q_network.variables

    def _action(self, time_step, policy_state, seed):
        """Samples one action per batch element.

        Args:
            time_step: batched time step whose observation is fed to the network.
            policy_state: network state forwarded to (and returned by) the network.
            seed: optional seed forwarded to the categorical sampler.

        Returns:
            A `PolicyStep` with the sampled actions (shape `[batch]`) and the
            updated policy state.
        """
        network_observation = time_step.observation
        q_values, policy_state = self._q_network(
            network_observation, time_step.step_type, policy_state)
        probs = softmax_with_temp(q_values, self._temp)
        # The sampler takes (unnormalised) log-probabilities. Forward `seed`
        # so that callers asking for seeded sampling actually get it.
        sampled = tf.random.categorical(tf.math.log(probs), 1, seed=seed)
        # `sampled` is [batch, 1]; drop only the sample axis so the batch
        # dimension survives for any batch size (a bare squeeze followed by
        # expand_dims is only correct for batch size 1).
        action = tf.squeeze(sampled, axis=-1)
        return policy_step.PolicyStep(action, policy_state)

In [7]:
class MyLossInfo(collections.namedtuple('MyLossInfo', ['td_loss', 'td_error'])):
    """Extra loss details: the per-element TD loss and the TD error."""

def compute_td_targets(next_q_values, rewards, discounts):
    """Returns `rewards + discounts * next_q_values` with gradients blocked.

    The result is wrapped in `tf.stop_gradient` so that the target is treated
    as a constant when the loss is differentiated.
    """
    bootstrapped = discounts * next_q_values
    targets = rewards + bootstrapped
    return tf.stop_gradient(targets)

class MyAgent(tf_agent.TFAgent):
    """Q-learning style agent with a softmax policy and an expectation target.

    The TD target bootstraps from the softmax-weighted expectation of the
    target network's action values at the next step (an Expected-SARSA style
    target), and both the acting and the collecting policy sample from the same
    temperature softmax over the online network's Q-values.
    """

    def __init__(self,
                 time_step_spec,
                 action_spec,
                 q_network,
                 optimizer,
                 gamma,
                 temp,
                 n_step_update=1,
                 train_step_counter=None,
                 name=None):
        """Creates the agent.

        Args:
            time_step_spec: spec of the environment time steps.
            action_spec: spec of the environment actions.
            q_network: online Q-network; a copy of it is used as target network.
            optimizer: optimizer applied to the online network's trainable weights.
            gamma: discount factor multiplied into the environment discount.
            temp: softmax temperature used by the policies and the TD target.
            n_step_update: sets `train_sequence_length = n_step_update + 1`.
                NOTE(review): the transition handling squeezes axis 1, so only
                the default of 1 looks supported - confirm before changing.
            train_step_counter: optional variable incremented on every train step.
            name: optional module name.
        """
        tf.Module.__init__(self, name=name)

        self._q_network = q_network
        self._q_network.create_variables()
        self._target_q_network = common.maybe_copy_target_network_with_checks(
            self._q_network, None, 'TargetQNetwork')

        self._n_step_update = n_step_update
        self._optimizer = optimizer
        self._td_errors_loss_fn = common.element_wise_squared_loss
        self._gamma = gamma
        self._update_target = self._get_target_updater()

        self._temp = temp

        # Build the policies from the constructor arguments rather than from
        # notebook globals, so the agent acts with the network it trains and
        # does not depend on which cells happened to run before it.
        collect_policy = SoftMaxPolicyWithTemp(
            time_step_spec, action_spec, q_network=self._q_network, temp=self._temp)
        policy = SoftMaxPolicyWithTemp(
            time_step_spec, action_spec, q_network=self._q_network, temp=self._temp)

        train_sequence_length = n_step_update + 1

        print("gamma={}; temp={}".format(gamma, temp))
        super(MyAgent, self).__init__(
            time_step_spec,
            action_spec,
            policy,
            collect_policy,
            train_sequence_length=train_sequence_length,
            train_step_counter=train_step_counter)

    def _get_target_updater(self, tau=1.0, period=1):
        """Returns a callable that periodically syncs the target network.

        Args:
            tau: interpolation factor for the soft update; 1.0 is a full copy.
            period: number of calls between two actual updates.
        """
        with tf.name_scope('update_targets'):

            def update():
                return common.soft_variables_update(
                    self._q_network.variables,
                    self._target_q_network.variables,
                    tau,
                    tau_non_trainable=1.0)

        return common.Periodically(update, period, 'periodic_update_targets')

    def _initialize(self):
        """Copies the online network's variables into the target network."""
        common.soft_variables_update(
            self._q_network.variables, self._target_q_network.variables, tau=1.0)

    def _experience_to_transitions(self, experience):
        """Splits sampled experience into `(time_steps, actions, next_time_steps)`.

        Assumes the experience carries a length-2 time axis at position 1,
        which is squeezed away after the conversion - TODO confirm for any
        sampler other than `num_steps=2`.
        """
        transitions = trajectory.to_transition(experience)
        transitions = tf.nest.map_structure(lambda x: composite.squeeze(x, 1),
                                            transitions)
        time_steps, policy_steps, next_time_steps = transitions
        actions = policy_steps.action
        return time_steps, actions, next_time_steps

    # Use @common.function in graph mode or for speeding up.
    def _train(self, experience, weights=None):
        """Runs one gradient step on `experience` and syncs the target network.

        Args:
            experience: batch of sampled trajectories.
            weights: accepted for interface compatibility; not applied to the loss.

        Returns:
            The `LossInfo` computed for this batch.
        """
        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, training=True)

        tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._q_network.trainable_weights
        assert list(variables_to_train), "No variables in the agent's q_network."
        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Materialise the pairs in a list: in py3 zip is a one-shot iterator.
        grads_and_vars = list(zip(grads, variables_to_train))

        training_lib.apply_gradients(
            self._optimizer, grads_and_vars, global_step=self.train_step_counter)

        self._update_target()

        return loss_info

    def _loss(self, experience, training=False):
        """Computes the mean squared TD loss for a batch of experience.

        Returns:
            `LossInfo` whose `extra` is a `MyLossInfo` holding the per-element
            TD loss and TD error.
        """
        time_steps, actions, next_time_steps = self._experience_to_transitions(experience)

        with tf.name_scope('loss'):
            q_values = self._compute_q_values(time_steps, actions, training=training)

            next_q_values = self._compute_next_q_values(next_time_steps)

            td_targets = compute_td_targets(
                next_q_values,
                rewards=next_time_steps.reward,
                discounts=self._gamma * next_time_steps.discount)

            # Transitions that start on a final step are masked out of both the
            # error and the loss.
            valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
            td_error = valid_mask * (td_targets - q_values)

            td_loss = valid_mask * self._td_errors_loss_fn(td_targets, q_values)

            loss = tf.reduce_mean(input_tensor=td_loss)

            # Add any losses registered on the network itself.
            if self._q_network.losses:
                loss = loss + tf.reduce_mean(self._q_network.losses)

            return tf_agent.LossInfo(loss, MyLossInfo(td_loss=td_loss,
                                                      td_error=td_error))

    def _compute_q_values(self, time_steps, actions, training=False):
        """Returns the online network's Q-value of the action taken at each step."""
        network_observation = time_steps.observation

        q_values, _ = self._q_network(network_observation, time_steps.step_type,
                                      training=training)
        multi_dim_actions = self._action_spec.shape.rank > 0
        return common.index_with_actions(
            q_values,
            tf.cast(actions, dtype=tf.int32),
            multi_dim_actions=multi_dim_actions)

    def _compute_next_q_values(self, next_time_steps):
        """Returns the softmax-weighted expected target Q-value of the next steps."""
        network_observation = next_time_steps.observation

        next_target_q_values, _ = self._target_q_network(
            network_observation, next_time_steps.step_type)

        probs = softmax_with_temp(next_target_q_values, self._temp)

        # Reduce over the action axis only. A bare tf.squeeze would also drop
        # the batch axis when the batch holds a single element.
        return tf.math.reduce_sum(next_target_q_values * probs, axis=1)

In [8]:
# Adam optimiser driven by the configured learning rate.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Counter that is advanced each time gradients are applied.
train_step_counter = tf.Variable(0)

# Agent trained on `q_net` with the specs of the training environment.
agent = MyAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    gamma=gamma,
    temp=temp,
    train_step_counter=train_step_counter,
)

agent.initialize()

gamma=0.99; temp=0.001


In [9]:
# Replay buffer holding the trajectories used for training.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length,
)

# Seed the buffer with experience from a uniformly random policy.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps,
).run()

# Driver used inside the training loop. Besides filling the buffer it feeds
# the episode and step counters that the loop reads for logging.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps, replay_buffer.add_batch]

collect_op = dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=agent.collect_policy,
    observers=observers,
    num_steps=collect_steps_per_iteration,
)

# Dataset of 2-step sequences sampled from the buffer, prefetched for the
# training loop.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
)
dataset = dataset.prefetch(3)

iterator = iter(dataset)

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Runs `policy` for `num_episodes` episodes and returns the mean return.

    Args:
        environment: environment exposing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: object whose `action(time_step)` result has an `.action`.
        num_episodes: number of episodes to average over.

    Returns:
        First element of the averaged return converted with `.numpy()`.
    """
    return_sum = 0.0
    for _ in range(num_episodes):
        current = environment.reset()
        ep_return = 0.0

        # Step until the environment signals the end of the episode.
        while True:
            if current.is_last():
                break
            chosen = policy.action(current)
            current = environment.step(chosen.action)
            ep_return += current.reward

        return_sum += ep_return

    mean_return = return_sum / num_episodes
    return mean_return.numpy()[0]

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training. This first value seeds
# `returns`, the history that later evaluations are compared against.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
print("avg_return={}; max_return={}".format(avg_return, np.amax(returns)))

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_op.run()
    
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Both counters are observers of `collect_op` only, so steps gathered by
    # the initial random-policy driver are not included in `step`.
    step = env_steps.result().numpy()
    episodes = num_episodes.result().numpy()

    # Periodic loss report, keyed on collected environment steps.
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    # Periodic evaluation on the separate evaluation environment.
    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: episodes={1}: Average Return = {2}'.format(step, episodes, avg_return))
        # Checkpoint only when this evaluation beats every earlier one; the
        # comparison runs before the new value is appended to the history.
        if (avg_return > max(returns)):
            # Save the online Q-network's weights.
            print("Save Weights: avg_return={}; max_return={}".format(avg_return, np.amax(returns)))
            agent._q_network.save_weights("./lunar_expected/saved")
        returns.append(avg_return)

In [11]:
# Play 15 rendered episodes with the agent's policy and print each episode's
# accumulated reward.
for i in range(15):
    rewards = 0.0
    time_step = eval_env.reset()
    while True:
        if time_step.is_last():
            break
        action_step = agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        rewards += time_step.reward
        # Render through the underlying Python environment.
        eval_py_env.render()
    print(rewards)

tf.Tensor([178.8996], shape=(1,), dtype=float32)
tf.Tensor([-47.12119], shape=(1,), dtype=float32)
tf.Tensor([-35.595383], shape=(1,), dtype=float32)
tf.Tensor([241.51048], shape=(1,), dtype=float32)
tf.Tensor([199.51816], shape=(1,), dtype=float32)
tf.Tensor([35.399612], shape=(1,), dtype=float32)
tf.Tensor([-74.43492], shape=(1,), dtype=float32)
tf.Tensor([145.70358], shape=(1,), dtype=float32)
tf.Tensor([158.64182], shape=(1,), dtype=float32)
tf.Tensor([-66.16146], shape=(1,), dtype=float32)
tf.Tensor([233.58209], shape=(1,), dtype=float32)
tf.Tensor([116.82008], shape=(1,), dtype=float32)
tf.Tensor([117.16545], shape=(1,), dtype=float32)
tf.Tensor([-47.695408], shape=(1,), dtype=float32)
tf.Tensor([125.30995], shape=(1,), dtype=float32)
