In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import base64
import sys

import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

import dqn_agent
#from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
#from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# Turn on TensorFlow v2 behavior up front; later cells call `.numpy()` on
# tensors and variables.
tf.compat.v1.enable_v2_behavior()

In [None]:
# Hyperparameters read by the cells below.
env_name = 'CartPole-v1'  # @param
num_iterations = 10000  # @param

# Data collection and replay-buffer sizing.
initial_collect_steps = 1000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

# Only referenced by the commented-out QNetwork construction further down.
fc_layer_params = (100,)

# Optimisation settings and how often the training loop logs the loss.
batch_size = 64  # @param
learning_rate = 1e-3  # @param
log_interval = 200  # @param

# How many episodes each evaluation averages over, and how often it runs.
num_eval_episodes = 10  # @param
eval_interval = 1000  # @param

In [None]:
# Environment instance used by the next cell to print its specs.
env = suite_gym.load(env_name)

In [None]:
# Show the observation and action specs, each under its own heading line.
print('Observation Spec:', env.time_step_spec().observation, sep='\n')
print('Action Spec:', env.action_spec(), sep='\n')

In [None]:
# Two separately loaded Python environments: one for collecting training data,
# one for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# TF wrappers around the Python environments; the network, agent and policies
# below are built from `train_env`'s specs.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [None]:
import abc
import six
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step
from tensorflow.python.keras.engine import network as keras_network  # TF internal
from tensorflow.python.util import tf_decorator  # TF internal
from tensorflow.python.util import tf_inspect  # TF internal

class _NetworkMeta(abc.ABCMeta):
    """Metaclass that records the constructor arguments of network subclasses.

    For every class that defines its own `__init__` (other than the direct
    subclass of `keras_network.Network`), the constructor is wrapped so that all
    arguments are converted to keyword form and stored on the instance as
    `_saved_kwargs`.  `Network.copy` uses that dict to rebuild an equivalent
    instance.

    Raises:
      RuntimeError: at class-creation time, if `__init__` accepts `*args`
        (positional arguments could then not be mapped to names).
    """

    def __new__(mcs, classname, baseclasses, attrs):
        if baseclasses[0] == keras_network.Network:
            # This is just Network below.  Return early.
            return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)

        init = attrs.get("__init__", None)

        if not init:
            # This wrapper class does not define an __init__.  When someone creates
            # the object, the __init__ of its parent class will be called.  We will
            # call that __init__ instead separately since the parent class is also a
            # subclass of Network.  Here just create the class and return.
            return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)

        arg_spec = tf_inspect.getargspec(init)
        if arg_spec.varargs is not None:
            raise RuntimeError(
                "%s.__init__ function accepts *args.  This is not allowed." %
                classname)

        # Names of the positional parameters, with the leading `self` dropped.
        positional_names = arg_spec.args[1:]

        def capture_init(self, *args, **kwargs):
            # `args` never contains `self`, so compare against the parameter list
            # without `self`.  (The previous `len(arg_spec.args) + 1` bound let
            # over-long calls through to the name lookup, which then failed with
            # an IndexError instead of the constructor's own TypeError.)
            if len(args) > len(positional_names):
                # Error case: more inputs than args.  Call init so that the appropriate
                # error can be raised to the user.
                init(self, *args, **kwargs)
            # Re-express every positional argument as a keyword argument so the
            # full call can be replayed later from `_saved_kwargs`.
            for arg_name, arg in zip(positional_names, args):
                kwargs[arg_name] = arg
            init(self, **kwargs)
            setattr(self, "_saved_kwargs", kwargs)

        attrs["__init__"] = tf_decorator.make_decorator(init, capture_init)
        return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)


@six.add_metaclass(_NetworkMeta)
class Network(keras_network.Network):
    """Keras network that carries an input spec and a state spec.

    Subclasses get their constructor arguments recorded by `_NetworkMeta`, which
    is what makes `copy` possible.
    """

    def __init__(self, input_tensor_spec, state_spec, name):
        """Stores the specs and names the underlying Keras network.

        Args:
          input_tensor_spec: Nest of specs describing the network input.
          state_spec: Spec returned unchanged by the `state_spec` property.
          name: Name forwarded to the Keras base class.
        """
        super(Network, self).__init__(name=name)
        self._input_tensor_spec = input_tensor_spec
        self._state_spec = state_spec

    @property
    def state_spec(self):
        """Returns the state spec given at construction time."""
        return self._state_spec

    def _build(self):
        """Runs the network once on a sampled input so its layers get built.

        Does nothing if the network is already built or has no input spec.
        NOTE: `time_step` is resolved at call time and must still refer to the
        `tf_agents.trajectories.time_step` module, so avoid rebinding that name
        at notebook scope.
        """
        if not self.built and self.input_tensor_spec is not None:
            random_input = tensor_spec.sample_spec_nest(self.input_tensor_spec, outer_dims=(1,))
            step_type = tf.expand_dims(time_step.StepType.FIRST, 0)
            self.__call__(random_input, step_type, None)

    @property
    def input_tensor_spec(self):
        """Returns the spec of the input to the network of type InputSpec."""
        return self._input_tensor_spec

    @property
    def variables(self):
        """Return the variables for all the network layers.
        If the network hasn't been built, builds it on random input (generated
        using self._input_tensor_spec) to build all the layers and their variables.
        Raises:
        ValueError:  If the network fails to build.
        """
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            # six.reraise needs an exception *instance* as its value; passing a
            # bare string fails on Python 3 before the intended error is raised.
            six.reraise(
                ValueError,
                ValueError("Failed to call build on the network when accessing "
                           "variables. Message: {!r}.".format(e)),
                traceback)
        return self.weights

    def copy(self, **kwargs):
        """Creates a new instance of the same class from the saved constructor kwargs.

        Args:
          **kwargs: Overrides applied on top of the recorded constructor arguments.

        Returns:
          A freshly constructed network of the same type.
        """
        return type(self)(**dict(self._saved_kwargs, **kwargs))

    def __call__(self, inputs, *args, **kwargs):
        """Checks `inputs` against the input spec structure, then calls the network."""
        tf.nest.assert_same_structure(inputs, self.input_tensor_spec)
        return super(Network, self).__call__(inputs, *args, **kwargs)


class DDQN(tf.keras.Model):
    """Dueling Q-network for discrete actions, passed to the agent as `q_network`.

    Two shared dense layers feed two heads: an advantage head (one output per
    action, centred by subtracting its mean over actions) and a single-output
    value head.  The returned Q-values are the sum of both heads.

    NOTE(review): the `state_spec` constructor argument is read as the *action*
    spec (its `minimum`/`maximum` determine the number of outputs).  The
    parameter name is kept so existing callers keep working.
    """

    def __init__(self, input_tensor_spec, state_spec, name="DDQN"):
        """Creates the layers.

        Args:
          input_tensor_spec: Spec of the observations fed to the network.
          state_spec: Nest whose first flattened element is a bounded spec with
            `minimum` and `maximum`; used to size the advantage head.
          name: Name of the Keras model.
        """
        # Forward `name` so the model (and its copies) are actually named.
        super(DDQN, self).__init__(name=name)

        self._input_tensor_spec = input_tensor_spec
        # The network keeps no recurrent state, so its own state spec is empty.
        self._state_spec = ()
        # Kept verbatim so `copy` can rebuild an identical network.
        self.init_state_spec = state_spec

        hidden_units = 32
        action_spec = tf.nest.flatten(state_spec)[0]
        num_actions = action_spec.maximum - action_spec.minimum + 1

        # Shared layers
        self._shared1 = tf.keras.layers.Dense(hidden_units,
                         activation='relu',
                         kernel_initializer='he_uniform')

        self._shared2 = tf.keras.layers.Dense(hidden_units,
                         activation='relu',
                         kernel_initializer='he_uniform')

        # Learn advantage
        self._adv1 = tf.keras.layers.Dense(hidden_units,
                         activation='relu',
                         kernel_initializer='he_uniform')
        self._adv2 = tf.keras.layers.Dense(num_actions,
                         activation='linear',
                         kernel_initializer='he_uniform')
        # Centre the advantages by removing their mean over the action axis.
        self._adv3 = tf.keras.layers.Lambda(lambda a: a - tf.keras.backend.mean(a, axis=1, keepdims=True),
                                            output_shape=(num_actions,))

        # Learn value
        self._value1 = tf.keras.layers.Dense(hidden_units,
                         activation='relu',
                         kernel_initializer='he_uniform')
        self._value2 = tf.keras.layers.Dense(1,
                         activation='linear',
                         kernel_initializer='he_uniform')

        # Combine into Q
        self._q_out = tf.keras.layers.Add()

    def call(self, observation, step_type=None, network_state=()):
        """Computes Q-values for a batch of observations.

        Args:
          observation: Batch of observations.
          step_type: Accepted for interface compatibility; not used.
          network_state: Returned unchanged.

        Returns:
          A tuple `(q_values, network_state)`.
        """
        shared = self._shared1(observation)
        shared = self._shared2(shared)
        adv = self._adv1(shared)
        adv = self._adv2(adv)
        adv = self._adv3(adv)
        value = self._value1(shared)
        # Feed the value head's hidden layer into its output layer.  Passing
        # `shared` here (as before) bypassed `_value1` entirely.
        value = self._value2(value)
        q_out = self._q_out([adv, value])
        return q_out, network_state

    def copy(self, name):
        """Returns a new, independently initialised DDQN with the same specs.

        Args:
          name: Name for the new network.
        """
        return type(self)(self._input_tensor_spec, self.init_state_spec, name)

    @property
    def state_spec(self):
        """Returns the (empty) network state spec."""
        return self._state_spec

    def _build(self):
        """Runs the network once on a sampled input so its layers get built.

        Does nothing if the model is already built or has no input spec.
        """
        if not self.built and self.input_tensor_spec is not None:
            random_input = tensor_spec.sample_spec_nest(self.input_tensor_spec, outer_dims=(1,))
            #step_type = tf.expand_dims(time_step.StepType.FIRST, 0)
            self.__call__(random_input, None, None)

    @property
    def input_tensor_spec(self):
        """Returns the spec of the input to the network of type InputSpec."""
        return self._input_tensor_spec

    @property
    def variables(self):
        """Return the variables for all the network layers.
        If the network hasn't been built, builds it on random input (generated
        using self._input_tensor_spec) to build all the layers and their variables.
        Raises:
        ValueError:  If the network fails to build.
        """
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            # six.reraise needs an exception *instance* as its value; passing a
            # bare string fails on Python 3 before the intended error is raised.
            six.reraise(
                ValueError,
                ValueError("Failed to call build on the network when accessing "
                           "variables. Message: {!r}.".format(e)),
                traceback)
        return self.weights




In [None]:
# Build the Q-network.  The second positional argument fills DDQN's
# `state_spec` parameter, which DDQN reads as the action spec to size its
# output layer.
q_net = DDQN(
    train_env.observation_spec(),
    train_env.action_spec())

# Alternative using the stock TF-Agents QNetwork (its import is commented out
# at the top of the notebook as well):
#q_net = q_network.QNetwork(
#    train_env.observation_spec(),
#    train_env.action_spec(),
#    fc_layer_params=fc_layer_params)

In [None]:
# Adam optimizer from the TF1-compat API, using `learning_rate` from the
# hyperparameter cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Step counter handed to the agent; the training loop reads it back to decide
# when to log and evaluate.
train_step_counter = tf.compat.v2.Variable(0)

# `dqn_agent` is the local module imported at the top of the notebook, not
# `tf_agents.agents.dqn.dqn_agent` (that import is commented out).
tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
    train_step_counter=train_step_counter)
tf_agent.initialize()

In [None]:
# Convenience aliases for the agent's two policies.
# NOTE(review): the visible cells below read `tf_agent.policy` and
# `tf_agent.collect_policy` directly rather than these names.
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

In [None]:
# Random policy over the training environment's specs; used below to fill the
# replay buffer before training starts.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Runs `policy` for several full episodes and averages the summed rewards.

    Args:
      environment: Object providing `reset()` and `step(action)`, both returning
        a time step with `is_last()` and `reward`.
      policy: Object whose `action(time_step)` result exposes `.action`.
      num_episodes: Number of episodes to average over.

    Returns:
      First element of the averaged return after `.numpy()` conversion.
    """

    def _play_one_episode():
        # Roll out a single episode, adding up the reward of every step taken.
        current = environment.reset()
        collected = 0.0
        while not current.is_last():
            chosen = policy.action(current)
            current = environment.step(chosen.action)
            collected += current.reward
        return collected

    return_sum = 0.0
    for _ in range(num_episodes):
        return_sum += _play_one_episode()

    # `.numpy()[0]` unwraps the first element of the accumulated reward value.
    return (return_sum / num_episodes).numpy()[0]

In [None]:
# Replay buffer laid out according to the agent's `collect_data_spec`.
# Its `batch_size` comes from the environment, not from the training
# `batch_size` in the hyperparameter cell.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

In [None]:
def collect_step(environment, policy):
    """Takes one step in `environment` under `policy` and stores the transition.

    The transition is written to the notebook-level `replay_buffer`.

    Args:
      environment: Environment providing `current_time_step()` and `step(action)`.
      policy: Policy whose `action(time_step)` result exposes `.action`.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(trajectory.from_transition(before, decision, after))

# Fill the replay buffer with `initial_collect_steps` random-policy steps
# before any training happens.
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

In [None]:
# Dataset generates trajectories with shape [Bx2x...]
# (`num_steps=2` requests two consecutive steps per sampled item.)
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)

# Consumed with `next(iterator)` by the training cells below.
iterator = iter(dataset)

In [None]:
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
# NOTE(review): this rebinds `tf_agent.train`, so re-running this cell wraps
# the already-wrapped function again.
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, tf_agent.collect_policy)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)

    # Read the counter back as a plain value to drive the logging/eval cadence.
    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        # Periodic evaluation; each result becomes one point in the plot below.
        avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)


In [None]:
# X axis: step 0 (the pre-training evaluation) plus one point per
# `eval_interval` training steps, matching the entries appended to `returns`.
steps = range(0, num_iterations + 1, eval_interval)
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel('Step')
# NOTE(review): 550 is presumably headroom above the best achievable return
# for this environment; confirm before changing `env_name`.
plt.ylim(top=550)

In [None]:
# Roll out one evaluation episode with the agent's policy, rendering each step.
# The loop variable is deliberately not called `time_step`: at notebook scope
# that name is the `tf_agents.trajectories.time_step` module imported earlier,
# and `Network._build` looks it up at call time.
eval_time_step = eval_env.reset()
while not eval_time_step.is_last():
    action_step = tf_agent.policy.action(eval_time_step)
    eval_time_step = eval_env.step(action_step.action)
    eval_py_env.render()

In [None]:
def _experience_to_transitions(experience):
    """Splits sampled experience into (time_steps, actions, next_time_steps).

    Axis 1 is squeezed out of every tensor of the transition before unpacking.

    Args:
      experience: Trajectory batch accepted by `trajectory.to_transition`.

    Returns:
      Tuple of the current time steps, the actions taken, and the next time steps.
    """
    squeezed = tf.nest.map_structure(
        lambda tensor: tf.squeeze(tensor, [1]),
        trajectory.to_transition(experience))

    current_steps, taken_steps, following_steps = squeezed
    return current_steps, taken_steps.action, following_steps

def element_wise_squared_loss(x, y):
    """Squared-error loss between `x` and `y` with reduction disabled."""
    no_reduction = tf.compat.v1.losses.Reduction.NONE
    return tf.compat.v1.losses.mean_squared_error(x, y, reduction=no_reduction)

def element_wise_huber_loss(x, y):
    """Huber loss between `x` and `y` with reduction disabled."""
    no_reduction = tf.compat.v1.losses.Reduction.NONE
    return tf.compat.v1.losses.huber_loss(x, y, reduction=no_reduction)

def compute_td_targets(next_q_values, rewards, discounts):
    """Returns `rewards + discounts * next_q_values` with gradients blocked."""
    bootstrapped = discounts * next_q_values
    return tf.stop_gradient(rewards + bootstrapped)

def index_with_actions(q_values, actions, multi_dim_actions=False):
    """Gathers from `q_values` the entries selected by `actions`.

    Args:
      q_values: Tensor to gather from.
      actions: Integer tensor of action indices; its rank must be known.
      multi_dim_actions: If True, the last dimension of `actions` is itself an
        index vector and is not treated as a batch dimension.

    Returns:
      Result of `tf.gather_nd` over `q_values` using batch indices combined
      with `actions`.

    Raises:
      ValueError: If the rank of `actions` is unknown.
    """
    rank = actions.shape.ndims
    if rank is None:
        raise ValueError('actions should have known rank.')

    # In the multidimensional case the last dimension of `actions` indexes the
    # action vector for each batch entry, so it is not a batch dimension.
    num_batch_dims = rank - 1 if multi_dim_actions else rank

    dims = tf.shape(input=actions)
    axis_ranges = [tf.range(dims[axis]) for axis in range(num_batch_dims)]
    index_columns = [
        tf.expand_dims(grid, -1)
        for grid in tf.meshgrid(*axis_ranges, indexing='ij')
    ]

    # Scalar actions need a trailing axis so they can be concatenated.
    last_column = actions if multi_dim_actions else tf.expand_dims(actions, -1)

    return tf.gather_nd(q_values, tf.concat(index_columns + [last_column], -1))

def _compute_next_q_values(self, next_time_steps):
    """Returns the maximum over the last axis of the target network's Q-values.

    Args:
      self: Agent-like object exposing `_target_q_network`.
      next_time_steps: Time steps providing `observation` and `step_type`.
    """
    target_q_values, _unused_state = self._target_q_network(
        next_time_steps.observation,
        next_time_steps.step_type)
    return tf.reduce_max(input_tensor=target_q_values, axis=-1)

def _loss(self, experience, reward_scale_factor=1.0, gamma = 1.0):
    """Computes the TD loss of the agent's Q-network on a batch of experience.

    Args:
      self: Agent-like object exposing `_experience_to_transitions`,
        `_q_network`, `_action_spec` and `_compute_next_q_values`.
      experience: Sampled trajectory batch.
      reward_scale_factor: Multiplier applied to the rewards.
      gamma: Multiplier applied to the per-step discounts.

    Returns:
      Tuple `(loss, td_loss, td_error)`: the mean of the masked per-element
      squared loss, the masked per-element loss, and the masked TD error.
    """
    # NOTE(review): this calls the agent's own `_experience_to_transitions` and
    # `_compute_next_q_values` methods, not the same-named module-level
    # functions defined alongside this one.
    time_steps, actions, next_time_steps = self._experience_to_transitions(experience)

    with tf.name_scope('loss'):
        actions = tf.nest.flatten(actions)[0]
        q_values, _ = self._q_network(time_steps.observation, time_steps.step_type)

        # Flatten the action spec once; actions with rank > 0 carry an extra
        # index dimension that `index_with_actions` must not treat as batch.
        action_spec = tf.nest.flatten(self._action_spec)[0]
        multi_dim_actions = action_spec.shape.ndims > 0
        q_values = index_with_actions(q_values,
                                      tf.cast(actions, dtype=tf.int32),
                                      multi_dim_actions=multi_dim_actions)

        next_q_values = self._compute_next_q_values(next_time_steps)

        td_targets = compute_td_targets(next_q_values,
                                        rewards=reward_scale_factor * next_time_steps.reward,
                                        discounts=gamma * next_time_steps.discount)

        # Zero out entries whose starting time step is the last of an episode.
        valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
        td_error = valid_mask * (td_targets - q_values)

        td_loss = valid_mask * element_wise_squared_loss(td_targets, q_values)

        loss = tf.reduce_mean(input_tensor=td_loss)

        return loss, td_loss, td_error
    
def _train(self, experience):
    """Runs one gradient step on the agent's Q-network and updates the target.

    Args:
      self: Agent-like object exposing `_q_network`, `_optimizer`,
        `train_step_counter` and `_update_target`.
      experience: Sampled trajectory batch passed on to `_loss`.

    Returns:
      The `(loss, td_loss, td_error)` tuple produced by `_loss`.
    """
    with tf.GradientTape() as tape:
        total_loss, per_item_loss, per_item_error = _loss(self, experience)

    weights = self._q_network.trainable_weights
    gradients = tape.gradient(total_loss, weights)

    # Materialise the pairs as a tuple: on py3 `zip` is a one-shot iterator.
    self._optimizer.apply_gradients(
        tuple(zip(gradients, weights)),
        global_step=self.train_step_counter)

    self._update_target()

    return total_loss, per_item_loss, per_item_error

In [None]:
# Pull one more batch and run the hand-written `_train` step on it.
# Note that this applies a real gradient update and advances the agent's
# train step counter.
experience, unused_info = next(iterator)

loss, td_loss, td_error = _train(tf_agent, experience)

In [None]:
# Show the three values returned by the manual train step, one per line.
print(loss, td_loss, td_error, sep='\n')