In [1]:
# Note: If you haven't installed tf-agents yet, run:
try:
    %tensorflow_version 2.x
except:
    pass
!pip install --upgrade tensorflow-probability
!pip install tf-agents

Looking in indexes: https://pypi.org/simple, https://artifactory.spotify.net/artifactory/api/pypi/pypi/simple/
Requirement already up-to-date: tensorflow-probability in /Users/lingh/.pyenv/versions/3.7.0/envs/my-virtual-env-3.7.0/lib/python3.7/site-packages (0.9.0)
Looking in indexes: https://pypi.org/simple, https://artifactory.spotify.net/artifactory/api/pypi/pypi/simple/


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.networks import network

from tf_agents.policies import py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import scripted_py_policy

from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import actor_policy
from tf_agents.policies import q_policy
from tf_agents.policies import greedy_policy

from tf_agents.trajectories import time_step as ts

tf.compat.v1.enable_v2_behavior()

In Reinforcement Learning terminology, policies map an observation from the environment to an action or a distribution over actions. In TF-Agents, observations from the environment are contained in a named tuple TimeStep('step_type', 'discount', 'reward', 'observation'), and policies map timesteps to actions or distributions over actions. Most policies use timestep.observation, some policies use timestep.step_type (e.g. to reset the state at the beginning of an episode in stateful policies), but timestep.discount and timestep.reward are usually ignored.

Policies are related to other components in TF-Agents in the following way. Most policies have a neural network to compute actions and/or distributions over actions from TimeSteps. Agents can contain one or more policies for different purposes, e.g. a main policy that is being trained for deployment, and a noisy policy for data collection. Policies can be saved/restored, and can be used indepedently of the agent for data collection, evaluation etc.

Some policies are easier to write in Tensorflow (e.g. those with a neural network), whereas others are easier to write in Python (e.g. following a script of actions). So in TF agents, we allow both Python and Tensorflow policies. Morever, policies written in TensorFlow might have to be used in a Python environment, or vice versa, e.g. a TensorFlow policy is used for training but later deployed in a production python environment. To make this easier, we provide wrappers for converting between python and TensorFlow policies.

Another interesting class of policies are policy wrappers, which modify a given policy in a certain way, e.g. add a particular type of noise, make a greedy or epsilon-greedy version of a stochastic policy, randomly mix multiple policies etc.

### Python Policies

In [4]:
class Base(object):

    @abc.abstractmethod
    def __init__(self, time_step_spec, action_spec, policy_state_spec=()):
        self._time_step_spec = time_step_spec
        self._action_spec = action_spec
        self._policy_state_spec = policy_state_spec

    @abc.abstractmethod
    def reset(self, policy_state=()):
    # return initial_policy_state.
        pass

    @abc.abstractmethod
    def action(self, time_step, policy_state=()):
    # return a PolicyStep(action, state, info) named tuple.
        pass

    @abc.abstractmethod
    def distribution(self, time_step, policy_state=()):
    # Not implemented in python, only for TF policies.
        pass

    @abc.abstractmethod
    def update(self, policy):
    # update self to be similar to the input `policy`.
        pass

    @abc.abstractmethod
    def copy(self):
    # return a copy of self.
        pass

    @property
    def time_step_spec(self):
        return self._time_step_spec

    @property
    def action_spec(self):
        return self._action_spec

    @property
    def policy_state_spec(self):
        return self._policy_state_spec

The most important method is action(time_step) which maps a time_step containing an observation from the environment to a PolicyStep named tuple containing the following attributes:

- action: The action to be applied to the environment.
- state: The state of the policy (e.g. RNN state) to be fed into the next call to action.
- info: Optional side information such as action log probabilities.

The time_step_spec and action_spec are specifications for the input time step and the output action. Policies also have a reset function which is typically used for resetting the state in stateful policies. The copy function returns a copy of self and the update(new_policy) function updates self towards new_policy.

Now, let us look at a couple of examples of python policies.

In [5]:
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
my_random_py_policy = random_py_policy.RandomPyPolicy(time_step_spec=None,
    action_spec=action_spec)
time_step = None
action_step = my_random_py_policy.action(time_step)
print(action_step)
action_step = my_random_py_policy.action(time_step)
print(action_step)

PolicyStep(action=array([ 10, -10], dtype=int32), state=(), info=())
PolicyStep(action=array([ 6, 10], dtype=int32), state=(), info=())


In [6]:
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
action_script = [(1, np.array([5, 2], dtype=np.int32)), 
                 (0, np.array([0, 0], dtype=np.int32)), # Setting `num_repeates` to 0 will skip this action.
                 (2, np.array([1, 2], dtype=np.int32)), 
                 (1, np.array([3, 4], dtype=np.int32))]

my_scripted_py_policy = scripted_py_policy.ScriptedPyPolicy(time_step_spec=None, action_spec=action_spec, action_script=action_script)

policy_state = my_scripted_py_policy.get_initial_state()
time_step = None
print('Executing scripted policy...')
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step)
action_step= my_scripted_py_policy.action(time_step, action_step.state)
print(action_step)
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step)

print('Resetting my_scripted_py_policy...')
policy_state = my_scripted_py_policy.get_initial_state()
action_step = my_scripted_py_policy

Executing scripted policy...
PolicyStep(action=array([5, 2], dtype=int32), state=[0, 1], info=())
PolicyStep(action=array([1, 2], dtype=int32), state=[2, 1], info=())
PolicyStep(action=array([1, 2], dtype=int32), state=[2, 2], info=())
Resetting my_scripted_py_policy...


### TensorFlow Policies

#### Example 1: Random TF Policy

In [7]:
action_spec = tensor_spec.BoundedTensorSpec(
    (2,), tf.float32, minimum=-1, maximum=3)
input_tensor_spec = tensor_spec.TensorSpec((2,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)

my_random_tf_policy = random_tf_policy.RandomTFPolicy(
    action_spec=action_spec, time_step_spec=time_step_spec)
observation = tf.ones(time_step_spec.observation.shape)
time_step = ts.restart(observation)
action_step = my_random_tf_policy.action(time_step)

print('Action:')
print(action_step.action)

Action:
tf.Tensor([-0.01249933  1.4199572 ], shape=(2,), dtype=float32)


#### Example 2: Actor Policy

An actor policy can be created using either a network that maps time_steps to actions or a network that maps time_steps to distributions over actions.

In [10]:
class ActionNet(network.Network):

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super(ActionNet, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        self._layers = [
            tf.keras.layers.Dense(
                action_spec.shape.num_elements(), activation=tf.nn.tanh),
        ]

    def call(self, observations, step_type, network_state):
        del step_type

        output = tf.cast(observations, dtype=tf.float32)
        for layer in self.layers:
            output = layer(output)
        actions = tf.reshape(output, [-1] + self._output_tensor_spec.shape.as_list())

        # Scale and shift actions to the correct range if necessary.
        return actions, network_state

In [11]:
input_tensor_spec = tensor_spec.TensorSpec((4,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)
action_spec = tensor_spec.BoundedTensorSpec((3,),
                                            tf.float32,
                                            minimum=-1,
                                            maximum=1)

action_net = ActionNet(input_tensor_spec, action_spec)

my_actor_policy = actor_policy.ActorPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    actor_network=action_net)

In [12]:
batch_size = 2
observations = tf.ones([2] + time_step_spec.observation.shape.as_list())

time_step = ts.restart(observations, batch_size)

action_step = my_actor_policy.action(time_step)
print('Action:')
print(action_step.action)

distribution_step = my_actor_policy.distribution(time_step)
print('Action distribution:')
print(distribution_step.action)

Action:
tf.Tensor(
[[-0.45615217  0.7324672  -0.1420335 ]
 [-0.45615217  0.7324672  -0.1420335 ]], shape=(2, 3), dtype=float32)
Action distribution:
tfp.distributions.Deterministic("Deterministic", batch_shape=[2, 3], event_shape=[], dtype=float32)


In [13]:
class ActionDistributionNet(ActionNet):

    def call(self, observations, step_type, network_state):
        action_means, network_state = super(ActionDistributionNet, self).call(
        observations, step_type, network_state)

        action_std = tf.ones_like(action_means)
        return tfp.distributions.Normal(action_means, action_std), network_state


action_distribution_net = ActionDistributionNet(input_tensor_spec, action_spec)

my_actor_policy = actor_policy.ActorPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    actor_network=action_distribution_net)

action_step = my_actor_policy.action(time_step)
print('Action:')
print(action_step.action)
distribution_step = my_actor_policy.distribution(time_step)
print('Action distribution:')
print(distribution_step.action)

Action:
tf.Tensor(
[[-1.          0.47452495 -0.8223118 ]
 [-0.7682057  -0.01630288  0.337681  ]], shape=(2, 3), dtype=float32)
Action distribution:
tfp.distributions.Normal("ActionNet_Normal", batch_shape=[2, 3], event_shape=[], dtype=float32)


#### Q Policy 

In [14]:
input_tensor_spec = tensor_spec.TensorSpec((4,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)
action_spec = tensor_spec.BoundedTensorSpec((1,),
                                            tf.int32,
                                            minimum=-1,
                                            maximum=1)
num_actions = action_spec.maximum - action_spec.minimum + 1


class QNetwork(network.Network):

    def __init__(self, input_tensor_spec, action_spec, num_actions=num_actions, name=None):
        super(QNetwork, self).__init__(
        input_tensor_spec=input_tensor_spec,
        state_spec=(),
        name=name)
        self._layers.append(tf.keras.layers.Dense(num_actions))
    
    def call(self, inputs, step_type=None, network_state=()):
        del step_type
        inputs = tf.cast(inputs, tf.float32)
        for layer in self.layers:
            inputs = layer(inputs)
        return inputs, network_state

batch_size = 2
observation = tf.ones([batch_size] + time_step_spec.observation.shape.as_list())
time_steps = ts.restart(observation, batch_size=batch_size)

my_q_network = QNetwork(
    input_tensor_spec=input_tensor_spec,
    action_spec=action_spec)
my_q_policy = q_policy.QPolicy(
    time_step_spec, action_spec, q_network=my_q_network)
action_step = my_q_policy.action(time_steps)
distribution_step = my_q_policy.distribution(time_steps)

print('Action:')
print(action_step.action)

print('Action distribution:')
print(distribution_step.action)

Action:
tf.Tensor(
[[1]
 [1]], shape=(2, 1), dtype=int32)
Action distribution:
tfp.distributions.ShiftedCategorical("ShiftedCategorical", batch_shape=[2, 1], event_shape=[], dtype=int32)


#### Policy Wrappers

In [15]:
my_greedy_policy = greedy_policy.GreedyPolicy(my_q_policy)

action_step = my_greedy_policy.action(time_steps)
print('Action:')
print(action_step.action)

distribution_step = my_greedy_policy.distribution(time_steps)
print('Action distribution:')
print(distribution_step.action)

Action:
tf.Tensor(
[[1]
 [1]], shape=(2, 1), dtype=int32)
Action distribution:
tfp.distributions.DeterministicWithLogProb("Deterministic", batch_shape=[2, 1], event_shape=[], dtype=int32)
