In [1]:
# Note: If you haven't installed tf-agents or gym yet, run:
try:
    %tensorflow_version 2.x
except:
    pass
!pip install --upgrade tensorflow-probability
!pip install tf-agents
!pip install gym

Looking in indexes: https://pypi.org/simple, https://artifactory.spotify.net/artifactory/api/pypi/pypi/simple/
Requirement already up-to-date: tensorflow-probability in /Users/lingh/.pyenv/versions/3.7.0/envs/my-virtual-env-3.7.0/lib/python3.7/site-packages (0.9.0)
Looking in indexes: https://pypi.org/simple, https://artifactory.spotify.net/artifactory/api/pypi/pypi/simple/
Looking in indexes: https://pypi.org/simple, https://artifactory.spotify.net/artifactory/api/pypi/pypi/simple/


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from tf_agents.drivers import dynamic_episode_driver
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import py_metrics
from tf_agents.metrics import tf_metrics
from tf_agents.policies import random_py_policy
from tf_agents.policies import random_tf_policy
from tf_agents.trajectory import trajectory

# Turn on TensorFlow 2.x behavior once, right after the imports and before any
# environment, policy, metric, or driver below is constructed.
tf.compat.v1.enable_v2_behavior()

#### Python Drivers

In [6]:
class PyDriver(object):
    """Minimal driver that steps an environment with a policy in a Python loop.

    On every iteration the driver asks the policy for an action, applies it to
    the environment, packages the transition into a `trajectory.Trajectory`,
    and hands that trajectory to every observer. The loop stops as soon as
    either the step limit or the episode limit is reached.

    Requires `numpy` (as `np`) and `tf_agents.trajectory.trajectory` (as
    `trajectory`) to be in scope.
    """

    def __init__(self, env, policy, observers, max_steps=1, max_episodes=1):
        """Stores the collaborators and the termination limits.

        Args:
            env: Object exposing `step(action)`, which returns the next time step.
            policy: Object exposing `action(time_step, policy_state)`, whose
                result has `action`, `info` and `state` attributes.
            observers: Iterable of callables, each called with every
                trajectory. `None` (or any falsy value) means no observers.
            max_steps: Stop once this many non-boundary steps were taken.
                `None` or `0` disables the step limit.
            max_episodes: Stop once this many episodes finished. `None` or `0`
                disables the episode limit.

        Raises:
            ValueError: If neither `max_steps` nor `max_episodes` is at least
                1. Without any limit `run()` would loop forever.
        """
        max_steps = max_steps or 0
        max_episodes = max_episodes or 0
        if max_steps < 1 and max_episodes < 1:
            raise ValueError(
                'Either `max_steps` or `max_episodes` should be greater than 0.')

        self._env = env
        self._policy = policy
        self._observers = observers or []
        # A disabled limit becomes +inf so the `<` checks in run() always pass.
        self._max_steps = max_steps or np.inf
        self._max_episodes = max_episodes or np.inf

    def run(self, time_step, policy_state=()):
        """Steps the environment until one of the configured limits is hit.

        Args:
            time_step: Time step to start from (for example the result of
                resetting the environment).
            policy_state: Policy state to start from; defaults to `()`.

        Returns:
            A `(time_step, policy_state)` tuple with the last time step
            produced by the environment and the last policy state, so that a
            later call can resume where this one stopped.
        """
        num_steps = 0
        num_episodes = 0
        while num_steps < self._max_steps and num_episodes < self._max_episodes:

            # Compute an action using the policy for the given time_step
            action_step = self._policy.action(time_step, policy_state)

            # Apply the action to the environment and get the next step
            next_time_step = self._env.step(action_step.action)

            # Package information into a trajectory
            traj = trajectory.Trajectory(
                step_type=time_step.step_type,
                observation=time_step.observation,
                action=action_step.action,
                policy_info=action_step.info,
                next_step_type=next_time_step.step_type,
                reward=next_time_step.reward,
                discount=next_time_step.discount)

            for observer in self._observers:
                observer(traj)

            # Update statistics to check termination. np.sum reduces the
            # flag(s) to a count; boundary transitions are not counted as steps.
            num_episodes += np.sum(traj.is_last())
            num_steps += np.sum(~traj.is_boundary())

            time_step = next_time_step
            policy_state = action_step.state

        return time_step, policy_state

In [7]:
# Build the CartPole environment and a policy that picks random actions
# matching the environment's specs.
env = suite_gym.load('CartPole-v0')
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=env.time_step_spec(),
    action_spec=env.action_spec(),
)

# Two observers: a plain list collecting every trajectory, and a metric.
replay_buffer = []
metric = py_metrics.AverageReturnMetric()
observers = [replay_buffer.append, metric]

# The run is bounded by max_steps=20 and max_episodes=1.
driver = py_driver.PyDriver(
    env,
    policy,
    observers,
    max_steps=20,
    max_episodes=1,
)

# Start from a freshly reset environment.
initial_time_step = env.reset()
final_time_step, _ = driver.run(initial_time_step)

print('Replay Buffer:')
for traj in replay_buffer:
    print(traj)

print('Average Return: ', metric.result())

Replay Buffer:
Trajectory(step_type=array(0, dtype=int32), observation=array([ 0.03968893,  0.02144465, -0.0392565 , -0.0027633 ], dtype=float32), action=array(0), policy_info=(), next_step_type=array(1, dtype=int32), reward=array(1., dtype=float32), discount=array(1., dtype=float32))
Trajectory(step_type=array(1, dtype=int32), observation=array([ 0.04011782, -0.17309296, -0.03931177,  0.27727985], dtype=float32), action=array(1), policy_info=(), next_step_type=array(1, dtype=int32), reward=array(1., dtype=float32), discount=array(1., dtype=float32))
Trajectory(step_type=array(1, dtype=int32), observation=array([ 0.03665596,  0.02256713, -0.03376617, -0.02753823], dtype=float32), action=array(0), policy_info=(), next_step_type=array(1, dtype=int32), reward=array(1., dtype=float32), discount=array(1., dtype=float32))
Trajectory(step_type=array(1, dtype=int32), observation=array([ 0.0371073 , -0.17205472, -0.03431693,  0.25430277], dtype=float32), action=array(0), policy_info=(), next_st

#### TensorFlow Drivers

In [8]:
# Load the Python CartPole environment and wrap it as a TF environment.
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

# Random policy built from the wrapped environment's specs.
tf_policy = random_tf_policy.RandomTFPolicy(
    action_spec=tf_env.action_spec(),
    time_step_spec=tf_env.time_step_spec(),
)

# Metrics double as observers; they are read back after the run.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps]

driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    tf_policy,
    observers,
    num_episodes=2,
)

# Initial driver.run will reset the environment and initialize the policy.
# Both return values are kept so a later cell can resume from them.
final_time_step, policy_state = driver.run()

print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
final_time_step TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.01141971,  0.03352176,  0.027735  , -0.02424971]],
      dtype=float32)>)
Number of Steps:  68
Number of Episodes:  2


In [9]:
# Continue running from previous state.
# Keep the returned policy state instead of discarding it: re-running this
# cell then resumes from the latest (time_step, policy_state) pair rather than
# a stale policy state. It also avoids assigning to `_`, which IPython uses
# for the last cell output.
final_time_step, policy_state = driver.run(final_time_step, policy_state)

print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())

final_time_step TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.02745439, -0.02013117,  0.0220114 , -0.00767116]],
      dtype=float32)>)
Number of Steps:  107
Number of Episodes:  4
