In [29]:
import gym
import gin
import os
import imageio
import tensorflow as tf
# Switch TF to v2 (eager) behaviour through the compat.v1 alias, matching the
# tf.compat.v1.* calls used elsewhere in this notebook; the bare
# tf.enable_v2_behavior symbol is not exposed in the TF 2.x namespace.
tf.compat.v1.enable_v2_behavior()

from safemrl.envs import minitaur
from safemrl.algos import agents
from tf_agents.environments import tf_py_environment, gym_wrapper
from tf_agents.agents.sac import sac_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.utils import common
# Put gin into its interactive mode before any configs are parsed.
# NOTE(review): presumably this lets cells that define/register configurables
# be re-run in the same kernel without redefinition errors — confirm in gin docs.
gin.enter_interactive_mode()

In [2]:
# Register the directory named by $CONFIG_DIR as a gin search path, but only
# when the variable is actually set: os.environ.get() returns None otherwise,
# and None is not a usable path prefix.
config_dir = os.environ.get('CONFIG_DIR')
if config_dir:
    gin.add_config_file_search_path(config_dir)

# Parse both config files; the empty list means no extra bindings are applied.
gin.parse_config_files_and_bindings(['minitaur_default.gin', 'sac.gin'], [])

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [10]:
# Raw gym environment wrapped in the project's TaskAgnWrapper; `env` is kept
# around because later cells call env.render() on it directly.
env = minitaur.TaskAgnWrapper(
    gym.make('MinitaurGoalVelocityEnv-v0'),
)

# TF-Agents view of the same environment object (gym -> py env -> TF env).
tf_env = tf_py_environment.TFPyEnvironment(
    gym_wrapper.GymWrapper(env),
)

In [12]:
def _make_adam():
    """Return a fresh Adam optimizer with the 3e-4 learning rate shared by all SAC losses."""
    return tf.compat.v1.train.AdamOptimizer(learning_rate=3e-4)


# Step counter handed to the agent as its train_step_counter.
global_step = tf.compat.v1.train.get_or_create_global_step()

# Specs read off the TF environment; they size the networks and the agent.
time_step_spec = tf_env.time_step_spec()
observation_spec = time_step_spec.observation
action_spec = tf_env.action_spec()

# Actor: observation -> action distribution, two hidden layers of 256 units.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=(256, 256),
    continuous_projection_net=agents.normal_projection_net,
)

# Critic: takes the (observation, action) pair, two joint layers of 256 units.
critic_net = agents.CriticNetwork(
    (observation_spec, action_spec),
    joint_fc_layer_params=(256, 256),
)

# Each loss gets its own optimizer instance (actor, critic, alpha).
tf_agent = sac_agent.SacAgent(
    time_step_spec,
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=_make_adam(),
    critic_optimizer=_make_adam(),
    alpha_optimizer=_make_adam(),
    target_update_tau=0.005,
    target_update_period=1,
    td_errors_loss_fn=tf.keras.losses.mse,
    # NOTE(review): gamma=0 is kept exactly as found; confirm it is intended
    # if this agent is ever trained rather than only restored.
    gamma=0,
    reward_scale_factor=1.,
    gradient_clipping=1.,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=global_step,
)

Instructions for updating:
`AffineScalar` bijector is deprecated; please use `tfb.Shift(loc)(tfb.Scale(...))` instead.


In [27]:
# Display the current value of the global step counter.
# NOTE(review): the recorded output (1000000) was produced at execution count
# 27, i.e. after the checkpoint-restore cell (count 26) located further down
# had already run; on a top-to-bottom run this cell executes before the restore.
global_step.numpy()

1000000

In [26]:
# Relative directory containing the training checkpoints to load.
train_dir = 'tfagents/baselines/sac-1e-5-actor/sac-1e-5-actor/train'

# The checkpointer tracks both the agent and the global step counter.
train_checkpointer = common.Checkpointer(
    ckpt_dir=train_dir, agent=tf_agent, global_step=global_step
)

# Kept as the cell's final expression so the returned load status is displayed.
train_checkpointer.initialize_or_restore()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x14f17cbe0>

In [28]:
# Policy object used to choose actions in the rollout cell that follows.
# NOTE(review): this is the agent's `policy` attribute, not `collect_policy` —
# presumably the evaluation policy; confirm against the TF-Agents docs.
policy = tf_agent.policy

In [6]:
# Roll out one episode with `policy`, rendering a frame for the initial state
# and one more after every environment step.
traj_len = 0
time_step = tf_env.reset()
frames = [env.render('rgb_array')]
pol_state = policy.get_initial_state(1)

while True:
    if time_step.is_last():
        break
    action_step = policy.action(time_step, pol_state)
    action = action_step.action
    pol_state = action_step.state
    time_step = tf_env.step(action)
    frames.append(env.render('rgb_array'))

# One frame is appended per step on top of the initial frame, so the number of
# steps taken is the frame count minus one.
traj_len = len(frames) - 1

In [7]:
# Display how many environment steps the rollout above took.
traj_len

363

In [11]:
def _episode_path(episode):
    """Return the .mp4 path for `episode` under the current global step's video directory."""
    return './videos/sac/{}-steps/episode-{}.mp4'.format(global_step.numpy(), episode)


# Find the first episode index whose video file does not exist yet, so earlier
# recordings for the same step count are never overwritten.
# (The original called `osp.exists`, but only `os` is imported in this notebook.)
i = 0
path = _episode_path(i)
while os.path.exists(path):
    i += 1
    path = _episode_path(i)

# The writer does not create missing parent directories, so make sure the
# per-step output directory exists before opening the file.
video_dir = os.path.dirname(path)
if not os.path.isdir(video_dir):
    os.makedirs(video_dir)

# The context manager closes the writer even if appending a frame fails.
with imageio.get_writer(path) as writer:
    for frame in frames:
        writer.append_data(frame)

