In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib
import matplotlib.pyplot as plt

import tensorflow as tf
tf.compat.v1.enable_v2_behavior()

import critic_network, ddpg_agent, actor_network
#from tf_agents.agents.ddpg import critic_network, ddpg_agent, actor_network
from tf_agents.agents.sac import sac_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
import actor_distribution_network
import normal_projection_network
#from tf_agents.networks import actor_distribution_network
#from tf_agents.networks import normal_projection_network
from tf_agents.networks import network
from tf_agents.networks import encoding_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.utils import nest_utils
from tf_agents.networks import utils
from tf_agents.utils import common as common_utils

import tensorflow_probability as tfp

In [2]:
# Hyperparameters for the whole notebook. Every later cell reads these
# notebook-global names, so edit values here rather than inline.

# Gym environment id and number of outer training iterations.
env_name = 'Pendulum-v0'  # @param
num_iterations = 20000  # @param

# Experience collection: steps gathered before training starts, steps the
# collect driver takes per run, and the replay buffer's max_length.
initial_collect_steps = 1000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

# Optimizer learning rates and target-network update settings passed to the
# agents. alpha_learning_rate is only consumed by the SAC agent.
critic_learning_rate = 3e-4  # @param
actor_learning_rate = 3e-4  # @param
alpha_learning_rate = 3e-4 # @param
target_update_tau = 0.005 # @param
target_update_period = 1 #@param

# Number of trajectories sampled from the replay buffer per training step.
batch_size = 64  # @param

# Passed unchanged to both agent constructors.
gamma = 0.99 #@param
reward_scale_factor = 1.0 #@param
gradient_clipping = None #@param

# Hidden-layer widths: actor_fc_layer_params feeds both actor networks,
# critic_joint_fc_layer_params feeds the critic's joint layers.
actor_fc_layer_params = (256, 128)
critic_joint_fc_layer_params = (256, 64)
# Training loss is printed every log_interval train steps.
log_interval = 200  # @param

# Evaluation: episodes averaged per evaluation, and train steps between evaluations.
num_eval_episodes = 10  # @param
eval_interval = 1000  # @param

In [3]:
# NOTE(review): `env` is not referenced by any later cell shown in this
# notebook; the training and evaluation environments below are loaded separately.
env = suite_gym.load(env_name)

# Separate Python environments for collecting training data and for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# Wrap the Python environments so they can be used with the TF agents,
# drivers and policies created below.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Specs used to construct every network and agent in this notebook.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

In [4]:
# Reset the evaluation environment to obtain a first TimeStep; its observation
# is fed to the actor network in the cells below.
time_step = eval_env.reset()

# Actor Network

In [5]:
# Actor network built from the locally imported `actor_network` module; it is
# later handed to the DDPG agent as its `actor_network`.
actor_net = actor_network.ActorNetwork(observation_spec,
                         action_spec,
                         fc_layer_params=actor_fc_layer_params)

In [6]:
# Smoke-test the actor on the current observation.
# As the printed result below shows, the call returns a 2-tuple whose first
# element is the action tensor, so `action` holds the whole tuple here.
action = actor_net(time_step.observation)
print(action)
# Layer-by-layer parameter summary of the actor.
actor_net.summary()

(<tf.Tensor: id=93, shape=(1, 1), dtype=float32, numpy=array([[-0.00135688]], dtype=float32)>, ())
Model: "ActorNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            multiple                  0         
_________________________________________________________________
input_mlp/dense (Dense)      multiple                  1024      
_________________________________________________________________
input_mlp/dense (Dense)      multiple                  32896     
_________________________________________________________________
action (Dense)               multiple                  129       
Total params: 34,049
Trainable params: 34,049
Non-trainable params: 0
_________________________________________________________________


# Critic Network

In [7]:
# Critic network taking an (observation, action) pair. No separate
# observation-only or action-only layers are configured; only the joint layers.
# The same `critic_net` instance is later passed to both the DDPG and SAC agents.
critic_net = critic_network.CriticNetwork(
    (observation_spec, action_spec),
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params)

# Disabled probe of the critic.
# NOTE(review): `action` from the earlier cell is the whole tuple returned by
# the actor, so this call would presumably need its first element - confirm
# before re-enabling.
#q = critic_net((time_step.observation, action))
#print(q)
#critic_net.summary()

# Actor Projection Network

In [8]:
def normal_projection_net(action_spec, init_means_output_factor=0.1):
    """Build the Normal projection network used as the distribution actor's head.

    Passed as `continuous_projection_net` to `ActorDistributionNetwork`, which
    calls it to create the projection layer for the continuous action spec.

    Args:
        action_spec: Action spec the projection should produce a distribution for.
        init_means_output_factor: Forwarded unchanged to `NormalProjectionNetwork`.

    Returns:
        A `normal_projection_network.NormalProjectionNetwork` with a
        state-dependent standard deviation, `std_clip_transform` applied to the
        std, and `scale_distribution=True`.
    """
    # Resolve the SAC module through an import instead of the notebook-global
    # name `sac_agent`: a later cell rebinds that name to a SacAgent instance,
    # after which `sac_agent.std_clip_transform` no longer exists and this
    # function would fail when called or re-run.
    from tf_agents.agents.sac import sac_agent as sac_agent_module

    return normal_projection_network.NormalProjectionNetwork(
        action_spec,
        mean_transform=None,
        state_dependent_std=True,
        init_means_output_factor=init_means_output_factor,
        std_transform=sac_agent_module.std_clip_transform,
        scale_distribution=True)


# Distribution-producing actor: same hidden-layer sizes as `actor_net`, but its
# head is created by `normal_projection_net`, so it yields an action
# distribution rather than a single action tensor.
actor_proj_net = actor_distribution_network.ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params,
    continuous_projection_net=normal_projection_net)

In [9]:
# Smoke-test the distribution actor on a fresh initial time step.
time_step = eval_env.reset()
# The network is called with (observation, step_type, network_state=None);
# the second return value is discarded.
sample_output,_ = actor_proj_net(time_step.observation, time_step.step_type, None)
# `sample_output` exposes `.sample()`; draw one action from it.
print(sample_output.sample())
actor_proj_net.summary()

states=(1, 3)
batch_squash states=(1, 3)
_mlp_layers states=(1, 3)
_mlp_layers states=(1, 256)
_mlp_layers states=(1, 128)
unflatten states=(1, 128)
inputs=(1, 128)
means=(1, 1)
means=(1, 1)
stds=(1, 1)
std transform
stds=(1, 1)
means=[[0.02837696]], stds=[[1.017018]]
distribution=tfp.distributions.Normal("ActorDistributionNetwork_NormalProjectionNetwork_Normal", batch_shape=[1, 1], event_shape=[], dtype=float32)
scale distribution
tf.Tensor([[-1.8646111]], shape=(1, 1), dtype=float32)
Model: "ActorDistributionNetwork"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_4 (Flatten)          multiple                  0         
_________________________________________________________________
input_mlp/dense (Dense)      multiple                  1024      
_________________________________________________________________
input_mlp/dense (Dense)      multiple                  32896     
_____________

# DDPG Agent

In [10]:
# The assignment below deliberately keeps the variable name `ddpg_agent`
# (later cells select it via `agent = ddpg_agent`), which hides the imported
# module of the same name. Importing the module under an alias here keeps this
# cell re-runnable: without it, a second execution would look up `DdpgAgent`
# on the agent instance instead of on the module.
import ddpg_agent as ddpg_agent_module

# Shared train-step counter (created on first use).
global_step = tf.compat.v1.train.get_or_create_global_step()

# DDPG agent: deterministic actor (`actor_net`) plus Q-critic (`critic_net`),
# each with its own Adam optimizer; target-network and loss settings come from
# the hyperparameter cell.
ddpg_agent = ddpg_agent_module.DdpgAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
ddpg_agent.initialize()

# SAC Agent

In [11]:
# The assignment below keeps the variable name `sac_agent`, which hides the
# imported module of the same name. Importing the module under an alias keeps
# this cell re-runnable (a second execution would otherwise look up `SacAgent`
# on the agent instance).
from tf_agents.agents.sac import sac_agent as sac_agent_module

# Shared train-step counter (created on first use).
global_step = tf.compat.v1.train.get_or_create_global_step()

# SAC agent. SAC samples actions and evaluates their log-probabilities, so its
# actor must be the distribution-producing `actor_proj_net`, not the
# deterministic `actor_net` used by DDPG.
# NOTE(review): `critic_net` is the same instance given to the DDPG agent;
# build a separate critic if both agents are ever trained in one session.
sac_agent = sac_agent_module.SacAgent(
    train_env.time_step_spec(),
    action_spec,
    actor_network=actor_proj_net,
    critic_network=critic_net,
    actor_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=actor_learning_rate),
    critic_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=critic_learning_rate),
    alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
        learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    train_step_counter=global_step)
sac_agent.initialize()

In [12]:
# Choose which of the agents constructed above drives the rest of the notebook.
# At this point `ddpg_agent` refers to the agent instance, not the module.
agent = ddpg_agent

# Evaluation uses a greedy wrapper around the agent's policy; data collection
# uses the agent's own collect policy.
eval_policy = greedy_policy.GreedyPolicy(agent.policy)
collect_policy = agent.collect_policy

In [13]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Average the undiscounted episode return of `policy` over several episodes.

    Args:
        environment: Object providing `reset()` and `step(action)`, each
            returning a time step with `is_last()` and `reward`.
        policy: Object whose `action(time_step)` result has an `.action` field.
        num_episodes: Number of episodes to roll out and average over.

    Returns:
        Element 0 of `.numpy()` of the averaged return, i.e. the scalar for the
        first (only) batch entry.
    """

    def _rollout_return():
        # Sum of rewards from reset until the environment signals the last step.
        current = environment.reset()
        reward_sum = 0.0
        while True:
            if current.is_last():
                return reward_sum
            chosen = policy.action(current)
            current = environment.step(chosen.action)
            reward_sum = reward_sum + current.reward

    return_sum = 0.0
    for _ in range(num_episodes):
        return_sum = return_sum + _rollout_return()

    mean_return = return_sum / num_episodes
    return mean_return.numpy()[0]

In [14]:
# Replay buffer storing the agent's collect data spec.
# `batch_size` here is the environment's batch size (how many transitions are
# added per `add_batch` call), not the training `batch_size` hyperparameter.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

In [15]:
# Pre-fill the replay buffer before training: run the agent's collect policy
# in the training environment for `initial_collect_steps` steps, writing every
# step to the buffer through the `add_batch` observer.
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=initial_collect_steps)
# As the last expression in the cell, the driver's return value is displayed.
initial_collect_driver.run()

(TimeStep(step_type=<tf.Tensor: id=84546, shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>, reward=<tf.Tensor: id=84547, shape=(1,), dtype=float32, numpy=array([-10.723537], dtype=float32)>, discount=<tf.Tensor: id=84548, shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=<tf.Tensor: id=84549, shape=(1, 3), dtype=float32, numpy=array([[-0.9967617 ,  0.08041225, -4.0316567 ]], dtype=float32)>),
 ())

In [16]:
# Dataset generates trajectories with shape [Bx2x...]
# B is `batch_size`; the 2 comes from `num_steps=2`, presumably so each sample
# carries a step together with its successor - confirm against the agent's
# training requirements.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)

# The training loop pulls batches from this iterator with `next(iterator)`.
iterator = iter(dataset)

In [17]:
# Driver used inside the training loop: each `run()` is configured for
# `collect_steps_per_iteration` environment steps with the collect policy,
# appended to the replay buffer through the `add_batch` observer.
collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)

In [18]:
# Wrap the hot-path callables with TF-Agents' `common.function` before the loop.
agent.train = common.function(agent.train)
collect_driver.run = common.function(collect_driver.run)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, eval_policy, num_eval_episodes)
# `returns` holds one entry per evaluation, starting with this step-0 baseline.
returns = [avg_return]

for _ in range(num_iterations):

    # Collect experience with collect_policy and save it to the replay buffer.
    # The driver was built with num_steps=collect_steps_per_iteration, so one
    # run() already gathers the configured number of steps; calling it inside
    # an extra range(collect_steps_per_iteration) loop would collect the
    # square of that number.
    collect_driver.run()

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience)

    # Current value of the agent's train-step counter, used to schedule
    # logging and evaluation.
    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, eval_policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

W0727 19:27:41.812472 4598883776 deprecation.py:323] From /Users/jwatts/TensorflowProjects/lib/python3.7/site-packages/tensorflow/python/ops/losses/losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


actions=(64, 1)
q_values=(64,)
actions=[<tf.Tensor 'actor_loss/ActorNetwork/add:0' shape=(64, 1) dtype=float32>]
dqdas=[<tf.Tensor 'actor_loss/Reshape_2:0' shape=(64, 1) dtype=float32>]
actions=(64, 1)
q_values=(64,)
actions=[<tf.Tensor 'actor_loss/ActorNetwork/add:0' shape=(64, 1) dtype=float32>]
dqdas=[<tf.Tensor 'actor_loss/Reshape_2:0' shape=(64, 1) dtype=float32>]
step = 200: loss = 5.003885746002197
step = 400: loss = 0.7563710808753967
step = 600: loss = 0.7254889011383057
step = 800: loss = 0.8168790340423584
step = 1000: loss = 1.0504266023635864
step = 1000: Average Return = -1563.223876953125
step = 1200: loss = 9.42701530456543
step = 1400: loss = 2.1316187381744385
step = 1600: loss = 68.51822662353516
step = 1800: loss = 2.499830484390259
step = 2000: loss = 10.711779594421387


KeyboardInterrupt: 

In [None]:
# Plot average return against training step.
# The x-axis is derived from how many evaluations were actually recorded
# (one at step 0, then one every `eval_interval` steps) rather than from
# `num_iterations`, so the plot still works when training was interrupted
# early and `returns` is shorter than a full run would produce.
steps = range(0, len(returns) * eval_interval, eval_interval)
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel('Step')
# Cap the y-axis at 0.
plt.ylim(top=0)

In [None]:
# Roll out one episode with the agent's policy in the evaluation environment,
# rendering each step and accumulating the episode's total reward.
episode_return = 0.0
time_step = eval_env.reset()

while not time_step.is_last():
    action_step = agent.policy.action(time_step)
    time_step = eval_env.step(action_step.action)
    episode_return += time_step.reward
    # Render through the underlying Python environment that `eval_env` wraps.
    eval_py_env.render()

# `time_step` is left holding the episode's final step for later cells.
print(episode_return)

In [None]:
def test_actor_step(time_steps):
    """Print how nudging the actor's action along dQ/da changes the critic's Q-value.

    Uses the notebook-global `agent`: its actor predicts actions for
    `time_steps`, the critic scores them, the gradient of Q with respect to
    the actions is taken, and the critic is evaluated again on
    `actions + dQ/da`. All intermediate values are printed.

    Args:
        time_steps: Time step(s) providing `observation` and `step_type`.

    Returns:
        None.
    """
    actions, _ = agent._actor_network(time_steps.observation, time_steps.step_type)
    print("predicted actions={}".format(actions.numpy()))

    # Only the action tensor is watched, so the gradient below is taken with
    # respect to the actions and not the network variables.
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(actions)
        q_values, _ = agent._critic_network((time_steps.observation, actions), time_steps.step_type)
    print("q with predicted actions={}".format(q_values.numpy()))

    # Keep `actions` as a tensor; only a flattened copy is handed to the tape.
    # Rebinding `actions` to the flattened list would make the addition below
    # operate on a Python list and feed the critic a wrongly shaped input.
    flat_actions = tf.nest.flatten(actions)
    dqdas = tape.gradient([q_values], flat_actions)
    print("dqdas={}".format(dqdas[0].numpy()))

    adjusted_actions = actions + dqdas[0]
    print("adjusted action={}".format(adjusted_actions))

    q_values, _ = agent._critic_network((time_steps.observation, adjusted_actions), time_steps.step_type)
    print("q with adjusted actions={}".format(q_values))

In [None]:
# Run the probe on the notebook-global `time_step` (whatever the most recently
# executed cell left in it).
test_actor_step(time_step)