In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import gym
import six
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from collections import deque

import tensorflow as tf

# Environment

In [2]:
env_name = 'Pendulum-v0'

eval_env = gym.make(env_name)
train_env = gym.make(env_name)

# Observation Spec
print("Observation Shape={}".format(train_env.observation_space.shape))
print("Observation Range (Low)={}".format(train_env.observation_space.low))
print("Observation Range (High)={}".format(train_env.observation_space.high))
print("\r")

# Action Spec
print("Action Shape={}".format(train_env.action_space.shape))
print("Action Range (Low)={}".format(train_env.action_space.low))
print("Action Range (High)={}".format(train_env.action_space.high))

Observation Shape=(3,)
Observation Range (Low)=[-1. -1. -8.]
Observation Range (High)=[1. 1. 8.]

Action Shape=(1,)
Action Range (Low)=[-2.]
Action Range (High)=[2.]


# Memory

In [3]:
class Memory:   # stored as ( s, a, r, s_, d )
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, sample):
        self.memory.append(sample)        

    def sample(self, n):
        n = min(n, len(self.memory))
        return random.sample(self.memory, n)

    def isFull(self):
        return len(self.memory) >= self.memory.maxlen
    
    def update(self, idx, p):
        pass

In [4]:
memory_capacity = 50000
memory = Memory(memory_capacity)

# Actor Network

In [9]:
class ActorNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Actor"):
        super(ActorNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        self._action_mean = tf.constant((action_spec.high + action_spec.low) / 2)
        self._action_scale = tf.constant((action_spec.high - action_spec.low) / 2)
        
        hidden_units = 128
        
        # Layers
        self._layer1 = tf.keras.layers.Dense(hidden_units, 
                                             activation='relu', 
                                             kernel_initializer='he_uniform')
        self._layer2 = tf.keras.layers.Dense(hidden_units, 
                                             activation='relu', 
                                             kernel_initializer='he_uniform')
        self._output = tf.keras.layers.Dense(action_spec.shape[0], 
                                             activation='tanh', 
                                             kernel_initializer='he_uniform')

    def call(self, observation):
        output = self._layer1(observation)
        output = self._layer2(output)
        output = self._output(output)
        return output * self._action_scale + self._action_mean
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1,self._observation_spec.shape[0]), 
                                             self._observation_spec.low, 
                                             self._observation_spec.high)
            action = self.__call__(random_observation)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [10]:
actor_network = ActorNetwork(train_env.observation_space, train_env.action_space)
actor_variables = actor_network.variables
actor_network.summary()

Model: "actor_network_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  512       
_________________________________________________________________
dense_4 (Dense)              multiple                  16512     
_________________________________________________________________
dense_5 (Dense)              multiple                  129       
Total params: 17,153
Trainable params: 17,153
Non-trainable params: 0
_________________________________________________________________


# Critic Network

In [16]:
class CriticNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Critic"):
        super(CriticNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        hidden_units = 128
        
        # Observation Layers
        self._observation_layer1 = tf.keras.layers.Dense(hidden_units, 
                                                         activation='relu',
                                                         kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1. / 3., mode='fan_in', distribution='uniform'))
        #self._observation_layer2 = tf.keras.layers.Dense(hidden_units, 
        #                                                 activation='relu',
        #                                                 kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
        #    scale=1. / 3., mode='fan_in', distribution='uniform'))
        
        # Action Layers
        self._action_layer1 = tf.keras.layers.Dense(hidden_units, 
                                                    activation='relu', 
                                                    kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1. / 3., mode='fan_in', distribution='uniform'))
        
        # Merged Layers
        self._merge_layer1 = tf.keras.layers.Add()
        self._merge_layer2 = tf.keras.layers.Dense(hidden_units, 
                                                   activation='relu', 
                                                   kernel_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1. / 3., mode='fan_in', distribution='uniform'))
        
        # Output Layer: Q-value for the action taken based on this observation.
        self._output_layer = tf.keras.layers.Dense(1, 
                                                   activation='relu', 
                                                   kernel_initializer=tf.keras.initializers.RandomUniform(
                minval=-0.003, maxval=0.003))

    def call(self, observation, action):
        print(observation)
        print(action)
        obs_output = self._observation_layer1(observation)
        print("obs_output={}".format(obs_output))
        #obs_output = self._observation_layer2(obs_output)
        #print("obs_output={}".format(obs_output))
        
        action_output = self._action_layer1(action)
        print("action_output={}".format(action_output))
        
        output = self._merge_layer1([obs_output, action_output])
        print("output={}".format(output))

        output = self._merge_layer2(output)
        print("output={}".format(output))

        output = self._output_layer(output)
        print("output={}".format(output))
        return output
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation and action to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1, self._observation_spec.shape[0]), 
                                             [-0.1, -0.1, -0.1], 
                                             [0.1, 0.1, 0.1])
            random_action = tf.random.uniform((1, self._action_spec.shape[0]), 
                                              [-0.1], 
                                              [0.1])

            q = self.__call__(random_observation, random_action)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [17]:
critic_network = CriticNetwork(train_env.observation_space, train_env.action_space)
critic_variables = critic_network.variables
critic_network.summary()

tf.Tensor([[-0.05010579  0.05722728 -0.07889152]], shape=(1, 3), dtype=float32)
tf.Tensor([[0.0462544]], shape=(1, 1), dtype=float32)
obs_output=[[0.         0.         0.         0.06373786 0.         0.06809693
  0.01212819 0.00752896 0.         0.         0.03520053 0.07635716
  0.         0.         0.         0.03842733 0.         0.05193348
  0.02520209 0.         0.04383183 0.05485588 0.         0.
  0.0192854  0.         0.0137463  0.         0.01956771 0.04075774
  0.         0.         0.         0.05153321 0.01403232 0.
  0.         0.         0.         0.00986948 0.         0.
  0.         0.         0.         0.         0.00081735 0.
  0.         0.07648408 0.02781786 0.         0.02492818 0.
  0.         0.         0.         0.03180475 0.00222764 0.0276683
  0.         0.03869043 0.00172387 0.         0.         0.
  0.06178748 0.02292705 0.00437802 0.         0.         0.03413288
  0.03820348 0.         0.00146915 0.         0.         0.
  0.0410388  0.00354084 0.  

# Agent

In [None]:
class ActorCriticAgent:
    def __init__(self, 
                 observation_spec, 
                 action_spec, 
                 actor_network,
                 critic_network, 
                 actor_learning_rate = 3e-4, 
                 critic_learning_rate = 3e-4, 
                 gamma = 0.99): 
        
        self._train_step_counter = tf.compat.v2.Variable(0)
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate)
        
        self._actor_network = actor_network
        self._target_actor_network = actor_network.copy("actor-target")
        
        self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate)
        self._critic_network = critic_network
        self._target_critic_network = critic_network.copy("critic-target")
        
        self._gamma = gamma
        
        self._initial_value = 0
        self._damping = 0.15
        self._stddev = 0.2
        self._x = tf.compat.v2.Variable(initial_value=self._initial_value, 
                                        dtype=tf.float32,
                                        trainable=False)
        
        self._update_target()
        
    def collect_policy(self, observation):
        observation = tf.expand_dims(tf.constant(observation), axis=0)
        action = self._actor_network(observation)
        
        noise = tf.random.normal(shape=self._x.shape,
                                 stddev=self._stddev,
                                 dtype=self._x.dtype)
        
        self._x.assign((1. - self._damping) * self._x + noise)
        
        return self._x + action

    def eval_policy(self, observation):
        observation = tf.expand_dims(tf.constant(observation), axis=0)
        action = self._actor_network(observation)
        return action
    
    def _update_network(self, source_variables, target_variables):
        updates = []
        for (v_s, v_t) in zip(source_variables, target_variables):
            update = v_t.assign(v_s)
            updates.append(update)
        return tf.group(*updates)

    def _update_target(self):
        self._update_network(self._actor_network.variables, self._target_actor_network.variables)
        self._update_network(self._critic_network.variables, self._target_critic_network.variables)

    def train(self, batch): 
        no_state = np.zeros(self._observation_spec.shape[0])
        observations = tf.constant([ x[0] for x in batch ], dtype=tf.float32)
        actions = tf.constant([ x[1] for x in batch ], dtype=tf.float32)
        rewards = tf.expand_dims(tf.constant([ x[2] for x in batch ], dtype=tf.float32),axis=-1)
        next_observations = tf.constant([ (no_state if x[4] is True else x[3]) for x in batch ], dtype=tf.float32)
        
        # Compute Critic Loss and apply gradients.
        critic_variables = self._critic_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(critic_variables)
            critic_loss = self.critic_loss(observations, actions, rewards, next_observations)
        
        critic_grads = tape.gradient(critic_loss, critic_variables)
        critic_grads_and_vars = tuple(zip(critic_grads, critic_variables))
        self._critic_optimizer.apply_gradients(critic_grads_and_vars)
        
        # Compute Actor Loss and apply gradients
        actor_variables = self._actor_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(actor_variables)
            actor_loss = self.actor_loss(observations)
            
        actor_grads = tape.gradient(actor_loss, actor_variables)
        actor_grads_and_vars = tuple(zip(actor_grads, actor_variables))
        self._actor_optimizer.apply_gradients(actor_grads_and_vars)

        # Increment counter and conditionally target networks.
        self._train_step_counter.assign_add(1)
        self._update_target()
        
        total_loss = actor_loss + critic_loss
        return total_loss
        

    def critic_loss(self, observations, actions, rewards, next_observations):        
        with tf.name_scope('critic_loss'):
            target_actions = self._target_actor_network(observations)

            target_q_values = self._target_critic_network(next_observations, target_actions)

            td_targets = tf.stop_gradient(rewards + self._gamma * target_q_values)

            q_values = self._critic_network(observations, actions)

            critic_loss = tf.compat.v1.losses.mean_squared_error(td_targets, q_values)

            critic_loss = tf.reduce_mean(critic_loss)

            return critic_loss
                
    def actor_loss(self, observations):
        with tf.name_scope('actor_loss'):
            actions = self._actor_network(observations)
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(actions)
                q_values = self._critic_network(observations, actions)
                actions = tf.nest.flatten(actions)

            dqdas = tape.gradient([q_values], actions)

            actor_losses = []
            
            for dqda, action in zip(dqdas, actions):
                loss = tf.compat.v1.losses.mean_squared_error(tf.stop_gradient(dqda + action), 
                                                              action,
                                                             reduction=tf.compat.v1.losses.Reduction.NONE)
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)
            
            return actor_loss

In [None]:
agent = ActorCriticAgent(train_env.observation_space, 
                         train_env.action_space, 
                         actor_network, 
                         critic_network)

In [None]:
def compute_avg_return(env, agent, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        done = False
        observation = env.reset()
        episode_return = 0.0
        
        while not done:
            action = agent.eval_policy(observation)[0].numpy()
            next_observation, reward, done, _ = env.step(action)
            memory.add((observation, action, reward, next_observation, done))
            observation = next_observation
            episode_return += reward

        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return

In [None]:
initial_collect_steps = 1000
done = True
for _ in range(initial_collect_steps):
    if (done):
        observation = train_env.reset()
        done = False
    else:
        observation = next_observation
        
    action = tf.random.uniform(train_env.action_space.shape, 
                               minval=train_env.action_space.low,
                               maxval=train_env.action_space.high,
                               dtype=tf.dtypes.float32).numpy()
    
    next_observation, reward, done, _ = train_env.step(action)
    
    memory.add((observation, action, reward, next_observation, done))

In [None]:
num_iterations = 10000
log_interval = 100
eval_interval = 1000
num_eval_episodes = 5

# Reset the train step
agent._train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent, num_eval_episodes)
returns = [avg_return]

done = True

for _ in range(num_iterations):

    if (done):
        observation = train_env.reset()
        done = False
    else:
        observation = next_observation
        
    action = agent.collect_policy(observation)[0].numpy()
    next_observation, reward, done, _ = train_env.step(action)
    memory.add((observation, action, reward, next_observation, done))

    # Sample a batch of data from the buffer and update the agent's network.
    batch = memory.sample(32)
    train_loss = agent.train(batch)

    step = agent._train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

In [None]:
done = False
observation = eval_env.reset()
episode_return = 0.0

while not done:
    action = agent.eval_policy(observation)[0].numpy()
    next_observation, reward, done, _ = eval_env.step(action)
    observation = next_observation
    episode_return += reward
    eval_env.render()

print(episode_return)

In [None]:
batch = memory.sample(32)
observations = tf.constant([ x[0] for x in batch ], dtype=tf.float32)

actions = agent._actor_network(observations)
print(actions)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(actions)
    q_values = agent._critic_network(observations, actions)
    print(q_values)
    actions = tf.nest.flatten(actions)

dqdas = tape.gradient([q_values], actions)
print(dqdas)
