In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import gym
import six
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from collections import deque

import tensorflow as tf

# Environment

In [3]:
env_name = 'Pendulum-v0'

eval_env = gym.make(env_name)
train_env = gym.make(env_name)

# Observation Spec
print("Observation Shape={}".format(train_env.observation_space.shape))
print("Observation Range (Low)={}".format(train_env.observation_space.low))
print("Observation Range (High)={}".format(train_env.observation_space.high))
print("\r")

# Action Spec
print("Action Shape={}".format(train_env.action_space.shape))
print("Action Range (Low)={}".format(train_env.action_space.low))
print("Action Range (High)={}".format(train_env.action_space.high))

Observation Shape=(3,)
Observation Range (Low)=[-1. -1. -8.]
Observation Range (High)=[1. 1. 8.]

Action Shape=(1,)
Action Range (Low)=[-2.]
Action Range (High)=[2.]


# Memory

In [4]:
class Memory:   # stored as ( s, a, r, s_, d )
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, sample):
        self.memory.append(sample)        

    def sample(self, n):
        n = min(n, len(self.memory))
        return random.sample(self.memory, n)

    def isFull(self):
        return len(self.memory) >= self.memory.maxlen
    
    def update(self, idx, p):
        pass

In [5]:
memory_capacity = 50000
memory = Memory(memory_capacity)

# Actor Network

In [6]:
class ActorNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Actor"):
        super(ActorNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        hidden_units = 64
        
        # Layers
        self._layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._output = tf.keras.layers.Dense(action_spec.shape[0])

    def call(self, observation):
        output = self._layer1(observation)
        output = self._layer2(output)
        output = self._output(output)
        return output
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1,self._observation_spec.shape[0]), 
                                             self._observation_spec.low, 
                                             self._observation_spec.high)
            action = self.__call__(random_observation)
            print(action)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [7]:
actor_network = ActorNetwork(train_env.observation_space, train_env.action_space)
actor_variables = actor_network.variables
actor_network.summary()

tf.Tensor([[0.01904323]], shape=(1, 1), dtype=float32)
Model: "actor_network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  256       
_________________________________________________________________
dense_1 (Dense)              multiple                  4160      
_________________________________________________________________
dense_2 (Dense)              multiple                  65        
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
_________________________________________________________________


# Critic Network

In [8]:
class CriticNetwork(tf.keras.Model):
    def __init__(self, observation_spec, action_spec, name="Critic"):
        super(CriticNetwork, self).__init__()
        
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        hidden_units = 64
        
        # Observation Layers
        self._observation_layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self._observation_layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Action Layers
        self._action_layer1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Merged Layers
        self._merge_layer1 = tf.keras.layers.Add()
        self._merge_layer2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        
        # Output Layer: Q-value for the action taken based on this observation.
        self._output = tf.keras.layers.Dense(1, activation='relu')

    def call(self, observation, action):
        obs_output = self._observation_layer1(observation)
        obs_output = self._observation_layer2(obs_output)
        
        action_output = self._action_layer1(action)
        
        output = self._merge_layer1([obs_output, action_output])
        output = self._merge_layer2(output)
        q = self._output(output)
        return q
    
    def copy(self, name):
        # Create a new copy of this network with the initialization params that were
        # passed to this one.
        return type(self)(self._observation_spec, self._action_spec, name)
    
    def _build(self):
        if not self.built and self._observation_spec is not None:
            # Generate a single random observation and action to build the network if it hasn't
            # been built already.
            random_observation = tf.random.uniform((1, self._observation_spec.shape[0]), 
                                             self._observation_spec.low, 
                                             self._observation_spec.high)
            random_action = tf.random.uniform((1, self._action_spec.shape[0]), 
                                              self._action_spec.low, 
                                              self._action_spec.high)

            q = self.__call__(random_observation, random_action)
            print(q)

    @property
    def variables(self):
        try:
            self._build()
        except ValueError as e:
            traceback = sys.exc_info()[2]
            six.reraise(
                ValueError, "Failed to call build on the network when accessing "
                "variables. Message: {!r}.".format(e), traceback)
        return self.weights


In [9]:
critic_network = CriticNetwork(train_env.observation_space, train_env.action_space)
critic_variables = critic_network.variables
critic_network.summary()

tf.Tensor([[0.]], shape=(1, 1), dtype=float32)
Model: "critic_network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  256       
_________________________________________________________________
dense_4 (Dense)              multiple                  4160      
_________________________________________________________________
dense_5 (Dense)              multiple                  128       
_________________________________________________________________
add (Add)                    multiple                  0         
_________________________________________________________________
dense_6 (Dense)              multiple                  4160      
_________________________________________________________________
dense_7 (Dense)              multiple                  65        
Total params: 8,769
Trainable params: 8,769
Non-trainable params: 0
___

# Agent

In [10]:
class ActorCriticAgent:
    def __init__(self, 
                 observation_spec, 
                 action_spec, 
                 actor_network,
                 critic_network, 
                 actor_learning_rate = 3e-4, 
                 critic_learning_rate = 3e-4, 
                 gamma = 0.99): 
        
        self._train_step_counter = tf.compat.v2.Variable(0)
        self._observation_spec = observation_spec # shape, low, high
        self._action_spec = action_spec # shape, low, high
        
        self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=actor_learning_rate)
        
        self._actor_network = actor_network
        self._target_actor_network = actor_network.copy("actor-target")
        
        self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=critic_learning_rate)
        self._critic_network = critic_network
        self._target_critic_network = critic_network.copy("critic-target")
        
        self._gamma = gamma

    def train(self, batch): 
        no_state = np.zeros(self._observation_spec.shape[0])
        observations = tf.constant([ x[0] for x in batch ], dtype=tf.float32)
        actions = tf.constant([ x[1] for x in batch ], dtype=tf.float32)
        rewards = tf.expand_dims(tf.constant([ x[2] for x in batch ], dtype=tf.float32),axis=-1)
        next_observations = tf.constant([ (no_state if x[4] is True else x[3]) for x in batch ], dtype=tf.float32)
        
        # Compute Critic Loss and apply gradients.
        critic_variables = self._critic_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(critic_variables)
            critic_loss = self.critic_loss(observations, actions, rewards, next_observations)
        
        critic_grads = tape.gradient(critic_loss, critic_variables)
        critic_grads_and_vars = tuple(zip(critic_grads, critic_variables))
        self._critic_optimizer.apply_gradients(critic_grads_and_vars)
        
        # Compute Actor Loss and apply gradients
        actor_variables = self._actor_network.variables
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(actor_variables)
            actor_loss = self.actor_loss(observations)
            
        actor_grads = tape.gradient(actor_loss, actor_variables)
        actor_grads_and_vars = tuple(zip(actor_grads, actor_variables))
        self._actor_optimizer.apply_gradients(actor_grads_and_vars)

        # Increment counter and conditionally target networks.
        self._train_step_counter.assign_add(1)
        #self._update_target()
        
        total_loss = actor_loss + critic_loss
        return total_loss
        

    def critic_loss(self, observations, actions, rewards, next_observations):        
        with tf.name_scope('critic_loss'):
            target_actions = self._target_actor_network(observations)

            target_q_values = self._target_critic_network(next_observations, target_actions)

            td_targets = tf.stop_gradient(rewards + self._gamma * target_q_values)

            q_values = self._critic_network(observations, actions)

            critic_loss = tf.compat.v1.losses.mean_squared_error(td_targets, q_values)

            critic_loss = tf.reduce_mean(critic_loss)

            return critic_loss
                
    def actor_loss(self, observations):
        with tf.name_scope('actor_loss'):
            actions = self._actor_network(observations)
            
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(actions)
                q_values = self._critic_network(observations, actions)

            dqdas = tape.gradient([q_values], actions)

            actor_losses = []
            
            for dqda, action in zip(dqdas, actions):
                loss = tf.compat.v1.losses.mean_squared_error(tf.stop_gradient(dqda + action), 
                                                              action,
                                                             reduction=tf.compat.v1.losses.Reduction.NONE)
                loss = tf.reduce_mean(loss)
                actor_losses.append(loss)

            actor_loss = tf.add_n(actor_losses)
            
            return actor_loss

In [11]:
agent = ActorCriticAgent(train_env.observation_space, 
                         train_env.action_space, 
                         actor_network, 
                         critic_network)

In [13]:
done = False
observation = train_env.reset()
episode_rewards = 0
while not done:
    action = train_env.action_space.sample()
    next_observation, reward, done, _ = train_env.step(action)
    memory.add((observation, action, reward, next_observation, done))
    observation = next_observation
    episode_rewards += reward

print(episode_rewards)

-1521.9634213417144


In [21]:
def compute_avg_return(env, policy, num_episodes=10):

    total_return = 0.0
    for _ in range(num_episodes):

        done = False
        observation = env.reset()
        episode_return = 0.0
        
        while not done:
            action = env.action_space.sample() # TODO: replace with policy
            next_observation, reward, done, _ = env.step(action)
            memory.add((observation, action, reward, next_observation, done))
            observation = next_observation
            episode_return += reward

        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return

In [23]:
num_iterations = 2000
log_interval = 100
eval_interval = 1000
num_eval_episodes = 5

# Reset the train step
agent._train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
eval_policy = None
avg_return = compute_avg_return(eval_env, eval_policy, num_eval_episodes)
returns = [avg_return]


for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    #for _ in range(collect_steps_per_iteration):
    #    collect_driver.run()

    # Sample a batch of data from the buffer and update the agent's network.
    batch = memory.sample(32)
    train_loss = agent.train(batch)

    step = agent._train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, eval_policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

step = 100: loss = 28.36805534362793
step = 200: loss = 51.639686584472656
step = 300: loss = 45.86397933959961
step = 400: loss = 39.15007019042969
step = 500: loss = 50.80495071411133
step = 600: loss = 51.993736267089844
step = 700: loss = 60.535552978515625
step = 800: loss = 46.92090606689453
step = 900: loss = 33.07298278808594
step = 1000: loss = 42.93412780761719
step = 1000: Average Return = -1411.4911204163868
step = 1100: loss = 56.757286071777344
step = 1200: loss = 41.279640197753906
step = 1300: loss = 58.4522590637207


KeyboardInterrupt: 

In [None]:
for i in range (5):
    batch = memory.sample(32)
    loss = agent.train(batch)
    print(loss)