# Note

In this tutorial we will see a few useful concepts:

- Eager execution: operations are executed immediately as they are called from Python. This makes research and development more intuitive.
- Model subclassing: we will subclass tf.keras.Model and customize our forward pass. Because of eager execution, the forward pass can be written imperatively.

We will need to:

- Create a master agent supervisor
- Create worker agents
- Implement A3C
- Train 

Paper: Asynchronous Methods for Deep Reinforcement Learning by Volodymyr Mnih et al. (2016)


# Task: CartPole



CartPole is a simple game in which the player needs to balance a pole connected to a cart by an un-actuated joint. The cart can move left or right. At the beginning, the cart position, cart velocity, pole angle, and pole angular velocity are randomly initialized between -0.05 and +0.05.

The agent can apply a force of +1 or -1 to the cart (which will move it left or right). The goal is to balance the pole by acting on the cart.

At every timestep in which the pole is upright, the agent receives a positive reward of +1. If the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center, the episode ends.



In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import threading
import gym
import multiprocessing
import numpy as np
from queue import Queue
import argparse
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import layers

# Switch TensorFlow to eager mode so that operations run immediately when
# they are called from Python (see the notes at the top of this notebook).
tf.enable_eager_execution()

# Enables the per-episode progress print inside record().
verbose = False

class ActorCriticModel(keras.Model):
    """Network with a policy head and a value head.

    Each head owns a 100-unit ReLU hidden layer, and both hidden layers read
    the raw input directly, so the two heads do not share any weights.

    Arguments:
      state_size: Stored on the instance as `state_size`.
      action_size: Number of units in the policy-logits output layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.state_size, self.action_size = state_size, action_size
        # Policy (actor) branch: hidden layer followed by one logit per action.
        self.dense1 = layers.Dense(100, activation='relu')
        self.policy_logits = layers.Dense(action_size)
        # Value (critic) branch: hidden layer followed by a single output.
        self.dense2 = layers.Dense(100, activation='relu')
        self.values = layers.Dense(1)

    def call(self, inputs):
        """Run the forward pass and return the pair `(logits, values)`."""
        actor_out = self.policy_logits(self.dense1(inputs))
        critic_out = self.values(self.dense2(inputs))
        return actor_out, critic_out


def record(episode, episode_reward, worker_idx, global_ep_reward, result_queue,
           total_loss, num_steps):
    """Update the moving-average reward, publish it, and optionally log it.

    Arguments:
      episode: Episode number, used only in the log line.
      episode_reward: Total reward obtained in the episode just finished.
      worker_idx: Index of the reporting worker, used only in the log line.
      global_ep_reward: Current moving average. A value of 0 is treated as
        "not initialised yet" and is replaced by `episode_reward`.
      result_queue: Queue that receives the updated moving average.
      total_loss: Accepted for interface compatibility; not used here.
      num_steps: Number of steps in the episode, used only in the log line.

    Returns:
      The updated moving-average reward.
    """
    if global_ep_reward == 0:
        # First report: seed the average with the observed reward.
        global_ep_reward = episode_reward
    else:
        # Exponential moving average with a 0.99 / 0.01 split.
        global_ep_reward = global_ep_reward * 0.99 + episode_reward * 0.01

    # Log every recorded episode. Previously this print lived inside the
    # `else` branch, so the episode that seeded the average was never shown.
    if verbose:
        print(f"Episode: {episode} | "
              f"Episode Reward: {int(episode_reward)} | "
              f"Steps: {num_steps} | "
              f"Worker: {worker_idx}")
    result_queue.put(global_ep_reward)
    return global_ep_reward


class RandomAgent(object):
    """Random Agent that will play the specified game
    Arguments:
      env_name: Name of the environment to be played
      max_eps: Maximum number of episodes to run agent for.
    """
    def __init__(self, env_name, max_eps):
        self.env = gym.make(env_name)
        self.max_episodes = max_eps
        self.global_moving_average_reward = 0
        self.res_queue = Queue()

    def run(self):
        """Play `max_episodes` episodes with uniformly sampled actions.

        After each episode the total reward is passed to `record`, which
        updates the moving average and pushes it onto `self.res_queue`.

        Returns:
          The mean total episode reward over all episodes played.
        """
        reward_avg = 0
        for episode in range(self.max_episodes):
            done = False
            self.env.reset()
            reward_sum = 0.0
            steps = 0
            while not done:
                # Sample randomly from the action space and step
                _, reward, done, _ = self.env.step(self.env.action_space.sample())
                steps += 1
                reward_sum += reward

            # Record statistics once per finished episode. Doing this inside
            # the step loop (as before) pushed a value for every step and
            # added every running partial sum to `reward_avg`, which inflated
            # the reported average.
            self.global_moving_average_reward = record(episode,
                                                       reward_sum,
                                                       0,
                                                       self.global_moving_average_reward,
                                                       self.res_queue, 0, steps)
            reward_avg += reward_sum

        final_avg = reward_avg / float(self.max_episodes)
        print("Average score across {} episodes: {}".format(self.max_episodes, final_avg))
        return final_avg


class MasterAgent(object):
    """Supervisor that owns the global model and drives the worker threads.

    Arguments:
      env_name: Name of the gym environment to train on.
      save_dir: Directory the model weights are written to / loaded from.
      algorithm: 'random' runs the `RandomAgent` baseline; any other value
        trains with the `Worker` threads.
      lr: Learning rate for the shared Adam optimizer.
      max_eps: Number of episodes to run (global across all workers).
    """

    def __init__(self,
                 env_name='CartPole-v0',
                 save_dir='.',
                 algorithm='A3C',
                 lr=0.001,
                 max_eps=500):
        self.game_name = env_name
        self.save_dir = save_dir
        self.algorithm = algorithm
        self.max_eps = max_eps

        # The environment is only needed here to read the space sizes, so
        # release it as soon as they are known.
        env = gym.make(self.game_name)
        try:
            self.state_size = env.observation_space.shape[0]
            self.action_size = env.action_space.n
        finally:
            env.close()
        self.opt = tf.train.AdamOptimizer(lr, use_locking=True)
        print(self.state_size, self.action_size)

        self.global_model = ActorCriticModel(
            self.state_size, self.action_size)  # global network
        # Call the model once on a dummy input so its variables are created.
        self.global_model(
            tf.convert_to_tensor(
                np.random.random((1, self.state_size)), dtype=tf.float32))

    def train(self):
        """Run the selected algorithm and plot the moving-average reward."""
        if self.algorithm == 'random':
            random_agent = RandomAgent(self.game_name, self.max_eps)
            random_agent.run()
            return

        res_queue = Queue()

        # One worker thread per CPU core, all sharing the global model,
        # the optimizer and the result queue.
        workers = [
            Worker(
                self.state_size,
                self.action_size,
                self.global_model,
                self.opt,
                res_queue,
                i,
                game_name=self.game_name,
                save_dir=self.save_dir,
                max_eps=self.max_eps)
            for i in range(multiprocessing.cpu_count())
        ]

        for i, worker in enumerate(workers):
            print("Starting worker {}".format(i))
            worker.start()

        # Wait for every worker before reading results. Each worker posts
        # its own `None` sentinel on exit, so stopping at the first sentinel
        # (as before) discarded the rewards reported by workers that were
        # still running. The queue is unbounded, so workers never block on
        # `put` while we wait here.
        for worker in workers:
            worker.join()

        moving_average_rewards = []  # record episode reward to plot
        # All producers have finished, so `empty()` is reliable here.
        while not res_queue.empty():
            reward = res_queue.get_nowait()
            if reward is not None:  # skip the per-worker end sentinels
                moving_average_rewards.append(reward)

        plt.plot(moving_average_rewards)
        plt.ylabel('Moving avg episode reward')
        plt.xlabel('Step')
        plt.show()

    def play(self):
        """Load the saved weights and play one greedy episode, printing each step."""
        env = gym.make(self.game_name).unwrapped
        try:
            state = env.reset()
            model = self.global_model
            model_path = os.path.join(self.save_dir,
                                      'model_{}.h5'.format(self.game_name))
            print('Loading model {}'.format(model_path))
            model.load_weights(model_path)
            done = False
            step_counter = 0
            reward_sum = 0

            while not done:
                env.render(mode='rgb_array')
                policy, value = model(
                    tf.convert_to_tensor(state[None, :], dtype=tf.float32))
                policy = tf.nn.softmax(policy)
                # Act greedily: pick the most probable action.
                action = np.argmax(policy)
                state, reward, done, _ = env.step(action)
                reward_sum += reward
                print("{}. Reward: {}, action: {}".format(
                    step_counter, reward_sum, action))
                step_counter += 1
        finally:
            # Release the environment even if loading or stepping fails.
            env.close()


class Memory:
    """Rollout buffer holding parallel lists of states, actions and rewards."""

    def __init__(self):
        # Start with three fresh, empty lists.
        self.clear()

    def store(self, state, action, reward):
        """Append one transition to the end of the buffer."""
        pairs = (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
        )
        for buffer, item in pairs:
            buffer.append(item)

    def clear(self):
        """Drop all stored transitions by rebinding to new empty lists."""
        self.states, self.actions, self.rewards = [], [], []


class Worker(threading.Thread):
    """Thread that plays episodes with a local model and updates the global one.

    Arguments:
      state_size: Size passed to the local `ActorCriticModel`.
      action_size: Number of discrete actions to sample from.
      global_model: Shared model that receives the gradient updates.
      opt: Shared optimizer used to apply gradients to `global_model`.
      result_queue: Queue that receives moving-average rewards and, when the
        worker stops, a single `None` sentinel.
      idx: Index of this worker, used for logging.
      max_eps: Global episode budget shared by all workers.
      game_name: Name of the gym environment to play.
      save_dir: Directory the best model weights are saved to.
      update_freq: Step-count threshold that triggers an update mid-episode.
      gamma: Discount factor for the returns.
    """
    # Set up global variables across different threads
    global_episode = 0
    # Moving average reward
    global_moving_average_reward = 0
    best_score = 0
    # Guards the shared class-level statistics above and the model save.
    save_lock = threading.Lock()

    def __init__(self,
                 state_size,
                 action_size,
                 global_model,
                 opt,
                 result_queue,
                 idx,
                 max_eps=500,
                 game_name='CartPole-v0',
                 save_dir='/tmp',
                 update_freq=20,
                 gamma=0.99):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.max_eps = max_eps
        self.game_name = game_name
        self.env = gym.make(self.game_name).unwrapped
        self.save_dir = save_dir
        self.update_freq = update_freq
        self.gamma = gamma
        self.ep_loss = 0.0

    def run(self):
        """Play episodes until the shared episode counter reaches `max_eps`.

        A `None` sentinel is always posted to `result_queue` on exit, even if
        the loop raises, so a consumer blocked on the queue is not left
        waiting for a worker that has died.
        """
        mem = Memory()
        try:
            while Worker.global_episode < self.max_eps:
                current_state = self.env.reset()
                mem.clear()
                ep_reward = 0.
                ep_steps = 0
                self.ep_loss = 0

                time_count = 0
                done = False
                while not done:
                    logits, _ = self.local_model(
                        tf.convert_to_tensor(
                            current_state[None, :], dtype=tf.float32))
                    probs = tf.nn.softmax(logits)

                    # Sample an action from the current policy.
                    action = np.random.choice(
                        self.action_size, p=probs.numpy()[0])
                    new_state, reward, done, _ = self.env.step(action)
                    # Count the step now so the number reported at the end of
                    # the episode includes the final step.
                    ep_steps += 1
                    if done:
                        # Penalise the terminating step.
                        reward = -1
                    ep_reward += reward
                    mem.store(current_state, action, reward)

                    if time_count == self.update_freq or done:
                        # Calculate gradient wrt to local model. We do so by
                        # tracking the variables involved in computing the
                        # loss by using tf.GradientTape
                        with tf.GradientTape() as tape:
                            total_loss = self.compute_loss(
                                done, new_state, mem, self.gamma)
                        self.ep_loss += total_loss
                        # Calculate local gradients
                        grads = tape.gradient(
                            total_loss, self.local_model.trainable_weights)
                        # Push local gradients to global model
                        self.opt.apply_gradients(
                            zip(grads, self.global_model.trainable_weights))
                        # Update local model with new weights
                        self.local_model.set_weights(
                            self.global_model.get_weights())

                        mem.clear()
                        time_count = 0

                        if done:
                            self._finish_episode(ep_reward, ep_steps)

                    time_count += 1
                    current_state = new_state
        finally:
            self.result_queue.put(None)

    def _finish_episode(self, ep_reward, ep_steps):
        """Publish episode statistics and save the model on a new best score."""
        # Everything below reads and writes class-level state shared by all
        # workers. Holding the lock for the whole read-modify-write keeps a
        # worker with a lower score from passing the `best_score` check just
        # before another worker raises it and then overwriting the better
        # checkpoint.
        with Worker.save_lock:
            Worker.global_moving_average_reward = record(
                Worker.global_episode, ep_reward, self.worker_idx,
                Worker.global_moving_average_reward, self.result_queue,
                self.ep_loss, ep_steps)
            if ep_reward > Worker.best_score:
                print("Saving best model to {}, "
                      "episode score: {}".format(
                          self.save_dir, ep_reward))
                self.global_model.save_weights(
                    os.path.join(
                        self.save_dir,
                        'model_{}.h5'.format(self.game_name)))
                Worker.best_score = ep_reward
            Worker.global_episode += 1

    def compute_loss(self, done, new_state, memory, gamma=0.99):
        """Compute the actor-critic loss for the transitions in `memory`.

        Arguments:
          done: Whether `new_state` is terminal; if so the bootstrap value is 0.
          new_state: State reached after the last stored transition.
          memory: Buffer with `states`, `actions` and `rewards` lists; it must
            hold at least one transition.
          gamma: Discount factor.

        Returns:
          Scalar tensor: mean over steps of
          `0.5 * value_loss + policy_loss`, where the policy loss includes a
          `0.01`-weighted entropy bonus.
        """
        if done:
            reward_sum = 0.  # terminal
        else:
            # Bootstrap from the critic. The value head outputs shape (1, 1);
            # take the scalar so the returns below stay one-dimensional.
            bootstrap = self.local_model(
                tf.convert_to_tensor(new_state[None, :],
                                     dtype=tf.float32))[-1]
            reward_sum = float(bootstrap.numpy()[0, 0])

        # Get discounted rewards
        discounted_rewards = []
        for reward in reversed(memory.rewards):
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()

        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states), dtype=tf.float32))
        # Flatten the value head from (T, 1) to (T,) so that every per-step
        # term has the same shape. Mixing (T, 1) with the (T,) cross-entropy
        # broadcasts to (T, T), which pairs each step's log-probability with
        # every step's advantage.
        values = tf.squeeze(values, axis=1)
        returns = tf.convert_to_tensor(
            np.array(discounted_rewards, dtype=np.float32), dtype=tf.float32)

        # Get our advantages
        advantage = returns - values
        # Value loss
        value_loss = advantage**2

        # Calculate our policy loss
        policy = tf.nn.softmax(logits)
        # Cross-entropy of the policy with itself is its entropy.
        entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=policy, logits=logits)

        # Negative log-probability of the actions actually taken.
        policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=memory.actions, logits=logits)
        # The advantage only weights the policy term; it must not send
        # gradients into the critic through this path.
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
        return total_loss


# Build the master agent with its default environment and settings, capped at
# 500 episodes, then start training.
master = MasterAgent(max_eps=500)
master.train()

4 2
Starting worker 0
Starting worker 1
Starting worker 2
Starting worker 3
Starting worker 4
Starting worker 5
Starting worker 6
Starting worker 7
Saving best model to ., episode score: 12.0Saving best model to ., episode score: 16.0

Saving best model to ., episode score: 19.0
Saving best model to ., episode score: 17.0
Saving best model to ., episode score: 22.0
Saving best model to ., episode score: 34.0
Saving best model to ., episode score: 39.0
Saving best model to ., episode score: 45.0
Saving best model to ., episode score: 60.0
Saving best model to ., episode score: 94.0
Saving best model to ., episode score: 96.0
Saving best model to ., episode score: 104.0
Saving best model to ., episode score: 111.0
Saving best model to ., episode score: 139.0
Saving best model to ., episode score: 149.0
Saving best model to ., episode score: 199.0
Saving best model to ., episode score: 211.0
Saving best model to ., episode score: 226.0
Saving best model to ., episode score: 315.0
Saving b

In [2]:
# Load the saved weights from the save directory and play one episode.
master.play()

Loading model from: ./model_CartPole-v0.h5
0. Reward: 1.0, action: 1
1. Reward: 2.0, action: 0
2. Reward: 3.0, action: 1
3. Reward: 4.0, action: 0
4. Reward: 5.0, action: 1
5. Reward: 6.0, action: 0
6. Reward: 7.0, action: 1
7. Reward: 8.0, action: 0
8. Reward: 9.0, action: 1
9. Reward: 10.0, action: 0
10. Reward: 11.0, action: 1
11. Reward: 12.0, action: 0
12. Reward: 13.0, action: 1
13. Reward: 14.0, action: 0
14. Reward: 15.0, action: 1
15. Reward: 16.0, action: 0
16. Reward: 17.0, action: 1
17. Reward: 18.0, action: 0
18. Reward: 19.0, action: 1
19. Reward: 20.0, action: 0
20. Reward: 21.0, action: 1
21. Reward: 22.0, action: 1
22. Reward: 23.0, action: 0
23. Reward: 24.0, action: 1
24. Reward: 25.0, action: 0
25. Reward: 26.0, action: 1
26. Reward: 27.0, action: 0
27. Reward: 28.0, action: 1
28. Reward: 29.0, action: 0
29. Reward: 30.0, action: 1
30. Reward: 31.0, action: 0
31. Reward: 32.0, action: 1
32. Reward: 33.0, action: 0
33. Reward: 34.0, action: 1
34. Reward: 35.0, action

284. Reward: 285.0, action: 0
285. Reward: 286.0, action: 1
286. Reward: 287.0, action: 0
287. Reward: 288.0, action: 1
288. Reward: 289.0, action: 1
289. Reward: 290.0, action: 0
290. Reward: 291.0, action: 1
291. Reward: 292.0, action: 0
292. Reward: 293.0, action: 1
293. Reward: 294.0, action: 0
294. Reward: 295.0, action: 1
295. Reward: 296.0, action: 0
