In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
import time

In [2]:

def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class Buffer:
    # Buffer for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # Buffer initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )


def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    # Build a feedforward neural network
    for size in sizes[:-1]:
        x = layers.Dense(units=size, activation=activation)(x)
    return layers.Dense(units=sizes[-1], activation=output_activation)(x)


def logprobabilities(logits, a):
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = tf.nn.log_softmax(logits)
    logprobability = tf.reduce_sum(
        tf.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    return logprobability


# Sample action from actor
@tf.function
def sample_action(observation):
    logits = actor(observation)
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    return logits, action


# Train the policy by maxizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):

    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = tf.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        min_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = tf.reduce_mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = tf.reduce_sum(kl)
    return kl


# Train the value function by regression on mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        value_loss = tf.reduce_mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))


In [3]:
# Hyperparameters of the PPO algorithm
steps_per_epoch = 4000
epochs = 300
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (64, 64)

# True if you want to render the environment
render = False

In [4]:
# Initialize the environment and get the dimensionality of the
# observation space and the number of possible actions
env = gym.make("CartPole-v0")
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# Initialize the buffer
buffer = Buffer(observation_dimensions, steps_per_epoch)

# Initialize the actor and the critic as keras models
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
value = tf.squeeze(
    mlp(observation_input, list(hidden_sizes) + [1], tf.tanh, None), axis=1
)
critic = keras.Model(inputs=observation_input, outputs=value)

# Initialize the policy and the value function optimizers
policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

In [5]:
# Iterate over the number of epochs
scores = []
start = time.time()
num_episodes = 1
fin = False
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes for each epoch

    

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):
        if render:
            env.render()

        # Get the logits, action, and take one step in the environment
        observation = observation.reshape(1, -1)
        logits, action = sample_action(observation)
        observation_new, reward, done, _ = env.step(action[0].numpy())
        episode_return += reward
        episode_length += 1

        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action, reward, value_t, logprobability_t)

        # Update the observation
        observation = observation_new

        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal:
            last_value = 0 if done else critic(observation.reshape(1, -1))
            buffer.finish_trajectory(last_value)
            scores.append(episode_return)
            if num_episodes >= 100:
                average = sum(scores[-100:])/100
                print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(num_episodes, episode_return, average))
                if average > 195:
                    fin = True
                    break
            else:
                average = sum(scores)/len(scores)
                print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(num_episodes, episode_return, average))
                
            num_episodes += 1
            observation, episode_return, episode_length = env.reset(), 0, 0
    if fin:
        break
    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = buffer.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)
print((time.time()-start)/60)

Episode:     1		score:   27.00		average:   27.00
Episode:     2		score:   26.00		average:   26.50
Episode:     3		score:   21.00		average:   24.67
Episode:     4		score:   12.00		average:   21.50
Episode:     5		score:   22.00		average:   21.60
Episode:     6		score:    8.00		average:   19.33
Episode:     7		score:   24.00		average:   20.00
Episode:     8		score:   22.00		average:   20.25
Episode:     9		score:   86.00		average:   27.56
Episode:    10		score:   56.00		average:   30.40
Episode:    11		score:   18.00		average:   29.27
Episode:    12		score:   12.00		average:   27.83
Episode:    13		score:   16.00		average:   26.92
Episode:    14		score:   20.00		average:   26.43
Episode:    15		score:   24.00		average:   26.27
Episode:    16		score:   12.00		average:   25.38
Episode:    17		score:   13.00		average:   24.65
Episode:    18		score:   30.00		average:   24.94
Episode:    19		score:   15.00		average:   24.42
Episode:    20		score:   20.00		average:   24.20
Episode:    21		scor

Episode:   169		score:   15.00		average:   18.14
Episode:   170		score:   23.00		average:   18.25
Episode:   171		score:    9.00		average:   18.15
Episode:   172		score:   19.00		average:   17.99
Episode:   173		score:   11.00		average:   17.96
Episode:   174		score:   16.00		average:   17.96
Episode:   175		score:   40.00		average:   18.25
Episode:   176		score:   27.00		average:   18.37
Episode:   177		score:   21.00		average:   18.42
Episode:   178		score:   20.00		average:   18.51
Episode:   179		score:   12.00		average:   18.54
Episode:   180		score:   15.00		average:   18.59
Episode:   181		score:   19.00		average:   18.69
Episode:   182		score:   25.00		average:   18.62
Episode:   183		score:   16.00		average:   18.55
Episode:   184		score:   19.00		average:   18.54
Episode:   185		score:   24.00		average:   18.60
Episode:   186		score:   14.00		average:   18.58
Episode:   187		score:   10.00		average:   18.51
Episode:   188		score:   26.00		average:   18.56
Episode:   189		scor

Episode:   338		score:   15.00		average:   22.84
Episode:   339		score:   17.00		average:   22.81
Episode:   340		score:   28.00		average:   22.95
Episode:   341		score:   20.00		average:   23.04
Episode:   342		score:   19.00		average:   22.75
Episode:   343		score:   53.00		average:   23.09
Episode:   344		score:   30.00		average:   23.17
Episode:   345		score:   16.00		average:   23.08
Episode:   346		score:   23.00		average:   23.16
Episode:   347		score:   12.00		average:   22.96
Episode:   348		score:   16.00		average:   22.79
Episode:   349		score:   11.00		average:   22.65
Episode:   350		score:   21.00		average:   22.71
Episode:   351		score:   14.00		average:   22.51
Episode:   352		score:   17.00		average:   22.54
Episode:   353		score:   17.00		average:   22.50
Episode:   354		score:   35.00		average:   22.35
Episode:   355		score:   23.00		average:   22.46
Episode:   356		score:   23.00		average:   22.56
Episode:   357		score:   14.00		average:   22.27
Episode:   358		scor

Episode:   506		score:   27.00		average:   26.54
Episode:   507		score:   21.00		average:   26.61
Episode:   508		score:   25.00		average:   26.75
Episode:   509		score:   20.00		average:   26.44
Episode:   510		score:   18.00		average:   26.50
Episode:   511		score:   55.00		average:   26.72
Episode:   512		score:   14.00		average:   26.55
Episode:   513		score:   25.00		average:   26.46
Episode:   514		score:   26.00		average:   26.55
Episode:   515		score:   37.00		average:   26.20
Episode:   516		score:   19.00		average:   26.07
Episode:   517		score:   33.00		average:   26.22
Episode:   518		score:   98.00		average:   26.99
Episode:   519		score:   24.00		average:   26.98
Episode:   520		score:   38.00		average:   27.17
Episode:   521		score:   26.00		average:   27.06
Episode:   522		score:   23.00		average:   26.90
Episode:   523		score:   13.00		average:   26.83
Episode:   524		score:   39.00		average:   26.90
Episode:   525		score:   20.00		average:   26.88
Episode:   526		scor

Episode:   674		score:   50.00		average:   53.64
Episode:   675		score:   77.00		average:   54.27
Episode:   676		score:   41.00		average:   54.56
Episode:   677		score:   52.00		average:   54.75
Episode:   678		score:   48.00		average:   54.94
Episode:   679		score:   55.00		average:   55.21
Episode:   680		score:   74.00		average:   55.71
Episode:   681		score:   92.00		average:   56.24
Episode:   682		score:   41.00		average:   56.34
Episode:   683		score:   64.00		average:   56.65
Episode:   684		score:   30.00		average:   55.61
Episode:   685		score:   83.00		average:   55.55
Episode:   686		score:   35.00		average:   55.46
Episode:   687		score:   41.00		average:   55.11
Episode:   688		score:   73.00		average:   55.49
Episode:   689		score:   33.00		average:   55.14
Episode:   690		score:   17.00		average:   54.97
Episode:   691		score:   33.00		average:   54.52
Episode:   692		score:  131.00		average:   55.62
Episode:   693		score:  138.00		average:   56.33
Episode:   694		scor

Episode:   842		score:  200.00		average:  167.50
Episode:   843		score:  200.00		average:  168.57
Episode:   844		score:  200.00		average:  169.26
Episode:   845		score:  200.00		average:  170.81
Episode:   846		score:  200.00		average:  171.48
Episode:   847		score:  200.00		average:  171.83
Episode:   848		score:  200.00		average:  172.10
Episode:   849		score:  200.00		average:  173.35
Episode:   850		score:  200.00		average:  173.53
Episode:   851		score:  200.00		average:  174.73
Episode:   852		score:  200.00		average:  176.16
Episode:   853		score:  200.00		average:  177.12
Episode:   854		score:  200.00		average:  177.32
Episode:   855		score:  200.00		average:  178.61
Episode:   856		score:  200.00		average:  179.23
Episode:   857		score:  200.00		average:  180.65
Episode:   858		score:  200.00		average:  181.06
Episode:   859		score:  200.00		average:  182.41
Episode:   860		score:  200.00		average:  183.85
Episode:   861		score:  200.00		average:  185.34
Episode:   862		scor