## Libraries

In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import time
from humanfriendly import format_timespan

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

## Hyperparameters

In [2]:
# Id of the gym environment handed to gym.make
ENV = 'CartPole-v0'

EPISODES = 5000 # Upper bound on the number of episodes played by the training loop

LOSS_CLIPPING = 0.2 # Clip range of the probability ratio in the PPO loss (approximate value stated in the original paper)
EPOCHS = 10 # Epochs used for every fit of the actor and the critic (recommended between 3 and 30)
NOISE = 1.0 # Standard deviation - NOTE(review): not referenced anywhere in the visible code

GAMMA = 0.99 # Discount factor applied when episode rewards are turned into returns

BUFFER_SIZE = 512 # Number of experiences gathered before each training update
BATCH_SIZE = 64 # Batch size for the neural nets
NUM_ACTIONS = 2 # Number of possible actions in the environment
NUM_STATES = 4 # Size of the state vector fed to the actor and the critic
ENTROPY_LOSS = 5e-3 # Weight of the entropy term in the PPO loss (entropy used for exploration)
LR = 1e-4  # Learning rate used by the Adam optimizers of both networks

# Placeholders fed to the actor's advantage and old-prediction inputs when it is only used to predict
DUMMY_ACTION, DUMMY_VALUE = np.zeros((1, NUM_ACTIONS)), np.zeros((1, 1))

## Memory class


In [3]:
# Memory of the experiences
class Memory:
    """Storage for the experiences gathered while the agent plays.

    Attributes:
        episode_batch: three parallel lists filled step by step during the
            current episode (the training loop stores states, one-hot actions
            and predicted action probabilities in slots 0, 1 and 2).
        global_batch: four parallel lists accumulated over finished episodes
            (same three slots plus the per-step rewards in slot 3).
        episode_rewards: rewards collected during the current episode.
        historical_rewards: one entry per finished episode; never cleared.
    """

    def __init__(self):
        # Every slot gets its own fresh list so the slots never alias each other
        self.episode_batch = [[] for _ in range(3)]
        self.global_batch = [[] for _ in range(4)]
        self.episode_rewards = []
        self.historical_rewards = []

    def reset_espisode_batch(self):
        """Empty the global batch, the episode batch and the episode rewards.

        The history of episode scores is deliberately left untouched.
        """
        self.global_batch = [[] for _ in range(4)]
        self.episode_batch = [[] for _ in range(3)]
        self.episode_rewards = []

## Custom loss function for PPO

In [8]:
# Custom loss functions for the PPO
def proximal_policy_optimization_loss(advantage, old_prediction):
    """Build the clipped-surrogate PPO loss used to train the actor.

    Args:
        advantage: tensor with one advantage per sample; the actor in this
            notebook feeds it from an ``Input(shape=(1,))`` layer, i.e. as a
            column with one entry per row of the batch.
        old_prediction: tensor with the action probabilities that were
            predicted when the experience was collected, one row per sample.

    Returns:
        A ``loss(y_true, y_pred)`` function suitable for ``Model.compile``,
        where ``y_true`` is the one-hot encoding of the action that was taken
        and ``y_pred`` is the current softmax output of the actor.
    """
    def loss(y_true, y_pred):
        # Probability of the taken action under the current and the old policy.
        # keepdims=True keeps both as (batch, 1) columns so they line up
        # element-wise with the (batch, 1) advantage; without it the products
        # below broadcast to (batch, batch) and mix every ratio with every
        # advantage.
        prob = K.sum(y_true * y_pred, axis=-1, keepdims=True)
        old_prob = K.sum(y_true * old_prediction, axis=-1, keepdims=True)

        # Probability ratio; the epsilon guards against division by zero
        r = prob / (old_prob + 1e-10)
        clipped_r = K.clip(r, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING)

        # Pessimistic (minimum) choice between the raw and the clipped objective
        surrogate = K.minimum(r * advantage, clipped_r * advantage)

        # Exploration bonus; it is computed only from the probability of the
        # action that was taken, not from the whole action distribution
        entropy_bonus = ENTROPY_LOSS * -(prob * K.log(prob + 1e-10))

        # Keras minimises, so the objective is negated
        return -K.mean(surrogate + entropy_bonus)
    return loss

## PPO Agent

In [5]:
class Agent:
    """PPO agent made of an actor network, a critic network and a Memory."""

    # Constructor of the class
    def __init__(self):
        self.actor = self.create_actor()
        self.critic = self.create_critic()
        self.memory = Memory()

    # Create the actor used to select the action given an state
    def create_actor(self):
        """Build and compile the policy network.

        The model takes three inputs - state, advantage and old prediction.
        Only the state goes through the dense layers; the other two exist so
        that the custom PPO loss can read them, which is why callers have to
        feed placeholders for them when they only want a prediction.

        Returns:
            The compiled Keras model whose output is a softmax over the
            NUM_ACTIONS actions.
        """
        # The inputs must match, in number and order, what get_action and the
        # training loop feed, and what the loss factory accepts as arguments
        input_state = Input(shape=(NUM_STATES,))
        input_advantage = Input(shape=(1,))
        input_old_prediction = Input(shape=(NUM_ACTIONS,))

        layer1 = Dense(128, activation='relu')(input_state)
        layer2 = Dense(128, activation='relu')(layer1)

        # Softmax as there are different probabilities depending on the action
        output_layer = Dense(NUM_ACTIONS, activation='softmax', name='output')(layer2)

        model = Model(inputs=[input_state, input_advantage, input_old_prediction], outputs=[output_layer])

        # Compile the model with the custom loss
        model.compile(optimizer=Adam(learning_rate=LR),
                      loss=[proximal_policy_optimization_loss(
                          advantage=input_advantage,
                          old_prediction=input_old_prediction)])
        model.summary()

        return model

    # Create the critic which will criticise how the actor is performing
    def create_critic(self):
        """Build and compile the value network.

        Returns:
            The compiled Keras model mapping a state to a single linear
            output, trained with mean squared error.
        """
        # Define the architecture of the network
        input_layer = Input(shape=(NUM_STATES,))
        layer1 = Dense(128, activation='relu')(input_layer)
        layer2 = Dense(128, activation='relu')(layer1)

        # Linear output to know how good the state is
        ouput_layer = Dense(1)(layer2)

        model = Model(inputs=[input_layer], outputs=[ouput_layer])

        # Compile it with mse loss and the Adam optimizer
        model.compile(optimizer=Adam(learning_rate=LR),
                      loss='mse')

        return model

    # Get the action given the current state
    def get_action(self, state):
        """Sample an action from the actor's prediction for ``state``.

        Args:
            state: array holding NUM_STATES values; it is reshaped into a
                single-row batch before being fed to the actor.

        Returns:
            A tuple ``(action, action_matrix, probabilities)``: the sampled
            action index, its one-hot encoding of length NUM_ACTIONS, and the
            raw output of ``actor.predict`` for the single-row batch.
        """
        # The advantage and old-prediction inputs are irrelevant for a forward
        # pass, so placeholders are fed for them
        probabilities = self.actor.predict([state.reshape(1, NUM_STATES), DUMMY_VALUE, DUMMY_ACTION])
        # nan_to_num replaces NaNs in the prediction before it is used as the sampling distribution
        action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(probabilities[0]))
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, probabilities

    # Transform rewards of the episode as the discount has to be applied in the first ones regarding the last ones
    def transform_reward(self):
        """Record the episode score and discount the episode rewards in place.

        The undiscounted sum of ``memory.episode_rewards`` is appended to
        ``memory.historical_rewards``; afterwards every entry j of
        ``memory.episode_rewards`` becomes reward[j] + GAMMA * entry[j + 1],
        computed from the last step backwards.
        """
        self.memory.historical_rewards.append(sum(self.memory.episode_rewards))

        # Walk backwards so that each entry can reuse the already discounted next one
        for j in range(len(self.memory.episode_rewards) - 2, -1, -1):
            self.memory.episode_rewards[j] += self.memory.episode_rewards[j + 1] * GAMMA

## Environment

In [6]:
# Create simulated environment
def create_environment():
    """Instantiate the gym environment named by ENV and return it."""
    return gym.make(ENV)

In [9]:
if __name__ == '__main__':
    # Training script: repeatedly fill the global batch with finished episodes,
    # then fit the actor and the critic on it. Stops when the episode counter
    # reaches EPISODES or when the mean score of the last 100 episodes is at
    # least 195. Finally plots the score of every episode and prints the
    # elapsed time.
    start = time.time()
    env = create_environment()
    ag = Agent()
    episode = 1
    goal_reached = False

    # Iterate until the number of episodes is reached
    while episode < EPISODES:
        # Reset episode and global batch and the environment to start a new one
        ag.memory.reset_espisode_batch()
        state = env.reset()

        # The global batch only grows when an episode ends, so this loop plays
        # whole episodes until at least BUFFER_SIZE steps have been gathered
        while len(ag.memory.global_batch[0]) < BUFFER_SIZE:
            # Get the action given the current state
            action, action_matrix, predicted_action = ag.get_action(state)
            # Step in the environment and gather all parameters
            # (relies on env.step returning four values)
            next_state, reward, done, info = env.step(action)
            ag.memory.episode_rewards.append(reward)
            # Add to the episode batch the information gathered
            ag.memory.episode_batch[0].append(state)
            ag.memory.episode_batch[1].append(action_matrix)
            ag.memory.episode_batch[2].append(predicted_action)
            state = next_state

            # If the episode has finished
            if done:
                # Store the episode score and turn the rewards into discounted returns
                ag.transform_reward()

                # Add all experiences of the episode to the global batch
                ag.memory.global_batch[0].extend(ag.memory.episode_batch[0])
                ag.memory.global_batch[1].extend(ag.memory.episode_batch[1])
                ag.memory.global_batch[2].extend(ag.memory.episode_batch[2])
                ag.memory.global_batch[3].extend(ag.memory.episode_rewards)

                # Reset the episode batch, episode reward and environment
                ag.memory.episode_batch = [[], [], []]
                ag.memory.episode_rewards = []
                state = env.reset()

                # Mean over the last 100 episodes once that many have been
                # played, otherwise over all episodes played so far
                history = ag.memory.historical_rewards
                if episode >= 100:
                    mean_score = sum(history[-100:])/100
                else:
                    mean_score = sum(history)/len(history)

                print('EPISODE: ', episode, ' - SCORE: ', history[-1], ' - MEAN SCORE: ', mean_score)

                # The stop criterion is only evaluated on a full 100-episode window
                if episode >= 100 and mean_score >= 195:
                    goal_reached = True
                    break

                episode +=1

        if goal_reached:
            break

        # Get the arrays to feed the neural nets
        obs = np.array(ag.memory.global_batch[0])
        action = np.array(ag.memory.global_batch[1])
        pred = np.array(ag.memory.global_batch[2])
        reward = np.reshape(np.array(ag.memory.global_batch[3]), (len(ag.memory.global_batch[3]), 1))
        # Every stored prediction is a single-row batch; drop that middle axis
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))

        # Ensure that they all have the same size
        obs, action, pred, reward = obs[:BUFFER_SIZE], action[:BUFFER_SIZE], pred[:BUFFER_SIZE], reward[:BUFFER_SIZE]
        old_prediction = pred
        pred_values = ag.critic.predict(obs)

        # Advantage: discounted returns observed in the environment minus the critic's estimates
        advantage = reward - pred_values

        # Update weights of the actor with the states, advantages obtained from the critic and predictions made. Target values are actions
        actor_loss = ag.actor.fit([obs, advantage, old_prediction], [action], batch_size=BATCH_SIZE, shuffle=True, epochs=EPOCHS, verbose=0)
        # Update weights of the critic with the states. Target values are the discounted returns
        critic_loss = ag.critic.fit([obs], [reward], batch_size=BATCH_SIZE, shuffle=True, epochs=EPOCHS, verbose=0)

    # Plot a graph with the rewards over the episodes
    plt.plot(ag.memory.historical_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Score")
    end = time.time()
    # `difference` is kept at module level because a later cell prints it again
    difference = end - start
    print(format_timespan(difference))

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 128)          640         input_4[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 128)          16512       dense_2[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________

[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
EPISODE:  30  - SCORE:  59.0  - MEAN SCORE:  26.433333333333334
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
EPISODE:  31  - SCORE:  19.0  - MEAN SCORE:  26.193548387096776
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]
[1. 0.]
[1. 0.]
[0. 1.]
EPISODE:  32  - SCORE:  

KeyboardInterrupt: 

In [11]:
# Re-print the elapsed training time; `difference` is only defined once the training cell above has run to its end
print(format_timespan(difference))

7 minutes and 31.55 seconds


In [8]:
# Not needed for CartPole
#!pip install gym[Box_2D]
#!pip install box2d-py
#!pip install pyglet
