Fewer epochs

In [1]:
import os

import random
import gym
import pylab
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt

#tf.config.experimental_run_functions_eagerly(True)
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

# --- Run control ---
EPISODES = 10_000  # Training stops once this many episodes have been played

# --- PPO objective ---
LOSS_CLIPPING = 0.2  # Clipping range of the probability ratio (approximate value from the original paper)
ENTROPY_LOSS = 1e-3  # Weight of the entropy bonus inside the actor loss
GAMMA = 0.99  # Discount factor used when estimating returns
LAMBDA = 0.95  # Smoothing factor of the Generalized Advantage Estimation (GAE)
NORMALIZE = True  # Whether to standardize the GAE advantages

# --- Environment dimensions ---
NUM_ACTIONS = 2  # Number of discrete actions available in the environment
NUM_STATES = 4  # Length of the state (observation) vector fed to the networks

# --- Network training ---
OPTIMIZER = Adam  # Optimizer class shared by the actor and the critic
LR = 2e-2  # Initial learning rate
EPOCHS = 3  # Epochs per network update (recommended between 3 and 30)
BATCH_SIZE = 64  # Mini-batch size used when fitting the networks
BUFFER_SIZE = 2048  # Number of environment steps collected before each update
SHUFFLE = True  # Whether to shuffle the collected data while training


# Create the actor used to select the action given a state
class Actor_Model:
    """Policy network: maps a state vector to a softmax distribution over actions.

    The compiled Keras model is exposed as ``self.model`` and is trained with
    the clipped PPO surrogate loss defined in :meth:`ppo_loss`.
    """

    def __init__(self):
        """Build the three-hidden-layer policy network and compile it."""
        # ``shape`` must be a tuple; a bare int is not accepted by every Keras release.
        X_input = Input(shape=(NUM_STATES,))

        # Small initial weights keep the initial policy close to uniform.
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X_input)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)
        X = Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer(stddev=0.01))(X)

        # Softmax as the output is one probability per action
        output = Dense(NUM_ACTIONS, activation="softmax")(X)

        # Compile the model with the custom loss.
        # ``learning_rate`` replaces the deprecated ``lr`` keyword.
        self.model = Model(inputs=X_input, outputs=output)
        self.model.compile(loss=self.ppo_loss, optimizer=OPTIMIZER(learning_rate=LR))

    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO surrogate loss with an entropy bonus.

        Args:
            y_true: Tensor whose columns are, in order: the advantage (1
                column), the action probabilities of the policy that collected
                the data (``NUM_ACTIONS`` columns) and the one-hot encoded
                action that was taken (``NUM_ACTIONS`` columns).
            y_pred: Action probabilities produced by the current policy.

        Returns:
            Scalar tensor: negative clipped surrogate objective minus the
            weighted entropy of the predicted distribution.
        """
        # Unpack the elements packed in the true label
        advantages = y_true[:, :1]
        old_policy = y_true[:, 1:1 + NUM_ACTIONS]
        actions = y_true[:, 1 + NUM_ACTIONS:]

        # Probability of the action actually taken, under the new and the old
        # policy. Reducing over the action axis keeps one ratio per sample;
        # without it every untaken action contributes a constant ratio of 1
        # that only dilutes the averaged objective.
        prob = K.sum(actions * y_pred, axis=-1, keepdims=True)
        old_prob = K.sum(actions * old_policy, axis=-1, keepdims=True)

        # exp(log a - log b) == a / b; the epsilon guards against log(0)
        ratio = K.exp(K.log(prob + 1e-10) - K.log(old_prob + 1e-10))

        unclipped = ratio * advantages
        clipped = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

        actor_loss = -K.mean(K.minimum(unclipped, clipped))

        # The entropy bonus is subtracted so that minimizing the loss favours exploration
        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = ENTROPY_LOSS * K.mean(entropy)

        return actor_loss - entropy

# Create the critic which will criticise how the actor is performing    
class Critic_Model:
    """Value network: maps a state vector to a single scalar value estimate.

    The compiled Keras model is exposed as ``self.model``.
    """

    def __init__(self):
        """Build the three-hidden-layer value network and compile it with MSE loss."""
        # ``shape`` must be a tuple; a bare int is not accepted by every Keras release.
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X_input)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        # Linear output: the estimated value of the state
        value = Dense(1)(X)

        # Compile with mean-squared-error loss and the shared optimizer.
        # ``learning_rate`` replaces the deprecated ``lr`` keyword.
        self.model = Model(inputs=X_input, outputs=value)
        self.model.compile(loss='mse', optimizer=OPTIMIZER(learning_rate=LR))

# Combine both Actor and Critic to create the agent
class PPOAgent:
    """PPO agent combining an actor and a critic, trained on a gym environment."""

    def __init__(self, env_name):
        """Create the environment, both networks and the training bookkeeping.

        Args:
            env_name: Identifier passed to ``gym.make``; also used as the
                prefix of the weight-file names.
        """
        # Environment parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.episode = 0  # number of episodes finished since start
        self.max_average = 0  # best 100-episode average reached so far

        # Per-episode scores, used for the running average and the final plot
        self.scores_, self.average_ = [], []

        # Create Actor-Critic network models
        self.Actor = Actor_Model()
        self.Critic = Critic_Model()

        # File names used by save() / load()
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"

    def _reset_state(self):
        """Reset the environment and return the first state shaped ``[1, NUM_STATES]``."""
        # NOTE(review): assumes the classic gym API where reset() returns only
        # the observation - confirm against the installed gym version.
        return np.reshape(self.env.reset(), [1, NUM_STATES])

    def act(self, state):
        """Sample an action from the actor's predicted distribution.

        Args:
            state: State batch given to the actor; only the first prediction
                is used.

        Returns:
            Tuple ``(action, action_onehot, prediction)``: the sampled action
            index, its one-hot encoding and the predicted probabilities.
        """
        prediction = self.Actor.model.predict(state)[0]

        # Sample according to the predicted probabilities
        action = np.random.choice(NUM_ACTIONS, p=prediction)
        action_onehot = np.zeros([NUM_ACTIONS])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    def get_gaes(self, rewards, dones, values, next_values):
        """Compute Generalized Advantage Estimates and the critic targets.

        Args:
            rewards: Reward of every collected step.
            dones: Terminal flag of every step; a terminal step neither
                bootstraps from the next value nor accumulates later deltas.
            values: Critic value of every state (one-dimensional).
            next_values: Critic value of every next state (one-dimensional).

        Returns:
            Tuple ``(advantages, target)`` of column vectors. The advantages
            are standardized when ``NORMALIZE`` is set; the targets are the
            un-normalized advantages plus ``values``.
        """
        deltas = np.stack([
            r + GAMMA * (1 - d) * nv - v
            for r, d, nv, v in zip(rewards, dones, next_values, values)
        ])
        gaes = deltas.copy()

        # Backward pass: accumulate discounted deltas within each episode
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * GAMMA * LAMBDA * gaes[t + 1]

        # Targets are built before normalization so they stay on the value scale
        target = gaes + values
        if NORMALIZE:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def replay(self, states, actions, rewards, predictions, dones, next_states):
        """Fit the actor and the critic on one buffer of collected transitions.

        Args:
            states: Visited states.
            actions: One-hot encoded actions taken.
            rewards: Rewards received.
            predictions: Actor probabilities at the time each action was taken.
            dones: Terminal flags.
            next_states: States reached after each action.
        """
        # Reshape memory to appropriate shape for training
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)

        # ravel() instead of squeeze(): a single-transition buffer must stay
        # one-dimensional, otherwise it collapses to a 0-d array that
        # get_gaes cannot iterate over.
        values = self.Critic.model.predict(states).ravel()
        next_values = self.Critic.model.predict(next_states).ravel()

        # Get the advantage
        advantages, target = self.get_gaes(rewards, dones, values, next_values)

        # Stack info so the actor's custom loss can unpack it
        y_true = np.hstack([advantages, predictions, actions])

        # Training Actor and Critic networks
        self.Actor.model.fit(states, y_true, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)
        self.Critic.model.fit(states, target, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)

    def load(self):
        """Load the actor and critic weights from their weight files."""
        # The Keras models live in the ``model`` attribute of the wrappers
        self.Actor.model.load_weights(self.Actor_name)
        self.Critic.model.load_weights(self.Critic_name)

    def save(self):
        """Save the actor and critic weights to their weight files."""
        self.Actor.model.save_weights(self.Actor_name)
        self.Critic.model.save_weights(self.Critic_name)

    def run_batch(self):
        """Train by collecting ``BUFFER_SIZE`` steps and then updating both networks.

        Training stops when the average score of the last 100 episodes exceeds
        195 (the scores are then plotted) or once ``EPISODES`` episodes have
        been played. Every new best 100-episode average decays the
        module-level ``LR`` by 5% and, above 150, saves the weights.
        """
        global LR
        state = self._reset_state()
        score = 0
        finished = False
        while not finished:
            # Instantiate or reset the buffer memory
            states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
            for _ in range(BUFFER_SIZE):
                # Actor picks an action
                action, action_onehot, prediction = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal.
                # NOTE(review): assumes step() returns a 4-tuple (classic gym API).
                next_state, reward, done, _info = self.env.step(action)
                next_state = np.reshape(next_state, [1, NUM_STATES])
                # Memorize the transition for training
                states.append(state)
                next_states.append(next_state)
                actions.append(action_onehot)
                rewards.append(reward)
                dones.append(done)
                predictions.append(prediction)
                # Update current state
                state = next_state
                score += reward
                if not done:
                    continue

                self.episode += 1
                self.scores_.append(score)
                if self.episode >= 100:
                    average = sum(self.scores_[-100:]) / 100
                    print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, average))
                    if average > self.max_average:
                        self.max_average = average
                        if self.max_average > 150:
                            self.save()
                        # Decay the learning rate of both optimizers on every new best average
                        LR *= 0.95
                        K.set_value(self.Actor.model.optimizer.learning_rate, LR)
                        K.set_value(self.Critic.model.optimizer.learning_rate, LR)

                    if average > 195:
                        plt.plot(self.scores_)
                        plt.xlabel("Episode")
                        plt.ylabel("Score")
                        finished = True
                        break
                else:
                    # Fewer than 100 episodes so far: average over all of them
                    print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, sum(self.scores_)/len(self.scores_)))

                state, score = self._reset_state(), 0

            self.replay(states, actions, rewards, predictions, dones, next_states)
            if self.episode >= EPISODES:
                break
        self.env.close()
                    
if __name__ == "__main__":
    # Train a PPO agent on CartPole, updating the networks once per collected
    # buffer, then report how long the whole run took.
    start = time.time()
    env_name = 'CartPole-v0'
    agent = PPOAgent(env_name)
    agent.run_batch()
    # Elapsed wall-clock time, in minutes
    elapsed_minutes = (time.time() - start) / 60
    print(elapsed_minutes)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode:     1		score:   17.00		average:   17.00
Episode:     2		score:   27.00		average:   22.00
Episode:     3		score:   36.00		average:   26.67
Episode:     4		score:   21.00		average:   25.25
Episode:     5		score:   14.00		average:   23.00
Episode:     6		score:   30.00		average:   24.17
Episode:     7		score:   24.00		average:   24.14
Episode:     8		score:   23.00		average:   24.00
Episode:     9		score:   15.00		average:   23.00
Episode:    10		score:   16.00		average:   22.30
Episode:    11		score:   14.00		average:   21.55
Episode:    12		score:   29.00		average:   22.17
Episode:    13		score:   12.00		average:   21.38
Episode:    14		score:   20.00		average:   21.29
Episode:    15		score:   10.00		average:   20.53
Episode:    16		score:   13.00		average:   20.06
Episode:    17		score:   25.00		average:   20.35
Episode:    18		score:   13.00		average:   19.94
Episode:    19		score:    9.00		avera

Episode:   166		score:   17.00		average:   22.50
Episode:   167		score:   12.00		average:   22.31
Episode:   168		score:   10.00		average:   22.24
Episode:   169		score:   28.00		average:   22.24
Episode:   170		score:   13.00		average:   22.16
Episode:   171		score:   18.00		average:   22.12
Episode:   172		score:   30.00		average:   22.28
Episode:   173		score:   17.00		average:   22.28
Episode:   174		score:   18.00		average:   22.33
Episode:   175		score:   13.00		average:   22.09
Episode:   176		score:   22.00		average:   22.11
Episode:   177		score:   45.00		average:   22.29
Episode:   178		score:   11.00		average:   22.15
Episode:   179		score:   18.00		average:   22.23
Episode:   180		score:   16.00		average:   22.27
Episode:   181		score:   20.00		average:   22.29
Episode:   182		score:   20.00		average:   22.32
Episode:   183		score:   35.00		average:   22.51
Episode:   184		score:   17.00		average:   22.52
Episode:   185		score:   11.00		average:   22.49
Episode:   186		scor

Episode:   338		score:   29.00		average:   22.94
Episode:   339		score:   28.00		average:   22.98
Episode:   340		score:   13.00		average:   22.73
Episode:   341		score:   13.00		average:   22.70
Episode:   342		score:   14.00		average:   22.62
Episode:   343		score:   14.00		average:   22.47
Episode:   344		score:   24.00		average:   22.61
Episode:   345		score:   23.00		average:   22.71
Episode:   346		score:   19.00		average:   22.43
Episode:   347		score:   33.00		average:   22.65
Episode:   348		score:   24.00		average:   22.67
Episode:   349		score:   13.00		average:   22.66
Episode:   350		score:   17.00		average:   22.64
Episode:   351		score:   44.00		average:   22.92
Episode:   352		score:   23.00		average:   22.94
Episode:   353		score:   19.00		average:   22.93
Episode:   354		score:   52.00		average:   23.03
Episode:   355		score:   21.00		average:   23.02
Episode:   356		score:   36.00		average:   23.10
Episode:   357		score:   11.00		average:   23.06
Episode:   358		scor

Episode:   510		score:   17.00		average:   20.49
Episode:   511		score:   12.00		average:   20.48
Episode:   512		score:   19.00		average:   20.52
Episode:   513		score:   18.00		average:   20.51
Episode:   514		score:   14.00		average:   20.40
Episode:   515		score:   20.00		average:   20.48
Episode:   516		score:   75.00		average:   21.09
Episode:   517		score:   21.00		average:   21.03
Episode:   518		score:   22.00		average:   21.13
Episode:   519		score:   11.00		average:   21.01
Episode:   520		score:   14.00		average:   20.94
Episode:   521		score:   11.00		average:   20.95
Episode:   522		score:   16.00		average:   21.00
Episode:   523		score:    9.00		average:   20.89
Episode:   524		score:   16.00		average:   20.91
Episode:   525		score:   21.00		average:   20.78
Episode:   526		score:   31.00		average:   20.93
Episode:   527		score:   17.00		average:   21.02
Episode:   528		score:   16.00		average:   21.06
Episode:   529		score:   11.00		average:   21.08
Episode:   530		scor

Episode:   680		score:   11.00		average:   19.93
Episode:   681		score:   20.00		average:   19.98
Episode:   682		score:   16.00		average:   19.98
Episode:   683		score:   25.00		average:   20.06
Episode:   684		score:    9.00		average:   19.90
Episode:   685		score:   44.00		average:   20.23
Episode:   686		score:   14.00		average:   20.18
Episode:   687		score:   18.00		average:   19.95
Episode:   688		score:   17.00		average:   20.02
Episode:   689		score:   31.00		average:   20.15
Episode:   690		score:   13.00		average:   20.07
Episode:   691		score:   48.00		average:   20.45
Episode:   692		score:   12.00		average:   20.43
Episode:   693		score:   18.00		average:   20.45
Episode:   694		score:   17.00		average:   20.49
Episode:   695		score:   17.00		average:   20.41
Episode:   696		score:   40.00		average:   20.59
Episode:   697		score:   11.00		average:   20.59
Episode:   698		score:   13.00		average:   20.57
Episode:   699		score:   17.00		average:   20.46
Episode:   700		scor

KeyboardInterrupt: 