In [1]:
import os

import random
import gym
import pylab
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt

#tf.config.experimental_run_functions_eagerly(True)
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

EPISODES = 10000 # Maximum number of episodes to run before training stops
LOSS_CLIPPING = 0.2 # PPO clipping range: the probability ratio is clipped to [1 - 0.2, 1 + 0.2]
ENTROPY_LOSS = 0.001 # Coefficient applied to the entropy bonus in the actor loss
LR = 0.0003 # Initial learning rate (decayed in run_batch whenever the average score improves)
NUM_ACTIONS = 4 # Number of discrete actions in the environment
NUM_STATES = 8 # Length of the state (observation) vector fed to the networks
EPOCHS = 10 # Epochs to train the networks on each collected buffer (recommended between 3 and 30)
BATCH_SIZE = 512 # Mini-batch size for the neural nets
BUFFER_SIZE = 2048 # Number of environment steps collected before each training update
SHUFFLE = True # Whether to shuffle data or not while training
OPTIMIZER = Adam # Optimizer class used for both actor and critic
GAMMA = 0.99 # Discount factor applied to future rewards
LAMBDA = 0.95 # Smoothing factor of the Generalized Advantage Estimation (GAE)
NORMALIZE = True # Whether to normalize the GAE advantages or not



# Create the actor used to select an action given a state
class Actor_Model:
    """Policy network mapping a state vector to a softmax distribution over actions.

    The compiled Keras model is exposed as ``self.model`` and is trained with
    the clipped PPO surrogate loss defined in :meth:`ppo_loss`.
    """

    def __init__(self):
        # Keras expects the per-sample shape as a tuple (batch dimension excluded)
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X_input)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        # Softmax because the output is a probability for each discrete action
        output = Dense(NUM_ACTIONS, activation="softmax")(X)

        # Compile the model with the custom loss.
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model = Model(inputs=X_input, outputs=output)
        self.model.compile(loss=self.ppo_loss, optimizer=OPTIMIZER(learning_rate=LR))

    # Custom loss function for the PPO
    def ppo_loss(self, y_true, y_pred):
        """Clipped PPO surrogate loss with an entropy bonus.

        Args:
            y_true: packed tensor whose columns are
                ``[advantage | old action probabilities | one-hot action]``,
                i.e. 1 + NUM_ACTIONS + NUM_ACTIONS columns.
            y_pred: action probabilities produced by the current policy.

        Returns:
            Scalar tensor: ``-(mean clipped surrogate) - ENTROPY_LOSS * mean entropy``.
        """
        # Unpack the elements packed into the "true label"
        advantages = y_true[:, :1]
        old_predictions = y_true[:, 1:1 + NUM_ACTIONS]
        actions = y_true[:, 1 + NUM_ACTIONS:]

        # The one-hot mask keeps only the probability of the action that was taken.
        # NOTE: the masked-out slots are 0 in both tensors, so their ratio below is
        # exp(log(1e-10) - log(1e-10)) = 1; they add a gradient-free term to the
        # mean, which is taken over every (sample, action) element.
        prob = actions * y_pred
        old_prob = actions * old_predictions

        # Probability ratio computed in log space; 1e-10 avoids log(0)
        ratio = K.exp(K.log(prob + 1e-10) - K.log(old_prob + 1e-10))

        unclipped = ratio * advantages
        clipped = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

        # Pessimistic (minimum) surrogate, negated because Keras minimises the loss
        actor_loss = -K.mean(K.minimum(unclipped, clipped))

        # Entropy bonus encourages exploration
        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = ENTROPY_LOSS * K.mean(entropy)

        return actor_loss - entropy

# Create the critic which will criticise how the actor is performing
class Critic_Model:
    """Value network mapping a state vector to a single scalar value estimate.

    The compiled Keras model is exposed as ``self.model`` and is trained with
    mean squared error against the targets supplied by the caller.
    """

    def __init__(self):
        # Keras expects the per-sample shape as a tuple (batch dimension excluded)
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X_input)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        # Linear (no activation) output: the estimated value of the state
        value = Dense(1)(X)

        # Compile it with MSE loss and the shared optimizer class.
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model = Model(inputs=X_input, outputs=value)
        self.model.compile(loss='mse', optimizer=OPTIMIZER(learning_rate=LR))

# Combine both Actor and Critic to create the agent
class PPOAgent:
    """PPO agent owning the gym environment, the actor and the critic.

    Training is driven by :meth:`run_batch`, which alternates between collecting
    BUFFER_SIZE environment steps and fitting both networks on them.
    """

    def __init__(self, env_name):
        """Create the environment named ``env_name`` and both networks."""
        # Environment parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.episode = 0 # number of episodes finished since start
        self.max_average = 0 # best 100-episode average reached so far

        # Used to plot a graph of the training process
        self.scores_, self.average_ = [], []

        # Create Actor-Critic network models
        self.Actor = Actor_Model()
        self.Critic = Critic_Model()

        # File names used by save() / load()
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"

    # Get the action given the current state
    def act(self, state):
        """Sample an action from the actor's predicted distribution.

        Args:
            state: batch holding a single state (run_batch passes shape [1, NUM_STATES]).

        Returns:
            Tuple ``(action, action_onehot, prediction)``: the sampled action index,
            its one-hot encoding, and the probabilities it was sampled from.
        """
        # Probabilities predicted by the actor for the single state in the batch
        prediction = self.Actor.model.predict(state)[0]

        # Sample the action according to those probabilities
        action = np.random.choice(NUM_ACTIONS, p=prediction)
        action_onehot = np.zeros([NUM_ACTIONS])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    # Generalized Advantage Estimation implemented in the original paper
    def get_gaes(self, rewards, dones, values, next_values):
        """Compute GAE advantages and the critic's regression targets.

        Args:
            rewards: per-step rewards.
            dones: per-step terminal flags; a true flag stops bootstrapping and
                stops the advantage from leaking across the episode boundary.
            values: critic estimates for each state (1-D).
            next_values: critic estimates for each next state (1-D).

        Returns:
            Tuple ``(advantages, targets)`` of column arrays. Advantages are
            normalised when NORMALIZE is set; targets are the un-normalised
            advantages plus ``values``.
        """
        # TD residuals; (1 - d) removes the bootstrap term on terminal steps
        deltas = [r + GAMMA * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]

        # Convert list to array as .mean() and .std() are used later
        deltas = np.stack(deltas)
        gaes = deltas.copy()

        # Backward accumulation of discounted residuals
        for t in reversed(range(len(deltas) - 1)):
            gaes[t] = gaes[t] + (1 - dones[t]) * GAMMA * LAMBDA * gaes[t + 1]

        # Targets must be computed before the advantages are normalised
        target = gaes + values
        if NORMALIZE:
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def replay(self, states, actions, rewards, predictions, dones, next_states):
        """Fit the actor and the critic on one buffer of collected transitions.

        All arguments are per-step lists filled by run_batch. Prints the wall-clock
        time spent in the two ``fit`` calls.
        """
        # Reshape memory to appropriate shape for training
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)

        # Get Critic network predictions for state and next state.
        # Flatten with reshape(-1) rather than np.squeeze: squeeze would turn a
        # single-transition buffer into a 0-d array that get_gaes cannot iterate.
        values = np.reshape(self.Critic.model.predict(states), -1)
        next_values = np.reshape(self.Critic.model.predict(next_states), -1)

        # Get the advantage
        advantages, target = self.get_gaes(rewards, dones, values, next_values)

        # Stack info so the custom loss can unpack it:
        # [advantage | old probabilities | one-hot action]
        y_true = np.hstack([advantages, predictions, actions])

        start_train = time.time()
        # Training Actor and Critic networks
        self.Actor.model.fit(states, y_true, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)
        self.Critic.model.fit(states, target, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)
        print('Time: ', time.time()-start_train)

    def load(self):
        """Load actor and critic weights from the files written by save()."""
        # The Keras model lives in the `.model` attribute of each wrapper
        self.Actor.model.load_weights(self.Actor_name)
        self.Critic.model.load_weights(self.Critic_name)

    def save(self):
        """Write actor and critic weights to their respective files."""
        self.Actor.model.save_weights(self.Actor_name)
        self.Critic.model.save_weights(self.Critic_name)

    def run_batch(self):
        """Train by alternating BUFFER_SIZE-step rollouts with network updates.

        Stops once the 100-episode average exceeds 200 (the scores are then
        plotted) or once EPISODES episodes have been played; the environment is
        closed on exit.

        Side effect: multiplies the module-level ``LR`` by 0.95 every time the
        100-episode average sets a new record, and pushes the new value into
        both optimizers.
        """
        global LR
        state = np.reshape(self.env.reset(), [1, NUM_STATES])
        score = 0
        finished = False
        while not finished:
            # Instantiate or reset the rollout memory
            states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
            for _step in range(BUFFER_SIZE):
                #self.env.render()
                # Actor picks an action
                action, action_onehot, prediction = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, NUM_STATES])
                # Memorize the transition for training
                states.append(state)
                next_states.append(next_state)
                actions.append(action_onehot)
                rewards.append(reward)
                dones.append(done)
                predictions.append(prediction)
                # Update current state
                state = next_state
                score += reward
                if done:
                    self.episode += 1
                    self.scores_.append(score)
                    if self.episode >= 100:
                        # Rolling average over the last 100 episodes
                        average = sum(self.scores_[-100:])/100
                        print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, average))
                        if average > self.max_average:
                            self.max_average = average
                            # Only checkpoint once the agent is reasonably good
                            if self.max_average > 150:
                                self.save()
                            # Decay the learning rate on every new record
                            LR *= 0.95
                            K.set_value(self.Actor.model.optimizer.learning_rate, LR)
                            K.set_value(self.Critic.model.optimizer.learning_rate, LR)

                        if average > 200:
                            # Target average reached: plot the scores and stop collecting
                            plt.plot(self.scores_)
                            plt.xlabel("Episode")
                            plt.ylabel("Score")
                            finished = True
                            break

                    else:
                        # Fewer than 100 episodes so far: average over all of them
                        print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, sum(self.scores_)/len(self.scores_)))

                    # Start a new episode
                    state = np.reshape(self.env.reset(), [1, NUM_STATES])
                    score = 0

            # Train on whatever was collected (a partial buffer after an early stop)
            self.replay(states, actions, rewards, predictions, dones, next_states)
            if self.episode >= EPISODES:
                break
        self.env.close()
                    
if __name__ == "__main__":
    # Train a PPO agent on LunarLander and report the elapsed wall-clock minutes
    start = time.time()

    env_name = 'LunarLander-v2'
    agent = PPOAgent(env_name)
    # Collect a buffer of steps, train on it, and repeat until training stops
    agent.run_batch()

    elapsed_minutes = (time.time() - start) / 60
    print(elapsed_minutes)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode:     1		score: -272.58		average: -272.58
Episode:     2		score: -198.91		average: -235.75
Episode:     3		score: -282.14		average: -251.21
Episode:     4		score: -192.35		average: -236.50
Episode:     5		score: -310.35		average: -251.27
Episode:     6		score: -458.99		average: -285.89
Episode:     7		score: -309.59		average: -289.27
Episode:     8		score:   97.74		average: -240.90
Episode:     9		score: -366.04		average: -254.80
Episode:    10		score: -112.57		average: -240.58
Episode:    11		score: -205.83		average: -237.42
Episode:    12		score: -453.21		average: -255.40
Episode:    13		score: -254.02		average: -255.30
Episode:    14		score: -263.17		average: -255.86
Time:  0.6846201419830322
Episode:    15		score: -224.14		average: -253.74
Episode:    16		score: -162.91		average: -248.07
Episode:    17		score: -164.45		average: -243.15
Episode:    18		score: -316.04		average: -247.20
Episode:   

Time:  0.1880331039428711
Episode:   158		score:  -63.75		average: -107.44
Episode:   159		score:  -69.28		average: -106.36
Episode:   160		score:  -82.46		average: -105.87
Episode:   161		score:  -82.56		average: -102.89
Episode:   162		score:  -96.49		average: -102.64
Episode:   163		score: -159.83		average: -103.11
Episode:   164		score: -194.75		average: -105.25
Episode:   165		score:  -49.48		average: -104.21
Episode:   166		score:  -61.00		average: -103.53
Episode:   167		score:  -89.73		average: -103.36
Episode:   168		score:  -72.28		average: -103.95
Episode:   169		score:  -42.74		average: -103.74
Episode:   170		score: -125.29		average: -103.90
Episode:   171		score:  -37.25		average: -102.89
Episode:   172		score:  -73.24		average: -102.19
Episode:   173		score:  -94.05		average: -101.55
Episode:   174		score:  -73.78		average: -101.27
Episode:   175		score: -109.21		average: -101.24
Episode:   176		score:  -92.25		average: -100.65
Episode:   177		score:  -79.75		average: -1

Episode:   316		score:   40.47		average:  -64.89
Episode:   317		score:  -95.67		average:  -65.03
Episode:   318		score:  101.26		average:  -63.36
Time:  0.19503426551818848
Episode:   319		score: -124.17		average:  -63.94
Episode:   320		score:  -78.21		average:  -64.16
Time:  0.1815319061279297
Episode:   321		score:   19.14		average:  -63.24
Episode:   322		score: -209.82		average:  -64.77
Time:  0.18303203582763672
Episode:   323		score:  -32.34		average:  -64.67
Episode:   324		score:   53.56		average:  -63.70
Time:  0.19403433799743652
Episode:   325		score:  -98.51		average:  -63.86
Episode:   326		score:   65.87		average:  -63.03
Episode:   327		score: -118.19		average:  -63.96
Time:  0.185532808303833
Episode:   328		score:    5.28		average:  -62.98
Episode:   329		score: -163.17		average:  -63.05
Time:  0.1835322380065918
Episode:   330		score:   40.63		average:  -62.13
Episode:   331		score:   79.83		average:  -60.96
Time:  0.18253207206726074
Episode:   332		score:  -84.68	

Episode:   452		score:  -93.40		average:  -69.86
Time:  0.18453264236450195
Episode:   453		score:  -30.16		average:  -69.38
Episode:   454		score: -102.31		average:  -70.02
Episode:   455		score:  -99.47		average:  -70.49
Episode:   456		score: -190.98		average:  -71.89
Time:  0.18053174018859863
Episode:   457		score:  -16.59		average:  -71.58
Episode:   458		score:  -56.01		average:  -72.02
Time:  0.18153142929077148
Episode:   459		score:   34.53		average:  -71.43
Episode:   460		score:   -9.12		average:  -70.85
Episode:   461		score:  -98.90		average:  -71.60
Episode:   462		score: -132.69		average:  -72.08
Time:  0.18603253364562988
Episode:   463		score:  -81.34		average:  -72.41
Episode:   464		score:  -17.40		average:  -71.99
Time:  0.18453264236450195
Episode:   465		score:  -66.01		average:  -72.27
Episode:   466		score:    9.21		average:  -71.50
Time:  0.1790316104888916
Episode:   467		score: -244.53		average:  -73.23
Episode:   468		score:  -83.60		average:  -73.08
Time: 

Episode:   587		score:   82.04		average:  -18.97
Episode:   588		score:   85.08		average:  -17.81
Time:  0.1770312786102295
Episode:   589		score:   20.90		average:  -16.77
Episode:   590		score:   62.94		average:  -16.07
Time:  0.18253207206726074
Episode:   591		score:   63.22		average:  -15.22
Episode:   592		score: -118.83		average:  -16.59
Episode:   593		score:  128.54		average:  -14.81
Time:  0.17603087425231934
Episode:   594		score:   63.42		average:  -13.83
Episode:   595		score: -105.63		average:  -13.99
Episode:   596		score:  -98.22		average:  -14.01
Time:  0.18653273582458496
Episode:   597		score:   28.60		average:  -13.40
Episode:   598		score:   46.79		average:  -12.95
Episode:   599		score: -105.99		average:  -14.00
Time:  0.18853306770324707
Episode:   600		score:  -25.16		average:  -13.34
Episode:   601		score:  -89.52		average:  -14.22
Episode:   602		score: -112.05		average:  -15.21
Time:  0.1815319061279297
Episode:   603		score:  142.08		average:  -12.90
Episode

Time:  0.1745307445526123
Episode:   726		score: -209.56		average:  -19.30
Episode:   727		score:   37.86		average:  -20.58
Episode:   728		score:    5.72		average:  -19.08
Time:  0.18453264236450195
Episode:   729		score:  -73.13		average:  -21.04
Episode:   730		score:  -69.56		average:  -20.92
Time:  0.18303227424621582
Episode:   731		score:  -32.07		average:  -20.40
Episode:   732		score: -137.80		average:  -22.42
Episode:   733		score:  116.86		average:  -20.49
Time:  0.18353271484375
Episode:   734		score:  -27.56		average:  -21.03
Episode:   735		score:  -84.80		average:  -22.60
Time:  0.18253231048583984
Episode:   736		score:  186.84		average:  -21.04
Episode:   737		score:   53.44		average:  -20.56
Time:  0.1810321807861328
Episode:   738		score:  -79.70		average:  -19.30
Episode:   739		score:    4.88		average:  -18.46
Episode:   740		score:  -82.74		average:  -18.47
Time:  0.18453264236450195
Episode:   741		score:    8.96		average:  -18.95
Episode:   742		score:  181.08		

Episode:   863		score:   46.52		average:    5.67
Time:  0.1810319423675537
Episode:   864		score:   70.50		average:    6.95
Episode:   865		score:  136.21		average:    8.84
Time:  0.17203021049499512
Episode:   866		score: -232.52		average:    8.08
Episode:   867		score:   64.69		average:    9.81
Time:  0.18203186988830566
Episode:   868		score:  110.20		average:   10.30
Episode:   869		score:   90.43		average:   12.33
Episode:   870		score:  -63.12		average:   12.47
Time:  0.1795334815979004
Episode:   871		score: -104.53		average:   12.19
Episode:   872		score:  -40.05		average:   11.35
Time:  0.18753314018249512
Episode:   873		score:   78.78		average:   12.81
Episode:   874		score:  -39.00		average:   11.64
Episode:   875		score:  -91.05		average:   11.38
Time:  0.1770312786102295
Episode:   876		score:  -72.20		average:   10.90
Episode:   877		score:   50.04		average:   12.12
Time:  0.18253183364868164
Episode:   878		score:   -6.86		average:   11.04
Episode:   879		score:  171.32

KeyboardInterrupt: 

Note: with a larger batch size, training does not converge.