Decrease the learning rate (LR)

In [None]:
import os

import random
import gym
import pylab
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt

#tf.config.experimental_run_functions_eagerly(True)
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

# Hyper-parameters shared by the actor, the critic and the training loop.
EPISODES = 100000 # Maximum number of episodes to run before training stops
LR = 0.0055 # Initial learning rate (the training loop decays it as the average score improves)
NUM_ACTIONS = 4 # Number of discrete actions available in the environment
NUM_STATES = 8 # Length of the state (observation) vector fed to the networks
EPOCHS = 10 # Epochs to train the networks on each collected batch (recommended between 3 and 30)
BATCH_SIZE = 64 # Mini-batch size used when fitting the neural nets
BUFFER_SIZE = 2048 # Number of environment steps collected before each training pass
SHUFFLE = True # Whether to shuffle the collected data while training
OPTIMIZER = Adam # Optimizer class used for both actor and critic
GAMMA = 0.99 # Discount factor used when computing discounted rewards
NORMALIZE = True # Whether to normalize GAE or not (not referenced in the code visible here)


# Create the actor used to select the action given a state
class Actor_Model:
    """Policy network: maps a state vector to a probability per action.

    The network is three 64-unit ReLU layers followed by a softmax over
    ``NUM_ACTIONS``. It is compiled with categorical cross-entropy, so
    fitting it against one-hot actions with per-sample weights scales each
    sample's log-likelihood gradient by its weight.

    Attributes:
        model: The compiled Keras ``Model``.
    """

    def __init__(self):
        # Input expects a shape tuple (without the batch dimension).
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X_input)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        # Softmax because the output is a probability distribution over actions
        output = Dense(NUM_ACTIONS, activation="softmax")(X)

        self.model = Model(inputs=X_input, outputs=output)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER(learning_rate=LR))

# Create the critic which will criticise how the actor is performing
class Critic_Model:
    """Value network: maps a state vector to a single scalar estimate.

    The network is three 64-unit ReLU layers followed by one linear output
    unit, compiled with mean-squared-error loss.

    Attributes:
        model: The compiled Keras ``Model``.
    """

    def __init__(self):
        # Input expects a shape tuple (without the batch dimension).
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X_input)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

        # Linear output: an unbounded scalar value estimate for the state
        value = Dense(1)(X)

        self.model = Model(inputs=X_input, outputs=value)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model.compile(loss='mse', optimizer=OPTIMIZER(learning_rate=LR))

# Combine both Actor and Critic to create the agent
class PPOAgent:
    """Actor-critic agent trained on fixed-size batches of experience.

    The agent steps the environment ``BUFFER_SIZE`` times, then fits the
    actor (advantage-weighted cross-entropy on the taken actions) and the
    critic (regression onto normalized discounted rewards) on that batch.
    """

    def __init__(self, env_name):
        """Create the environment and both networks.

        Args:
            env_name: Environment id passed to ``gym.make``; also used as the
                prefix of the weight file names.
        """
        # Environment parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.episode = 0  # number of episodes completed since start
        self.max_average = 0  # best 100-episode average reached so far

        # Per-episode scores, plotted once the task is solved
        self.scores_, self.average_ = [], []

        # Create Actor-Critic network models
        self.Actor = Actor_Model()
        self.Critic = Critic_Model()

        # File names used by save() / load()
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"

    def act(self, state):
        """Sample an action from the actor's predicted distribution.

        Args:
            state: State batch accepted by the actor model; only the first
                row of the prediction is used.

        Returns:
            Tuple ``(action, action_onehot, prediction)``: the sampled action
            index, its one-hot encoding of length ``NUM_ACTIONS``, and the
            probability vector it was sampled from.
        """
        prediction = self.Actor.model.predict(state)[0]

        # Stochastic policy: sample according to the predicted probabilities
        action = np.random.choice(NUM_ACTIONS, p=prediction)
        action_onehot = np.zeros([NUM_ACTIONS])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    def discount_rewards(self, reward, dones=None, gamma=None):
        """Compute normalized discounted returns for a sequence of rewards.

        Args:
            reward: Sequence of per-step rewards.
            dones: Optional sequence of per-step terminal flags, same length
                as ``reward``. When given, the running return is reset at
                each terminal step so returns do not leak from one episode
                into the previous one. When ``None``, the whole sequence is
                treated as a single trajectory.
            gamma: Discount factor; defaults to the module-level ``GAMMA``.

        Returns:
            1-D float64 array of discounted returns, shifted to zero mean and
            scaled by ``std + 1e-8``.
        """
        if gamma is None:
            gamma = GAMMA
        if dones is None:
            dones = [False] * len(reward)

        running_add = 0.0
        # Explicit float dtype so integer rewards do not break the in-place
        # normalization below.
        discounted_r = np.zeros(len(reward), dtype=np.float64)
        for i in reversed(range(len(reward))):
            if dones[i]:
                # Step i ends an episode: nothing after it belongs to it.
                running_add = 0.0
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r)  # center
        discounted_r /= (np.std(discounted_r) + 1e-8)  # scale; epsilon avoids division by zero
        return discounted_r

    def replay(self, states, actions, rewards, predictions, dones, next_states):
        """Train the actor and the critic on one collected batch.

        Args:
            states: List of state arrays, stacked row-wise for training.
            actions: List of one-hot action vectors.
            rewards: List of per-step rewards.
            predictions: Actor outputs recorded at collection time (unused
                here; kept for interface compatibility).
            dones: List of per-step terminal flags, used to reset the
                discounted return at episode boundaries.
            next_states: Successor states (unused here; kept for interface
                compatibility).
        """
        # Reshape memory to appropriate shape for training
        states = np.vstack(states)
        actions = np.vstack(actions)

        # Critic estimate for every visited state, flattened to 1-D
        values = self.Critic.model.predict(states)
        values = np.reshape(values, (np.shape(values)[0],))

        discounted_rewards = self.discount_rewards(rewards, dones)

        # Advantage = normalized return minus the critic's estimate
        advantages = np.reshape(discounted_rewards - values, (-1,))

        # Training Actor and Critic networks
        self.Actor.model.fit(states, actions, sample_weight=advantages, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)
        self.Critic.model.fit(states, discounted_rewards, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)

    def load(self):
        """Load actor and critic weights from the files written by save()."""
        self.Actor.model.load_weights(self.Actor_name)
        self.Critic.model.load_weights(self.Critic_name)

    def save(self):
        """Write actor and critic weights to their ``.h5`` files."""
        self.Actor.model.save_weights(self.Actor_name)
        self.Critic.model.save_weights(self.Critic_name)

    def run_batch(self):
        """Collect experience and train until solved or ``EPISODES`` is reached.

        Each pass gathers ``BUFFER_SIZE`` steps (possibly spanning several
        episodes) and calls replay() on them. Once 100 episodes exist, a new
        best 100-episode average decays the global ``LR`` by 1% and, above
        150, saves the weights. An average above 200 plots the scores and
        ends training after one final replay().
        """
        global LR
        state = self.env.reset()
        state = np.reshape(state, [1, NUM_STATES])
        done, score = False, 0
        finished = False
        while not finished:
            # Instantiate or reset games memory
            states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
            for _ in range(BUFFER_SIZE):
                # Actor picks an action
                action, action_onehot, prediction = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.env.step(action)
                # Memorize the transition for training
                states.append(state)
                next_states.append(np.reshape(next_state, [1, NUM_STATES]))
                actions.append(action_onehot)
                rewards.append(reward)
                dones.append(done)
                predictions.append(prediction)
                # Update current state
                state = np.reshape(next_state, [1, NUM_STATES])
                score += reward
                if done:
                    self.episode += 1
                    self.scores_.append(score)
                    if self.episode >= 100:
                        average = sum(self.scores_[-100:])/100
                        print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, average))
                        if average > self.max_average:
                            self.max_average = average
                            if self.max_average > 150:
                                self.save()
                            # Decay the learning rate on every new best average
                            LR *= 0.99
                            K.set_value(self.Actor.model.optimizer.learning_rate, LR)
                            K.set_value(self.Critic.model.optimizer.learning_rate, LR)

                        if average > 200:
                            # Solved: plot the score history and stop collecting
                            plt.plot(self.scores_)
                            plt.xlabel("Episode")
                            plt.ylabel("Score")
                            finished = True
                            break

                    else:
                        # Fewer than 100 episodes so far: average over all of them
                        print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, sum(self.scores_)/len(self.scores_)))

                    state, done, score = self.env.reset(), False, 0
                    state = np.reshape(state, [1, NUM_STATES])

            self.replay(states, actions, rewards, predictions, dones, next_states)
            if self.episode >= EPISODES:
                break
        self.env.close()
                    
if __name__ == "__main__":
    # Train on LunarLander in batch mode and report the elapsed wall-clock
    # time in minutes once training ends.
    start = time.time()
    env_name = 'LunarLander-v2'
    agent = PPOAgent(env_name)
    agent.run_batch()
    print((time.time() - start)/60)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode:     1		score: -548.49		average: -548.49
Episode:     2		score: -183.40		average: -365.95
Episode:     3		score: -109.52		average: -280.47
Episode:     4		score: -169.20		average: -252.65
Episode:     5		score:    4.80		average: -201.16
Episode:     6		score: -272.24		average: -213.01
Episode:     7		score: -122.58		average: -200.09
Episode:     8		score:  -80.49		average: -185.14
Episode:     9		score: -162.93		average: -182.67
Episode:    10		score:  -87.83		average: -173.19
Episode:    11		score: -146.37		average: -170.75
Episode:    12		score:  -76.59		average: -162.90
Episode:    13		score: -191.78		average: -165.13
Episode:    14		score: -189.00		average: -166.83
Episode:    15		score: -100.56		average: -162.41
Episode:    16		score:  -89.03		average: -157.83
Episode:    17		score: -595.47		average: -183.57
Episode:    18		score:  -45.24		average: -175.89
Episode:    19		score: -405.86		avera

Episode:   161		score: -159.10		average: -126.60
Episode:   162		score: -145.40		average: -126.23
Episode:   163		score: -112.80		average: -125.90
Episode:   164		score: -146.69		average: -125.85
Episode:   165		score: -112.21		average: -125.79
Episode:   166		score: -148.79		average: -126.17
Episode:   167		score: -159.67		average: -126.92
Episode:   168		score: -135.66		average: -126.97
Episode:   169		score: -129.77		average: -126.87
Episode:   170		score: -183.46		average: -127.07
Episode:   171		score: -124.47		average: -128.28
Episode:   172		score: -117.95		average: -127.74
Episode:   173		score: -156.87		average: -128.04
Episode:   174		score: -106.06		average: -128.00
Episode:   175		score: -145.76		average: -128.11
Episode:   176		score: -201.19		average: -128.57
Episode:   177		score: -165.57		average: -128.93
Episode:   178		score: -136.58		average: -128.90
Episode:   179		score: -122.34		average: -128.68
Episode:   180		score: -147.41		average: -128.44
Episode:   181		scor

Episode:   330		score: -132.66		average: -126.70
Episode:   331		score: -107.96		average: -126.39
Episode:   332		score: -151.47		average: -126.84
Episode:   333		score: -110.96		average: -126.37
Episode:   334		score: -157.03		average: -126.61
Episode:   335		score:    4.40		average: -125.25
Episode:   336		score: -167.37		average: -125.57
Episode:   337		score: -224.15		average: -126.63
Episode:   338		score:  -13.86		average: -125.37
Episode:   339		score: -138.64		average: -125.60
Episode:   340		score: -103.77		average: -125.13
Episode:   341		score:  -96.31		average: -124.58
Episode:   342		score: -159.11		average: -125.01
Episode:   343		score: -141.68		average: -126.37
Episode:   344		score: -126.57		average: -126.38
Episode:   345		score: -171.84		average: -126.64
Episode:   346		score: -134.86		average: -126.85
Episode:   347		score: -161.10		average: -127.55
Episode:   348		score: -188.96		average: -127.97
Episode:   349		score:  -62.26		average: -127.41
Episode:   350		scor

Episode:   499		score:   45.61		average: -135.23
Episode:   500		score: -130.50		average: -134.62
Episode:   501		score: -135.56		average: -134.98
Episode:   502		score: -134.72		average: -135.11
Episode:   503		score: -149.28		average: -135.42
Episode:   504		score: -142.37		average: -134.70
Episode:   505		score: -141.18		average: -134.90
Episode:   506		score: -120.36		average: -135.08
Episode:   507		score: -183.11		average: -135.53
Episode:   508		score: -140.19		average: -135.86
Episode:   509		score: -159.26		average: -136.32
Episode:   510		score: -140.54		average: -135.79
Episode:   511		score: -137.07		average: -135.37
Episode:   512		score:  -24.69		average: -133.01
Episode:   513		score:  -55.62		average: -132.16
Episode:   514		score: -136.74		average: -131.95
Episode:   515		score: -136.63		average: -131.75
Episode:   516		score: -117.06		average: -131.98
Episode:   517		score: -183.17		average: -132.65
Episode:   518		score: -127.03		average: -132.69
Episode:   519		scor

Episode:   668		score: -165.96		average: -130.27
Episode:   669		score: -124.36		average: -129.86
Episode:   670		score: -118.26		average: -129.35
Episode:   671		score: -106.18		average: -130.49
Episode:   672		score: -121.79		average: -130.31
Episode:   673		score: -145.09		average: -130.14
Episode:   674		score: -130.82		average: -129.75
Episode:   675		score:  -30.72		average: -128.12
Episode:   676		score: -127.39		average: -127.62
Episode:   677		score: -117.20		average: -128.82
Episode:   678		score: -143.57		average: -128.71
Episode:   679		score:  -91.71		average: -128.38
Episode:   680		score: -166.02		average: -128.84
Episode:   681		score:  -79.03		average: -128.19
Episode:   682		score: -130.93		average: -128.11
Episode:   683		score: -112.23		average: -127.41
Episode:   684		score: -127.78		average: -127.39
Episode:   685		score:  -86.69		average: -126.84
Episode:   686		score: -125.94		average: -126.76
Episode:   687		score: -154.74		average: -127.23
Episode:   688		scor

Episode:   836		score: -144.98		average: -131.07
Episode:   837		score: -138.57		average: -131.17
Episode:   838		score: -151.81		average: -131.30
Episode:   839		score: -114.66		average: -131.22
Episode:   840		score: -132.41		average: -130.99
Episode:   841		score: -122.87		average: -130.88
Episode:   842		score: -132.77		average: -129.38
Episode:   843		score: -165.12		average: -129.48
Episode:   844		score: -114.92		average: -128.76
Episode:   845		score: -124.55		average: -128.55
Episode:   846		score: -147.02		average: -129.65
Episode:   847		score: -115.00		average: -129.69
Episode:   848		score: -125.04		average: -129.55
Episode:   849		score:  -98.03		average: -129.37
Episode:   850		score: -173.86		average: -129.74
Episode:   851		score: -156.49		average: -131.29
Episode:   852		score: -144.40		average: -131.61
Episode:   853		score: -144.67		average: -131.88
Episode:   854		score: -144.05		average: -131.50
Episode:   855		score: -112.40		average: -130.93
Episode:   856		scor

Episode:  1004		score: -107.20		average: -131.50
Episode:  1005		score: -130.50		average: -131.49
Episode:  1006		score: -187.39		average: -131.58
Episode:  1007		score: -134.45		average: -131.54
Episode:  1008		score: -159.99		average: -132.14
Episode:  1009		score: -118.33		average: -132.67
Episode:  1010		score: -118.91		average: -132.87
Episode:  1011		score: -108.51		average: -132.17
Episode:  1012		score: -129.43		average: -132.34
Episode:  1013		score: -175.02		average: -132.84
Episode:  1014		score: -116.30		average: -133.16
Episode:  1015		score: -130.02		average: -132.45
Episode:  1016		score: -176.05		average: -133.17
Episode:  1017		score: -150.93		average: -133.25
Episode:  1018		score: -143.77		average: -133.47
Episode:  1019		score: -132.26		average: -131.71
Episode:  1020		score: -122.93		average: -132.80
Episode:  1021		score:  -41.22		average: -132.20
Episode:  1022		score: -212.46		average: -133.09
Episode:  1023		score: -180.69		average: -133.67
Episode:  1024		scor