Cambiar GAE por discounted rewards normales y sin batch: los pesos se actualizan en cada episodio.

In [None]:
import os

import random
import gym
import pylab
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt

#tf.config.experimental_run_functions_eagerly(True)
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

# Training configuration shared by the networks and the agent below.
EPISODES = 10000 # Upper bound on the number of completed episodes before training stops
LR = 0.002 # Initial learning rate for both networks; the training loop decays it at runtime
NUM_ACTIONS = 2 # Number of discrete actions (size of the actor's softmax output)
NUM_STATES = 4 # Length of the observation vector fed to both networks
EPOCHS = 1 # Epochs passed to each Keras fit() call
BATCH_SIZE = 64 # Mini-batch size passed to each Keras fit() call
SHUFFLE = True # Whether fit() shuffles the samples of an episode
OPTIMIZER = Adam # Optimizer class used by both the actor and the critic
GAMMA = 0.99 # Discount factor applied when accumulating episode rewards
NORMALIZE = True # NOTE(review): not read by any code in this file; returns are always normalized

# Output file name for per-step evaluation scores.
# NOTE(review): NAME2 is not used by any active code in this file.
counter_file = 1
NAME2 = 'AC_cartpole_steps_' + str(counter_file) + '.txt'

# Policy network: maps an observation to a probability for each action
class Actor_Model:
    """Softmax policy network with a single hidden layer.

    Attributes:
        model: compiled Keras model taking a (batch, NUM_STATES) input and
            returning a (batch, NUM_ACTIONS) array of action probabilities.
    """

    def __init__(self):
        # Explicit tuple shape: one observation vector of length NUM_STATES
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(24, activation="relu", kernel_initializer='he_uniform')(X_input)

        # Softmax so the outputs form a probability distribution over actions
        output = Dense(NUM_ACTIONS, activation="softmax")(X)

        # Built-in categorical cross-entropy against the one-hot action taken;
        # the caller scales each sample through fit(sample_weight=...).
        # `learning_rate` is the supported keyword (`lr` is a deprecated alias).
        self.model = Model(inputs=X_input, outputs=output)
        self.model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER(learning_rate=LR))

# Value network: estimates the return expected from an observation
class Critic_Model:
    """State-value network with a single hidden layer.

    Attributes:
        model: compiled Keras model taking a (batch, NUM_STATES) input and
            returning a (batch, 1) array of value estimates.
    """

    def __init__(self):
        # Explicit tuple shape: one observation vector of length NUM_STATES
        X_input = Input(shape=(NUM_STATES,))

        X = Dense(24, activation="relu", kernel_initializer='he_uniform')(X_input)

        # Single linear unit: an unbounded scalar value estimate
        value = Dense(1)(X)

        # Regress on the target returns with mean squared error.
        # `learning_rate` is the supported keyword (`lr` is a deprecated alias).
        self.model = Model(inputs=X_input, outputs=value)
        self.model.compile(loss='mse', optimizer=OPTIMIZER(learning_rate=LR))

# Agent that couples the Actor (policy) and Critic (value) networks
class PPOAgent:
    """Actor-critic agent updated once per episode from discounted returns.

    The class and weight-file names say "PPO" for historical reasons; the
    update implemented here is a plain advantage-weighted cross-entropy step
    for the actor and an MSE regression step for the critic.

    Attributes:
        env_name: id passed to ``gym.make``.
        env: training environment.
        episode: number of completed training episodes.
        max_average: best 100-episode average score seen so far.
        scores_: score of every completed training episode.
        average_: kept for plotting; not filled by the code in this class.
        Actor, Critic: wrappers exposing the compiled Keras ``model``.
        Actor_name, Critic_name: weight-file names used by save()/load().
    """

    def __init__(self, env_name):
        # Environment parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.episode = 0  # completed episodes since start
        self.max_average = 0  # best 100-episode average reached

        # History used to plot the training process
        self.scores_, self.average_ = [], []

        # Actor-Critic network models
        self.Actor = Actor_Model()
        self.Critic = Critic_Model()

        # File names for the saved weights
        self.Actor_name = f"{self.env_name}_PPO_Actor.h5"
        self.Critic_name = f"{self.env_name}_PPO_Critic.h5"

    def act(self, state):
        """Sample an action from the actor's distribution for ``state``.

        Args:
            state: observation reshaped to (1, NUM_STATES).

        Returns:
            Tuple ``(action, action_onehot, prediction)``: the sampled action
            index, its one-hot encoding, and the actor's probability vector.
        """
        prediction = self.Actor.model.predict(state)[0]

        # Stochastic policy: sample according to the predicted probabilities
        action = np.random.choice(NUM_ACTIONS, p=prediction)
        action_onehot = np.zeros([NUM_ACTIONS])
        action_onehot[action] = 1
        return action, action_onehot, prediction

    def discount_rewards(self, reward, gamma=None):
        """Return the normalized discounted returns of one episode.

        Args:
            reward: per-step rewards of the episode, in time order.
            gamma: discount factor; defaults to the module-level GAMMA.

        Returns:
            float64 array of the same length as ``reward`` holding the
            discounted return of every step, shifted to zero mean and scaled
            to (approximately) unit standard deviation.
        """
        if gamma is None:
            gamma = GAMMA

        # Force a float buffer: zeros_like() would inherit an integer dtype
        # from integer rewards, truncating the returns and breaking the
        # in-place normalization below.
        discounted_r = np.zeros_like(reward, dtype=np.float64)
        if discounted_r.size == 0:
            return discounted_r  # nothing to discount or normalize

        running_add = 0.0
        for i in reversed(range(len(reward))):
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        # Normalize to reduce the variance of the training signal
        discounted_r -= np.mean(discounted_r)
        discounted_r /= (np.std(discounted_r) + 1e-8)
        return discounted_r

    def replay(self, states, actions, rewards, predictions, dones, next_states):
        """Run one training step of both networks on a finished episode.

        Args:
            states: list of (1, NUM_STATES) observations.
            actions: list of one-hot encoded actions taken.
            rewards: list of per-step rewards.
            predictions, dones, next_states: accepted for interface
                compatibility; not used by this update rule.
        """
        if len(states) == 0:
            return  # np.vstack would raise on an empty episode

        states = np.vstack(states)
        actions = np.vstack(actions)

        # Critic baseline for every visited state, flattened to shape (n,)
        values = self.Critic.model.predict(states).reshape(-1)

        discounted_rewards = self.discount_rewards(rewards)

        # Advantage = normalized return minus the critic's estimate
        advantages = discounted_rewards - values

        # The advantage weights the cross-entropy of the taken action
        self.Actor.model.fit(states, actions, sample_weight=advantages, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)
        self.Critic.model.fit(states, discounted_rewards, epochs=EPOCHS, verbose=0, shuffle=SHUFFLE, batch_size=BATCH_SIZE)

    def load(self):
        """Load actor and critic weights from the files written by save()."""
        # The Keras model lives in the wrapper's `model` attribute
        self.Actor.model.load_weights(self.Actor_name)
        self.Critic.model.load_weights(self.Critic_name)

    def save(self):
        """Write actor and critic weights to their weight files."""
        self.Actor.model.save_weights(self.Actor_name)
        self.Critic.model.save_weights(self.Critic_name)

    def test(self, n_episodes=10):
        """Play evaluation episodes on a fresh environment, without training.

        Args:
            n_episodes: number of episodes to average over.

        Returns:
            Mean episode score (0.0 when ``n_episodes`` is not positive).
        """
        env = gym.make(self.env_name)
        scores = []
        try:
            for _ in range(n_episodes):
                state = np.reshape(env.reset(), [1, NUM_STATES])
                done, score = False, 0
                while not done:
                    action, _, _ = self.act(state)
                    state, reward, done, _ = env.step(action)
                    state = np.reshape(state, [1, NUM_STATES])
                    score += reward
                scores.append(score)
        finally:
            env.close()
        return sum(scores) / len(scores) if scores else 0.0

    def _record_episode(self, score):
        """Log a finished episode and decay the learning rate on a new best."""
        global LR
        self.episode += 1
        self.scores_.append(score)

        # Before 100 episodes exist, report the mean over everything so far
        if self.episode >= 100:
            average = sum(self.scores_[-100:]) / 100
        else:
            average = sum(self.scores_) / len(self.scores_)
        print('Episode: {:>5}\t\tscore: {:>7.2f}\t\taverage: {:>7.2f}'.format(self.episode, score, average))

        if self.episode >= 100 and average > self.max_average:
            self.max_average = average
            if self.max_average > 150:
                self.save()
            # Shrink the step size each time the running average improves
            LR *= 0.999
            K.set_value(self.Actor.model.optimizer.learning_rate, LR)
            K.set_value(self.Critic.model.optimizer.learning_rate, LR)

    def run_batch(self, max_steps=14848):
        """Train the agent, updating both networks after every episode.

        Stops once more than ``max_steps`` environment steps have been taken
        or EPISODES episodes have completed. The module-level LR is decayed
        as a side effect (see _record_episode).

        Args:
            max_steps: environment-step budget for the whole run.
        """
        counter = 0
        state = np.reshape(self.env.reset(), [1, NUM_STATES])
        done, score = False, 0
        finished = False
        try:
            while not finished:
                # Fresh memory for the episode about to be played
                states, next_states, actions, rewards, predictions, dones = [], [], [], [], [], []
                while not done:
                    counter += 1
                    # Actor picks an action
                    action, action_onehot, prediction = self.act(state)
                    # Retrieve new state, reward, and whether it is terminal
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, [1, NUM_STATES])
                    # Memorize the transition for training
                    states.append(state)
                    next_states.append(next_state)
                    actions.append(action_onehot)
                    rewards.append(reward)
                    dones.append(done)
                    predictions.append(prediction)
                    # Update current state
                    state = next_state
                    score += reward

                    # Book-keep before the budget check so an episode that
                    # ends on the very last allowed step is still counted
                    if done:
                        self._record_episode(score)

                    if counter > max_steps:
                        finished = True
                        break

                self.replay(states, actions, rewards, predictions, dones, next_states)
                state, done, score = self.env.reset(), False, 0
                state = np.reshape(state, [1, NUM_STATES])
                if self.episode >= EPISODES:
                    break
        finally:
            # Release the environment even if training is interrupted
            self.env.close()
        
# Script entry point: build the agent, train it, and report the elapsed time.
if __name__ == "__main__":
    
    env_name = 'CartPole-v0'
    agent = PPOAgent(env_name)
    start = time.time()
    #agent.run() # disabled; NOTE(review): PPOAgent defines no run() in this file
    agent.run_batch() # train, updating both networks after every episode
    #agent.run_multiprocesses(num_worker = 8)  # disabled; NOTE(review): not defined in this file
    #agent.test() # disabled: evaluate the current policy without training
    print((time.time() - start)/60) # elapsed wall-clock time in minutes

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode:     1		score:   26.00		average:   26.00
Episode:     2		score:   43.00		average:   34.50
Episode:     3		score:   35.00		average:   34.67
Episode:     4		score:   40.00		average:   36.00
Episode:     5		score:   87.00		average:   46.20
Episode:     6		score:   19.00		average:   41.67
Episode:     7		score:   21.00		average:   38.71
Episode:     8		score:   24.00		average:   36.88
Episode:     9		score:   37.00		average:   36.89
Episode:    10		score:   17.00		average:   34.90
Episode:    11		score:   35.00		average:   34.91
Episode:    12		score:   76.00		average:   38.33
Episode:    13		score:   56.00		average:   39.69
Episode:    14		score:   32.00		average:   39.14
Episode:    15		score:   45.00		average:   39.53
Episode:    16		score:   62.00		average:   40.94
Episode:    17		score:   70.00		average:   42.65
Episode:    18		score:   41.00		average:   42.56
Episode:    19		score:   27.00		avera

Episode:   161		score:   60.00		average:   49.56
Episode:   162		score:   37.00		average:   49.64
Episode:   163		score:   49.00		average:   49.69
Episode:   164		score:   31.00		average:   49.36
Episode:   165		score:   67.00		average:   49.49
Episode:   166		score:   41.00		average:   49.13
Episode:   167		score:   76.00		average:   49.51
Episode:   168		score:   75.00		average:   49.61
Episode:   169		score:   34.00		average:   49.56
Episode:   170		score:   36.00		average:   49.47
Episode:   171		score:   35.00		average:   49.49
