# In [1]:
import os

import random
import gym
import pylab
import numpy as np
import tensorflow as tf
import time
import matplotlib.pyplot as plt

#tf.config.experimental_run_functions_eagerly(True)
#tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import copy

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp
tf.keras.backend.set_floatx('float64')

# Hyperparameters for the PPO benchmark below.
EPISODES = 100 # Number of times the environment is run (loop bound inside each timed pass)
LOSS_CLIPPING = 0.2 # Clipping range for the probability ratio; approximate value stated in the original paper

LR = 0.0003 # Learning rate
NUM_ACTIONS = 4 # Number of possible actions in the environment
NUM_STATES = 8 # Size of the state vector fed to the networks
EPOCHS = 10 # Epochs to train the networks per update (recommended between 3 and 30)
BATCH_SIZE = 64 # Mini-batch size for the neural nets
BUFFER_SIZE = 256 # Number of experiences held in the buffer
OPTIMIZER = Adam # Optimizer for both actor and critic (not referenced in the code visible here)
GAMMA = 0.99 # Discount factor for the estimated reward (not referenced in the code visible here)
LAMBDA = 0.95 # GAE parameter from the original paper (not referenced in the code visible here)



class ActorNetwork(keras.Model):
    """Policy network: two ReLU hidden layers and a softmax output layer.

    Args:
        n_actions: Number of units in the softmax output layer.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        super().__init__()

        # Both hidden layers draw their kernels from a normal distribution
        # with a standard deviation of 0.01.
        self.fc1 = Dense(
            fc1_dims,
            activation='relu',
            kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        )
        self.fc2 = Dense(
            fc2_dims,
            activation='relu',
            kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        )
        self.fc3 = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Return the softmax output of the network for ``state``."""
        hidden = self.fc2(self.fc1(state))
        return self.fc3(hidden)


class CriticNetwork(keras.Model):
    """Value network: two ReLU hidden layers and a single linear output unit.

    Args:
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
    """

    def __init__(self, fc1_dims=32, fc2_dims=32):
        super().__init__()
        # Hidden layers use the 'he_uniform' kernel initializer.
        self.fc1 = Dense(
            fc1_dims, activation='relu', kernel_initializer='he_uniform'
        )
        self.fc2 = Dense(
            fc2_dims, activation='relu', kernel_initializer='he_uniform'
        )
        # One unit with no activation: the raw value estimate.
        self.q = Dense(1, activation=None)

    def call(self, state):
        """Return the output of the linear head for ``state``."""
        hidden = self.fc2(self.fc1(state))
        return self.q(hidden)

# Combine both Actor and Critic to create the agent
class PPOAgent:
    """Actor-critic agent with a timing benchmark of the clipped PPO update.

    The agent owns a Gym environment plus separate actor and critic networks,
    each compiled with its own Adam optimizer. ``test`` exercises the update
    step on randomly generated data and prints how long each pass takes.
    """

    def __init__(self, env_name):
        """Create the environment and both networks.

        Args:
            env_name: Environment id passed to ``gym.make``.
        """
        # Environment parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.episode = 0  # number of episodes since start
        self.max_average = 0  # best average recorded so far

        # Histories used to plot a graph of the training process
        self.scores_, self.average_ = [], []

        # Create Actor-Critic network models
        self.actor = ActorNetwork(NUM_ACTIONS)
        self.actor.compile(optimizer=Adam(learning_rate=LR))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=LR))

    def _update_on_batch(self, states, actions, old_log_probs, advantages, returns):
        """Apply one clipped-PPO gradient step to the actor and the critic.

        Args:
            states: Array of shape (batch, NUM_STATES).
            actions: Integer action indices, shape (batch,).
            old_log_probs: Log-probabilities of ``actions`` under the policy
                that collected the data, shape (batch,).
            advantages: Advantage estimates, shape (batch,).
            returns: Critic regression targets, shape (batch,).
        """
        states = tf.convert_to_tensor(states)
        actions = tf.convert_to_tensor(actions)
        old_log_probs = tf.convert_to_tensor(old_log_probs)
        advantages = tf.convert_to_tensor(advantages)
        returns = tf.convert_to_tensor(returns)

        # One persistent tape records both forward passes so that two
        # separate gradient calls can be made afterwards.
        with tf.GradientTape(persistent=True) as tape:
            probs = self.actor(states)
            # The actor outputs softmax probabilities. Categorical's first
            # positional argument is `logits`, so `probs` must be passed by
            # keyword or the distribution would be built from the wrong values.
            dist = tfp.distributions.Categorical(probs=probs)
            new_log_probs = dist.log_prob(actions)

            # Drop the trailing unit axis so the critic output is (batch,).
            critic_value = tf.squeeze(self.critic(states), 1)

            # Clipped surrogate objective.
            prob_ratio = tf.exp(new_log_probs - old_log_probs)
            clipped_ratio = tf.clip_by_value(
                prob_ratio, 1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING
            )
            surrogate = tf.math.minimum(
                advantages * prob_ratio, advantages * clipped_ratio
            )
            actor_loss = -tf.math.reduce_mean(surrogate)

            # Both arguments are (batch,), so this reduces to a scalar.
            critic_loss = keras.losses.MSE(returns, critic_value)

        actor_params = self.actor.trainable_variables
        critic_params = self.critic.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        critic_grads = tape.gradient(critic_loss, critic_params)
        # A persistent tape keeps its resources until it is dropped.
        del tape

        self.actor.optimizer.apply_gradients(zip(actor_grads, actor_params))
        self.critic.optimizer.apply_gradients(zip(critic_grads, critic_params))

    def test(self):
        """Time the PPO update on synthetic data.

        Runs 10 timed passes. Each pass performs EPISODES * EPOCHS sweeps over
        a buffer of BUFFER_SIZE random samples in shuffled mini-batches of
        BATCH_SIZE, then prints the elapsed time of the pass in minutes.
        """
        # Synthetic buffer contents; only shapes and dtypes matter here.
        state_arr = np.random.rand(BUFFER_SIZE, NUM_STATES)
        old_prob_arr = np.random.rand(BUFFER_SIZE)
        # Actions must be valid category indices, not floats in [0, 1).
        action_arr = np.random.randint(NUM_ACTIONS, size=BUFFER_SIZE)
        # Values are 1-D like the advantages: adding a (N, 1) array to a (N,)
        # array would broadcast the returns to (N, N).
        values = np.random.rand(BUFFER_SIZE)
        advantage = np.random.rand(BUFFER_SIZE)

        # Loop-invariant quantities, computed once.
        returns = advantage + values
        batch_starts = np.arange(0, BUFFER_SIZE, BATCH_SIZE)

        for _ in range(10):
            start = time.time()
            for _episode in range(EPISODES):
                for _epoch in range(EPOCHS):
                    # Fresh shuffle of the buffer for every epoch.
                    indices = np.random.permutation(BUFFER_SIZE)
                    for first in batch_starts:
                        batch = indices[first:first + BATCH_SIZE]
                        self._update_on_batch(
                            state_arr[batch],
                            action_arr[batch],
                            old_prob_arr[batch],
                            advantage[batch],
                            returns[batch],
                        )

            print((time.time() - start)/60)


# Script entry point: build the agent and run the timing benchmark.
if __name__ == "__main__":

    env_name = 'LunarLander-v2'
    agent = PPOAgent(env_name)
    agent.test() # runs the PPO update loop on random data and prints the minutes taken per pass


# Recorded output of the cell above (minutes per timed pass):
# 0.7715766708056132
# 0.7440051754315694
# 0.7442135492960612
# 0.7399461309115092
# 0.7412630319595337
# 0.7426882783571879
# 0.7460388580958048
# 0.7503896315892538
# 0.7407379388809204
# 0.7576242327690125
