In [1]:
#PPO with RNN - LSTM
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers

import numpy as np
import tensorflow as tf
import gymnasium as gym
from gym import *
import scipy.signal


In [2]:
#HOW PPO Works now (Keras example)

#
#def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None):
    # Build a feedforward neural network
#    for size in sizes[:-1]:
#        x = layers.Dense(units=size, activation=activation)(x)

#    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

#observation_input = keras.Input(shape=(observation_dimensions,), dtype="float32")


#logits = mlp(observation_input, list(wandb.config.get('hidden_sizes')) + [num_actions])

    
#actor = keras.Model(inputs=observation_input, outputs=logits)

#value = keras.ops.squeeze(mlp(observation_input, list(wandb.config.get('hidden_sizes')) + [1]), axis=1)
#critic = keras.Model(inputs=observation_input, outputs=value)

In [3]:
def discounted_cumulative_sums(x, discount):
    # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class Buffer:
    # Buffer for storing trajectories
    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        # Buffer initialization
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        # Append one step of agent-environment interaction
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        # Finish the trajectory by computing advantage estimates and rewards-to-go
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        # Get all data of the buffer and normalize the advantages
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean, advantage_std = (
            np.mean(self.advantage_buffer),
            np.std(self.advantage_buffer),
        )
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )


#def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None):
#    # Build a feedforward neural network
#    for size in sizes[:-1]:
#        x = layers.Dense(units=size, activation=activation)(x)
#    return layers.Dense(units=sizes[-1], activation=output_activation)(x)


#SWITCHING TO RNN 

#def rnn_lstm(x, sizes, activation=keras.activations.tanh, output_activation=None):
#    #Build a recurrent neural network
#    print(x)
#    print(sizes)
#    for size in sizes[:-1]:
#        #x = layers.LSTM(units=size, activation=activation)(x)
#        x = layers.LSTM(units=size, 64, 64, activation=activation)(x)
#    #return layers.LSTM(units=sizes[-1], activation=output_activation)(x)
#    return layers.Dense(units=sizes[-1], activation=output_activation)(x)

#from example online
#model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True, return_sequences=True))



def logprobabilities(logits, a):
    # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor)
    logprobabilities_all = keras.ops.log_softmax(logits)
    print(f"logprobabilities uses logprobabilities_all = {logprobabilities_all}")
    print(f"ops.one_hot = {keras.ops.one_hot(a, num_actions)}")
    logprobability = keras.ops.sum(
        keras.ops.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )
    return logprobability


seed_generator = keras.random.SeedGenerator(1337)


# Sample action from actor
@tf.function
def sample_action(observation):
    logits = actor(observation)
    print(f"sample action calls actor with observation: {observation}")
    action = keras.ops.squeeze(
        keras.random.categorical(logits, 1, seed=seed_generator), axis=1
    )
    return logits, action


# Train the policy by maxizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = keras.ops.exp(
            logprobabilities(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        min_advantage = keras.ops.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -keras.ops.mean(
            keras.ops.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    kl = keras.ops.mean(
        logprobability_buffer
        - logprobabilities(actor(observation_buffer), action_buffer)
    )
    kl = keras.ops.sum(kl)
    return kl


# Train the value function by regression on mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        value_loss = keras.ops.mean((return_buffer - critic(observation_buffer)) ** 2)
    value_grads = tape.gradient(value_loss, critic.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))

In [4]:
# Hyperparameters of the PPO algorithm
steps_per_epoch = 4000
epochs = 30
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (64, 64)

# True if you want to render the environment
render = False

In [5]:
# Initialize the environment and get the dimensionality of the
# observation space and the number of possible actions
env = gym.make("CartPole-v1")
observation_dimensions = env.observation_space.shape[0]

#print(env.observation_space.shape[0])
#print(env.observation_space.shape)

#env.observation_space = gym.spaces.utils.flatten_space(env.observation_space.shape)



#print(env.observation_space.shape[0])
#print(env.observation_space.shape)

num_actions = env.action_space.n
print(f"num_actions is {num_actions}")

# Initialize the buffer
buffer = Buffer(observation_dimensions, steps_per_epoch)

#print(f"buffer is {buffer}")

#env.observation_space.flatten()

# Initialize the actor and the critic as keras models
#observation_input = keras.Input(shape=(observation_dimensions,None,None), dtype="float32")

#technically what this would mean is that you have a 4 elements in the tensor, each with 1 element
observation_input = keras.Input(shape=(2,observation_dimensions), dtype="float32")


#print(f"observation_input is {observation_input}")
#observation_input2 = keras.Input(shape=(3, 1, 1), dtype="float32")
#print(f"observation_input2 is {observation_input2}")
#inputs are gonna need to change
#logits = rnn_lstm(observation_input, list(hidden_sizes) + [num_actions])
#actor = keras.Model(inputs=observation_input, outputs=logits)
#value = keras.ops.squeeze(rnn_lstm(observation_input, list(hidden_sizes) + [1]), axis=1)
#critic = keras.Model(inputs=observation_input, outputs=value)
#observation_input = keras.Input(shape=(4,4),  dtype="float32")
x = layers.LSTM(64, return_sequences=True, activation="relu")
#print(x)
x = x(observation_input)
#print(x)
#print(x.shape)
x = layers.LSTM(64)(x)
logits = layers.Dense(2)(x)

#logits = mlp(observation_input, list(hidden_sizes) + [num_actions])

#print(f"logits is {logits}")


actor = keras.Model(inputs=observation_input, outputs=logits)

#actor.summary()

#print(f"actor is {actor}")


observation_input = keras.Input(shape=(2,4),  dtype="float32")
y = layers.LSTM(64, return_sequences=True, activation="relu")
#print(y)
y = y(observation_input)
#print(y)
#print(y.shape)
y = layers.LSTM(64)(y)
val = layers.Dense(1)(y)

#value = keras.ops.squeeze(mlp(observation_input, list(hidden_sizes) + [1]), axis=1)
value = keras.ops.squeeze(val, axis=1)
#print(f"value is {value}")

critic = keras.Model(inputs=observation_input, outputs=value)


#print(f"critic is {critic}")

# Initialize the policy and the value function optimizers
policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)

# Initialize the observation, episode return and episode length
observation, _ = env.reset()
print(f"observation is {observation}")
episode_return, episode_length = 0, 0

num_actions is 2
observation is [ 0.01577359 -0.01936189  0.03465074 -0.00034461]


In [6]:
# Iterate over the number of epochs
for epoch in range(epochs):
    # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0
    #buffer_pointer = tf.constant([1, 1, 1, 1])
   
    buffer_pointer = np.array([1, 1, 1, 1])
   
    #buffer_pointer = tf.constant([1])
    print(buffer_pointer.shape)
    #batch_size = np.array([4])

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):
        if render:
            env.render()

        print(observation.shape)
        #observation = tf.stack([buffer_pointer, observation])
        observation = np.array([[buffer_pointer, observation]])

        print(observation.shape)
        print(f"observation is {observation}")
        #observation_new = 
        # Get the logits, action, and take one step in the environment
        #observation = observation.reshape(1, -1)
        #print(f"observation after reshape is {observation}")
        #observation = np.array([buffer_pointer, observation])
        #observation = buffer_pointer + observation
        
        #observation = tf.expand_dims(observation, axis=0)
        #observation[0] = buffer_pointer

        #observation = tf.stack([buffer_pointer, observation])

        
        print(f"observation is {observation}")
        logits, action = sample_action(observation)
        observation_new, reward, done, _, _ = env.step(action[0].numpy())
        #print(f"observation_new is {observation_new}")
        episode_return += reward
        episode_length += 1

        #observation = buffer_pointer + obvservation

        
        # Get the value and log-probability of the action

        #observation = observation.reshape(1, -1)
        print(f"observation is {observation}")
        print(f"observation[0][1] is {observation[0][1]}")
        observation = tf.convert_to_tensor(observation)
        #print(observation.type())
        #print(observation.shape)
        #observation.reshape(1, 1, 4)
        value_t = critic(observation)
        print(f"logits is {logits} and action is {action}")
        logprobability_t = logprobabilities(logits, action)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation[0][1], action, reward, value_t, logprobability_t)
        
        
        #observation =np.array([[buffer_pointer, observation_new]])
        # Update the observation
        observation = observation_new

        observation = np.array([[buffer_pointer, observation]])

        observation = tf.convert_to_tensor(observation)
        print(f"observation (the newer one) is {observation}")
        # Finish trajectory if reached to a terminal state
        terminal = done
        if terminal or (t == steps_per_epoch - 1):
            #last_value = 0 if done else critic(observation.reshape(1, -1))
            last_value = 0 if done else critic(observation)
            print(f"last_value {last_value}")
            buffer.finish_trajectory(last_value)
            print(f"hitting after buffer.finish_trajectory")
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation, _ = env.reset()
            print(f"observation in 'if terminal' clause is {observation}")
            episode_return, episode_length = 0, 0

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer,
    ) = buffer.get()

    # Update the policy and implement early stopping using KL divergence
    for _ in range(train_policy_iterations):
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        if kl > 1.5 * target_kl:
            # Early Stopping
            break

    # Update the value function
    for _ in range(train_value_iterations):
        train_value_function(observation_buffer, return_buffer)

    #buffer_pointer[0] = buffer_pointer[0]+1
    # Print mean return and length for each epoch
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
    )

(4,)
(4,)
(1, 2, 4)
observation is [[[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00]
  [ 1.57735869e-02 -1.93618909e-02  3.46507393e-02 -3.44605680e-04]]]
observation is [[[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00]
  [ 1.57735869e-02 -1.93618909e-02  3.46507393e-02 -3.44605680e-04]]]
sample action calls actor with observation: Tensor("observation:0", shape=(1, 2, 4), dtype=float64)
observation is [[[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00]
  [ 1.57735869e-02 -1.93618909e-02  3.46507393e-02 -3.44605680e-04]]]
observation[0][1] is [ 0.01577359 -0.01936189  0.03465074 -0.00034461]
logits is [[-0.01195843 -0.00201394]] and action is [0]
logprobabilities uses logprobabilities_all = [[-0.6981318 -0.6881873]]
ops.one_hot = [[1. 0.]]
observation (the newer one) is [[[ 1.          1.          1.          1.        ]
  [ 0.01538635 -0.21496321  0.03464385  0.30306652]]]
(1, 2, 4)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.