<a href="https://colab.research.google.com/github/hallpaz/drl/blob/main/notebooks/proximal_policy_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

O objetivo deste projeto é implementar um ambiente de simulação no framework Gym para resolver um problema específico. Neste problema, há uma coluna denominada "Random" contendo 50 posições, cada uma com um valor aleatório entre 1 e 100 (distribuição uniforme). Além disso, há um campo chamado "Target" que recebe um valor aleatório entre 1 e 100 (distribuição uniforme).

A missão do agente é selecionar até 5 posições da coluna "Random", uma de cada vez, de modo que a soma dos valores das posições selecionadas seja igual ao valor do campo "Target", o que encerra o episódio.

In [None]:
!pip install gymnasium

É importante destacar que o agente só pode "olhar" para uma posição de cada vez da coluna "Random".

Um episódio é finalizado quando a soma das posições selecionadas atinge o valor de "Target" ou quando todos os valores da coluna "Random" já foram vistos ao menos uma vez.

Implementação:
1 - Criação do Ambiente de simulação utilizando o framework Gym.
2 - Definição inicial dos hiperparâmetros.
3 - Arquitetura da Rede Neural com Tensorflow.
4 - Treinamento do Agente.
5 - Teste e Demonstração do Agente Maduro.

In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

In [None]:
# Action codes for MatchingEnv.step(): COMBINE adds the value currently shown
# to the running selection; any other action (conventionally DONT_COMBINE)
# skips it and moves on to the next value.
COMBINE = 1
DONT_COMBINE = 0

In [None]:
class MatchingEnv(gym.Env):
  """Environment where an agent walks a column of random integers, one at a
  time, and decides whether to add each one to a running sum that should hit
  a target value exactly.

  Observations are 4-tuples:
  (value currently shown, amount still missing to reach the target,
  remaining steps, number of values selected so far).
  """

  def __init__(self, options_size=50, target_size=1,
               options_limit=5, max_value=100):
    """Store the configuration and draw the first episode.

    Args:
      options_size: how many random values the column holds.
      target_size: how many target values are drawn (only index 0 is used).
      options_limit: maximum number of values the agent may select.
      max_value: inclusive upper bound of every drawn value (lower bound is 1).
    """
    self.action_space = [0, 1]
    self.max_value = max_value
    self.options_limit = options_limit
    self.target_size = target_size
    self.options_size = options_size

    self.reset()

  def reset(self):
    """Draw a new column and target, clear the selection, and return the
    first observation."""
    upper_bound = self.max_value + 1  # randint's high end is exclusive
    self.options = np.random.randint(1, upper_bound, self.options_size)
    self.target = np.random.randint(1, upper_bound, self.target_size)
    self.current_option_index = 0
    self.current_target_index = 0
    self.selected = []

    first_observation = (
        self.options[self.current_option_index],
        self.target[self.current_target_index],
        self.options_size - 1,
        len(self.selected),
    )
    return first_observation

  def step(self, action):
    """Apply `action` to the value currently shown and advance one position.

    Returns:
      (next_state, reward, done, info) where info is always an empty dict.
    """
    took_value = action == COMBINE
    if took_value:
      picked = self.options[self.current_option_index]
      self.selected.append(picked)

    # Both actions move the cursor forward; the bookkeeping below is shared.
    self.current_option_index += 1
    selected_total = sum(self.selected)
    remaining_value = self.target[self.current_target_index] - selected_total
    remaining_steps = self.options_size - self.current_option_index - 1

    reward = 0
    done = False
    if took_value:
      if remaining_value > 0:
        # Still below the target: reward the value just added.
        reward = picked
      elif remaining_value == 0:
        # Exact match: bonus grows tenfold per selection slot left unused.
        reward = 100 * 10**(self.options_limit - len(self.selected))
        done = True
      else:
        # Overshoot: lose everything that was accumulated.
        reward = -selected_total
        done = True

    # Column exhausted without ever selecting anything.
    if remaining_steps < 0 and not self.selected:
      reward = -100

    out_of_steps = remaining_steps < 0
    out_of_picks = len(self.selected) >= self.options_limit
    if out_of_steps or out_of_picks or reward < 0:
      done = True

    # The modulo keeps the lookup in range once the cursor has run past the
    # last position.
    shown_value = self.options[self.current_option_index % self.options_size]
    next_state = (shown_value,
                  remaining_value,
                  remaining_steps,
                  len(self.selected))
    return next_state, reward, done, {}

In [None]:
# Single environment instance shared by training and evaluation, built with
# the defaults: 50 options, 1 target, at most 5 selections, values in 1..100.
env = MatchingEnv()

A cada seleção que o agente aposta em fazer, as posições da coluna "Random" são sorteadas novamente, exceto aquelas que foram previamente escolhidas pelo agente.

In [None]:
class Actor(tf.keras.Model):
  """Policy network: two ReLU hidden layers (256 and 128 units) followed by
  a softmax layer with one output per action."""

  def __init__(self, n_actions=2):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(units=256, activation='relu')
    self.d2 = tf.keras.layers.Dense(units=128, activation='relu')
    self.a = tf.keras.layers.Dense(units=n_actions, activation='softmax')

  def call(self, input_data):
    """Return the action probabilities for a batch of observations."""
    hidden = self.d2(self.d1(input_data))
    return self.a(hidden)

class Critic(tf.keras.Model):
  """Value network: two ReLU hidden layers (256 and 128 units) followed by
  a single linear output."""

  def __init__(self):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(units=256, activation='relu')
    self.d2 = tf.keras.layers.Dense(units=128, activation='relu')
    self.v = tf.keras.layers.Dense(units=1, activation=None)

  def call(self, input_data):
    """Return one value estimate per observation in the batch."""
    hidden = self.d2(self.d1(input_data))
    return self.v(hidden)

In [None]:
# Source for the code: https://github.com/abhisheksuran/Reinforcement_Learning/blob/master/PPO.ipynb
class Agent():
    """PPO agent holding separate actor and critic networks, each trained by
    its own Adam optimizer."""

    def __init__(self, gamma = 0.99, epsilon=0.2):
        """
        Args:
            gamma: discount factor (stored on the instance).
            epsilon: clipping range of the PPO probability ratio.
        """
        self.gamma = gamma
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
        self.c_opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
        self.actor = Actor()
        self.critic = Critic()
        self.clip_pram = epsilon

    def act(self, state):
        """Sample an action for a single observation from the actor's output.

        Returns:
            The sampled action index as a Python int.
        """
        prob = self.actor(np.array([state]))
        prob = prob.numpy()
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])

    def actor_loss(self, probs, actions, adv, old_probs, closs):
        """Clipped-surrogate PPO loss for the actor.

        Args:
            probs: (batch, n_actions) probabilities from the current actor.
            actions: (batch,) integer indices of the actions that were taken.
            adv: (batch,) advantage estimates.
            old_probs: (batch, n_actions) probabilities recorded at rollout
                time.
            closs: scalar critic loss, folded into the returned value.

        Returns:
            Scalar tensor: -(mean clipped surrogate - closs + 0.001 * entropy).
        """
        probs = tf.convert_to_tensor(probs)
        dtype = probs.dtype
        n_actions = probs.shape[-1]

        # Clip inside the log so a saturated softmax (p == 0) contributes 0 to
        # the entropy term instead of 0 * -inf = NaN.
        log_probs = tf.math.log(tf.clip_by_value(probs, 1e-10, 1.0))
        entropy = tf.reduce_mean(tf.math.negative(probs * log_probs))

        # Rollout data are constants: no gradient may flow through them.
        old_probs = tf.stop_gradient(tf.cast(old_probs, dtype))
        adv = tf.stop_gradient(tf.reshape(tf.cast(adv, dtype), (-1,)))

        # Select, per sample, the probability of the action actually taken.
        taken = tf.one_hot(tf.reshape(tf.cast(actions, tf.int32), (-1,)),
                           depth=n_actions, dtype=dtype)
        new_p = tf.reduce_sum(probs * taken, axis=-1)
        old_p = tf.reduce_sum(old_probs * taken, axis=-1)

        ratio = tf.math.divide(new_p, old_p)
        clipped_ratio = tf.clip_by_value(
            ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram)
        sr1 = ratio * adv
        sr2 = clipped_ratio * adv

        loss = tf.math.negative(
            tf.reduce_mean(tf.math.minimum(sr1, sr2)) - closs + 0.001 * entropy)
        return loss

    def learn(self, states, actions,  adv , old_probs, discnt_rewards):
        """Run one gradient step on both networks.

        Args:
            states: batch of observations.
            actions: actions taken for each observation.
            adv: advantage estimate for each observation.
            old_probs: actor probabilities recorded at rollout time.
            discnt_rewards: return targets for the critic.

        Returns:
            (actor_loss, critic_loss) for this step.
        """
        discnt_rewards = tf.reshape(discnt_rewards, (len(discnt_rewards),))
        adv = tf.reshape(adv, (len(adv),))
        # -1 keeps this independent of the number of actions.
        old_probs = tf.reshape(old_probs, (len(old_probs), -1))

        # One persistent tape because two gradients are taken from it.
        with tf.GradientTape(persistent=True) as tape:
            p = self.actor(states, training=True)
            v = self.critic(states, training=True)
            v = tf.reshape(v, (len(v),))
            c_loss = 0.5 * tf.keras.losses.mean_squared_error(discnt_rewards, v)
            a_loss = self.actor_loss(p, actions, adv, old_probs, c_loss)

        grads1 = tape.gradient(a_loss, self.actor.trainable_variables)
        grads2 = tape.gradient(c_loss, self.critic.trainable_variables)
        # A persistent tape holds its resources until it is dropped.
        del tape

        self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
        return a_loss, c_loss

In [None]:
def test_reward(agent, env):
  total_reward = 0
  state = env.reset()
  done = False
  while not done:
    action = np.argmax(agent.actor(np.array([state])).numpy())
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward

  return total_reward

In [None]:
# PPO agent with its default discount factor and clipping range.
agent = Agent()
# Number of outer training iterations (one rollout + update phase each).
steps = 10000

In [None]:
def preprocess(states, actions, rewards, done, values, gamma, lmbda=0.95):
    """Turn one rollout into training arrays using generalized advantage
    estimation (GAE).

    Args:
        states: sequence of T observations.
        actions: sequence of T action indices.
        rewards: sequence of T rewards.
        done: sequence of T continuation masks; the code multiplies the
            bootstrap terms by it, so 0 marks a step that ended an episode
            and 1 any other step.
        values: sequence of T + 1 value estimates (the last one bootstraps
            the step after the rollout).
        gamma: discount factor.
        lmbda: GAE smoothing factor.

    Returns:
        (states, actions, returns, adv) as float32 / int32 / float32 /
        float32 arrays; `adv` is normalised to zero mean and unit std.

    Raises:
        ValueError: if the sequence lengths are inconsistent.
    """
    n_steps = len(rewards)
    if len(done) != n_steps or len(values) != n_steps + 1:
        raise ValueError(
            "expected len(done) == len(rewards) and "
            "len(values) == len(rewards) + 1")

    # An array (not a list) so slicing and arithmetic below are well defined.
    values = np.asarray(values, dtype=np.float32)

    g = 0.0
    returns = []
    for i in reversed(range(n_steps)):
        # The mask zeroes both the bootstrap value and the running advantage
        # across an episode boundary.
        delta = rewards[i] + gamma * values[i + 1] * done[i] - values[i]
        g = delta + gamma * lmbda * done[i] * g
        returns.append(g + values[i])
    returns.reverse()

    returns = np.array(returns, dtype=np.float32)
    adv = returns - values[:-1]
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    return states, actions, returns, adv


# PPO training script: per outer iteration, collect a fixed-length rollout,
# run several update epochs on it, then evaluate the greedy policy and save
# the networks whenever the evaluation score improves.
# NOTE: only TensorFlow's RNG is seeded here; the environment draws its
# values from np.random, which this script does not seed.
tf.random.set_seed(7777777)
gamma = 1
ep_reward = []
total_avgr = []
target = False
best_reward = 0
avg_rewards_list = []


for s in range(steps):
  done = False
  state = env.reset()
  all_aloss = []
  all_closs = []
  rewards = []
  states = []
  actions = []
  probs = []
  dones = []
  values = []
  print(s, "new episod")

  # Rollout of 128 environment steps, possibly spanning several episodes.
  for e in range(128):

    action = agent.act(state)
    value = agent.critic(np.array([state])).numpy()
    next_state, reward, done, _ = env.step(action)
    # Stored as a continuation mask: 0 where the episode ended, 1 otherwise.
    dones.append(1-done)
    rewards.append(reward)
    states.append(state)
    actions.append(action)
    prob = agent.actor(np.array([state]))
    probs.append(prob[0])
    values.append(value[0][0])
    if done:
      # Continue from the first observation of the NEW episode; keeping the
      # terminal observation would make the agent act on a state that no
      # longer matches the environment.
      state = env.reset()
    else:
      state = next_state

  # Bootstrap value for the step that follows the rollout.
  value = agent.critic(np.array([state])).numpy()
  values.append(value[0][0])
  probs = np.stack(probs, axis=0)

  states, actions, returns, adv  = preprocess(states, actions, rewards, dones, values, gamma)

  for epoch in range(10):
      al, cl = agent.learn(states, actions, adv, probs, returns)
      all_aloss.append(al)
      all_closs.append(cl)

  avg_reward = np.mean([test_reward(agent, env) for _ in range(5)])
  print(f"total test reward is {avg_reward}")
  avg_rewards_list.append(avg_reward)
  if avg_reward > best_reward:
        print('best reward=' + str(avg_reward))
        agent.actor.save('model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        agent.critic.save('model_critic_{}_{}'.format(s, avg_reward), save_format="tf")
        best_reward = avg_reward

env.close()