<a href="https://colab.research.google.com/github/hallpaz/drl/blob/main/notebooks/alternative_PPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

O objetivo deste projeto é implementar um ambiente de simulação no framework Gym para resolver um problema específico. Neste problema, há uma coluna denominada "Random" contendo 50 posições, cada uma com um valor aleatório entre 1 e 100 (distribuição uniforme). Além disso, há um campo chamado "Target" que recebe um valor aleatório entre 1 e 100 (distribuição uniforme).

A missão do agente é selecionar até 5 posições da coluna "Random", uma de cada vez, de modo que a soma dos valores dessas 5 posições alcance o valor do campo "Target", indicando o fim do episódio.

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


É importante destacar que o agente só pode "olhar" para uma posição de cada vez da coluna "Random".

Um episódio é finalizado se a soma de target foi satisfeita ou todas os valores na coluna Random foram vistos ao menos 1 vez.

Implementação:
1 - Criação do Ambiente de simulação utilizando o framework Gym.
2 - Definição inicial dos hiperparâmetros.
3 - Arquitetura da Rede Neural com Tensorflow.
4 - Treinamento do Agente.
5 - Teste e Demonstração do Agente Maduro.

In [2]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

In [3]:
COMBINE = 1
DONT_COMBINE = 0

In [4]:
class MatchingEnv(gym.Env):
  def __init__(self, options_size=50, target_size=1, options_limit=5, max_value=100):
    self.options_size = options_size
    self.target_size = target_size
    self.options_limit = options_limit
    self.max_value = max_value
    self.action_space = [0, 1]

    self.reset()

  def reset(self):
    self.options = np.random.randint(1, self.max_value + 1, self.options_size)
    self.target = np.random.randint(1, self.max_value + 1, self.target_size)
    self.current_option_index = 0
    self.current_target_index = 0
    self.selected = []

    return (self.options[self.current_option_index],
            self.target[self.current_target_index],
            self.options_size - 1,
            len(self.selected))

  def step(self, action):
    done = False
    reward = 0
    if action == COMBINE:
      current_value = self.options[self.current_option_index]
      self.selected.append(current_value)
      # avança para o próximo valor
      self.current_option_index += 1
      # calcula o quanto falta em relação ao target
      # self.target[self.current_target_index] -= current_value
      remaining_value = self.target[self.current_target_index] - sum(self.selected)
      # quantos passos ainda faltam
      remaining_steps = self.options_size - self.current_option_index - 1

      if remaining_value > 0:
        reward = current_value
      elif remaining_value == 0:
        reward = 100 * 10**(self.options_limit - len(self.selected))
        done = True
      else:
        reward = -sum(self.selected)
        done = True

    else:
      self.current_option_index += 1
      # calcula o quanto falta em relação ao target
      remaining_value = self.target[self.current_target_index] - sum(self.selected)
      # quantos passos ainda faltam
      remaining_steps = self.options_size - self.current_option_index - 1

    if remaining_steps < 0 and not self.selected:
      reward = -100

    if remaining_steps < 0 or len(self.selected) >= self.options_limit or reward < 0:
      done = True

    next_state = (self.options[self.current_option_index % self.options_size],
                remaining_value,
                remaining_steps,
                len(self.selected))
    print(next_state)
    return next_state, reward, done, {}

In [5]:
env = MatchingEnv()

In [14]:
class Critic(tf.keras.Model):
  def __init__(self):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(256, activation='relu')
    self.d2 = tf.keras.layers.Dense(256, activation='relu')
    self.v = tf.keras.layers.Dense(1, activation = None)

  def call(self, input_data):
    x = self.d1(input_data)
    x = self.d2(x)
    v = self.v(x)
    return v


class Actor(tf.keras.Model):
  def __init__(self, n_actions=2):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(256, activation='relu')
    self.d2 = tf.keras.layers.Dense(256, activation='relu')
    self.a = tf.keras.layers.Dense(n_actions, activation='softmax')

  def call(self, input_data):
    x = self.d1(input_data)
    x = self.d2(x)
    a = self.a(x)
    return a

In [15]:
# Code reference: https://medium.com/@sthanikamsanthosh1994/reinforcement-learning-part-8-proximal-policy-optimization-ppo-for-trading-9f1c3431f27d
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states),\
            np.array(self.actions),\
            np.array(self.probs),\
            np.array(self.vals),\
            np.array(self.rewards),\
            np.array(self.dones),\
            batches

    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []

In [16]:
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam

In [17]:
class Agent:
  def __init__(self, n_actions, gamma=0.99, alpha=0.0003,
                gae_lambda=0.95, policy_clip=0.2, batch_size=64,
                n_epochs=10, chkpt_dir='models/'):
      self.gamma = gamma
      self.policy_clip = policy_clip
      self.n_epochs = n_epochs
      self.gae_lambda = gae_lambda
      self.chkpt_dir = chkpt_dir

      self.actor = Actor(n_actions)
      self.actor.compile(optimizer=Adam(learning_rate=alpha))
      self.critic = Critic()
      self.critic.compile(optimizer=Adam(learning_rate=alpha))
      self.memory = PPOMemory(batch_size)

  def store_transition(self, state, action, probs, vals, reward, done):
      self.memory.store_memory(state, action, probs, vals, reward, done)

  def save_models(self):
      print('... saving models ...')
      self.actor.save(self.chkpt_dir + 'actor')
      self.critic.save(self.chkpt_dir + 'critic')

  def load_models(self):
      print('... loading models ...')
      self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
      self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

  def choose_action(self, observation):
      state = tf.convert_to_tensor([observation])

      probs = self.actor(state)
      dist = tfp.distributions.Categorical(probs)
      action = dist.sample()
      log_prob = dist.log_prob(action)
      value = self.critic(state)

      action = action.numpy()[0]
      value = value.numpy()[0]
      log_prob = log_prob.numpy()[0]

      return action, log_prob, value

  def learn(self):
      for _ in range(self.n_epochs):
          state_arr, action_arr, old_prob_arr, vals_arr,\
              reward_arr, dones_arr, batches = \
              self.memory.generate_batches()

          values = vals_arr
          advantage = np.zeros(len(reward_arr), dtype=np.float32)

          for t in range(len(reward_arr)-1):
              discount = 1
              a_t = 0
              for k in range(t, len(reward_arr)-1):
                  a_t += discount*(reward_arr[k] + self.gamma*values[k+1] * (
                      1-int(dones_arr[k])) - values[k])
                  discount *= self.gamma*self.gae_lambda
              advantage[t] = a_t

          for batch in batches:
              with tf.GradientTape(persistent=True) as tape:
                  states = tf.convert_to_tensor(state_arr[batch])
                  old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                  actions = tf.convert_to_tensor(action_arr[batch])

                  probs = self.actor(states)
                  dist = tfp.distributions.Categorical(probs)
                  new_probs = dist.log_prob(actions)

                  critic_value = self.critic(states)

                  critic_value = tf.squeeze(critic_value, 1)

                  prob_ratio = tf.math.exp(new_probs - old_probs)
                  weighted_probs = advantage[batch] * prob_ratio
                  clipped_probs = tf.clip_by_value(prob_ratio,
                                                    1-self.policy_clip,
                                                    1+self.policy_clip)
                  weighted_clipped_probs = clipped_probs * advantage[batch]
                  actor_loss = -tf.math.minimum(weighted_probs,
                                                weighted_clipped_probs)
                  actor_loss = tf.math.reduce_mean(actor_loss)

                  returns = advantage[batch] + values[batch]
                  critic_loss = keras.losses.MSE(critic_value, returns)

              actor_params = self.actor.trainable_variables
              actor_grads = tape.gradient(actor_loss, actor_params)
              critic_params = self.critic.trainable_variables
              critic_grads = tape.gradient(critic_loss, critic_params)
              self.actor.optimizer.apply_gradients(
                      zip(actor_grads, actor_params))
              self.critic.optimizer.apply_gradients(
                      zip(critic_grads, critic_params))

      self.memory.clear_memory()

In [18]:
agent = Agent(2)
steps = 10000

In [20]:
N = 20
batch_size = 5
n_epochs = 4
alpha = 0.0003
# agent = Agent(n_actions=3, batch_size=batch_size,
#               alpha=alpha, n_epochs=n_epochs,
#               input_dims=env.observation_space.shape)
n_games = 300

best_score = 0
score_history = []

learn_iters = 0
avg_score = 0
n_steps = 0

for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action, prob, val = agent.choose_action(np.expand_dims(observation, axis=0))
        observation_, reward, done, info = env.step(action)
        n_steps += 1
        score += reward
        agent.store_transition(observation, action,
                                prob, val, reward, done)
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
        observation = observation_
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
        agent.save_models()
    tf.summary.scalar('reward summary', data=avg_score, step=i)
    print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
          'time_steps', n_steps, 'learning_steps', learn_iters)
    # env.render()
# filename = 'PPO_trading_view.png'
# x = [i+1 for i in range(len(score_history))]
# plot_learning_curve(x, score_history, figure_file)

(11, 84, 48, 0)
(41, 73, 47, 1)
(46, 32, 46, 2)
(71, 32, 45, 2)
(42, 32, 44, 2)
(4, -10, 43, 3)
episode 0 score -42.0 avg score -42.0 time_steps 6 learning_steps 0
(57, 55, 48, 0)
(65, -2, 47, 1)
episode 1 score -57.0 avg score -49.5 time_steps 8 learning_steps 0
(89, 1, 48, 0)
(69, -88, 47, 1)
episode 2 score -89.0 avg score -62.7 time_steps 10 learning_steps 0
(78, 76, 48, 0)
(44, 76, 47, 0)
(62, 76, 46, 0)
(75, 76, 45, 0)
(65, 76, 44, 0)
(95, 76, 43, 0)
(13, 76, 42, 0)
(68, 76, 41, 0)
(35, 8, 40, 1)
(43, -27, 39, 2)


  advantage[t] = a_t


episode 3 score -35.0 avg score -55.8 time_steps 20 learning_steps 1
(43, 72, 48, 0)
(60, 72, 47, 0)
(49, 12, 46, 1)
(23, 12, 45, 1)
(18, 12, 44, 1)
(2, 12, 43, 1)
(47, 12, 42, 1)
(3, 12, 41, 1)
(77, 12, 40, 1)
(81, 12, 39, 1)
(15, 12, 38, 1)
(62, -3, 37, 2)
episode 4 score -15.0 avg score -47.6 time_steps 32 learning_steps 1
(47, 83, 48, 0)
(37, 83, 47, 0)
(32, 83, 46, 0)
(86, 51, 45, 1)
(69, 51, 44, 1)
(70, 51, 43, 1)
(53, 51, 42, 1)
(7, -2, 41, 2)
episode 5 score -53.0 avg score -48.5 time_steps 40 learning_steps 2
(9, 57, 48, 0)
(6, 57, 47, 0)
(31, 51, 46, 1)
(44, 51, 45, 1)
(40, 51, 44, 1)
(38, 51, 43, 1)
(85, 51, 42, 1)
(2, 51, 41, 1)
(30, 49, 40, 2)
(13, 49, 39, 2)
(45, 36, 38, 3)
(84, 36, 37, 3)
(88, 36, 36, 3)
(88, -52, 35, 4)
episode 6 score -88.0 avg score -54.1 time_steps 54 learning_steps 2
(44, 98, 48, 0)
(28, 98, 47, 0)
(61, 98, 46, 0)
(28, 98, 45, 0)
(51, 70, 44, 1)
(79, 70, 43, 1)
(100, -9, 42, 2)
episode 7 score -79.0 avg score -57.2 time_steps 61 learning_steps 3
(46

  advantage[t] = a_t


(24, 82, 45, 0)
(66, 82, 44, 0)
(80, 82, 43, 0)
(9, 2, 42, 1)
(31, 2, 41, 1)
(7, 2, 40, 1)
(18, 2, 39, 1)
(73, -16, 38, 2)
episode 89 score -18.0 avg score 11044.1 time_steps 568 learning_steps 28
(5, 30, 48, 0)
(11, 30, 47, 0)
(49, 30, 46, 0)
(83, 30, 45, 0)
(52, 30, 44, 0)
(23, -22, 43, 1)
episode 90 score -52.0 avg score 10922.2 time_steps 574 learning_steps 28
(77, -53, 48, 1)
episode 91 score -65.0 avg score 10802.8 time_steps 575 learning_steps 28
(91, -23, 48, 1)
episode 92 score -80.0 avg score 10685.7 time_steps 576 learning_steps 28
(89, -21, 48, 1)
episode 93 score -70.0 avg score 10571.3 time_steps 577 learning_steps 28
(71, 27, 48, 0)
(22, 27, 47, 0)
(81, 5, 46, 1)
(90, -76, 45, 2)
episode 94 score -81.0 avg score 10459.2 time_steps 581 learning_steps 29
(66, 13, 48, 0)
(7, 13, 47, 0)
(42, 13, 46, 0)
(31, 13, 45, 0)
(50, 13, 44, 0)
(65, 13, 43, 0)
(11, 13, 42, 0)
(71, 13, 41, 0)
(73, -58, 40, 1)
episode 95 score -71.0 avg score 10349.5 time_steps 590 learning_steps 29
(64,

  advantage[t] = a_t


(75, 50, 46, 0)
(3, 50, 45, 0)
(40, 47, 44, 1)
(26, 47, 43, 1)
(74, 47, 42, 1)
(77, 47, 41, 1)
(99, 47, 40, 1)
(39, 47, 39, 1)
(70, 47, 38, 1)
(99, -23, 37, 2)
episode 226 score -70.0 avg score 11940.5 time_steps 1370 learning_steps 68
(45, 95, 48, 0)
(24, 95, 47, 0)
(76, 71, 46, 1)
(14, 71, 45, 1)
(83, 57, 44, 2)
(58, -26, 43, 3)
episode 227 score -83.0 avg score 11940.5 time_steps 1376 learning_steps 68
(14, 53, 48, 0)
(96, 39, 47, 1)
(2, -57, 46, 2)
episode 228 score -96.0 avg score 11940.1 time_steps 1379 learning_steps 68
(62, 23, 48, 1)
(44, 23, 47, 1)
(77, -21, 46, 2)
episode 229 score -44.0 avg score 11940.4 time_steps 1382 learning_steps 69
(79, 50, 48, 0)
(48, 50, 47, 0)
(68, 50, 46, 0)
(91, 50, 45, 0)
(80, 50, 44, 0)
(19, 50, 43, 0)
(41, 50, 42, 0)
(75, 9, 41, 1)
(32, 9, 40, 1)
(95, 9, 39, 1)
(86, 9, 38, 1)
(43, 9, 37, 1)
(50, 9, 36, 1)
(60, 9, 35, 1)
(66, 9, 34, 1)
(19, -57, 33, 2)
episode 230 score -66.0 avg score 11940.4 time_steps 1398 learning_steps 69
(69, 44, 48, 0)
(

  advantage[t] = a_t


(57, 28, 47, 1)
(74, 28, 46, 1)
(26, 28, 45, 1)
(65, 28, 44, 1)
(72, 28, 43, 1)
(10, -44, 42, 2)
episode 244 score -72.0 avg score 11941.3 time_steps 1486 learning_steps 74
(76, -22, 48, 1)
episode 245 score -53.0 avg score 11941.1 time_steps 1487 learning_steps 74
(67, 73, 48, 0)
(6, 6, 47, 1)
(77, 0, 46, 2)
... saving models ...
episode 246 score 100067.0 avg score 12942.1 time_steps 1490 learning_steps 74
(10, -29, 48, 1)
episode 247 score -90.0 avg score 12941.7 time_steps 1491 learning_steps 74
(72, 39, 48, 0)
(5, 39, 47, 0)
(96, 39, 46, 0)
(64, -57, 45, 1)
episode 248 score -96.0 avg score 12941.3 time_steps 1495 learning_steps 74
(15, 39, 48, 1)
(64, 39, 47, 1)
(96, 39, 46, 1)
(66, 39, 45, 1)
(72, 39, 44, 1)


  advantage[t] = a_t


(100, -33, 43, 2)
episode 249 score -72.0 avg score 12941.5 time_steps 1501 learning_steps 75
(86, 6, 48, 0)
(80, 6, 47, 0)
(8, 6, 46, 0)
(27, 6, 45, 0)
(2, 6, 44, 0)
(7, 4, 43, 1)
(30, -3, 42, 2)
episode 250 score -7.0 avg score 12942.0 time_steps 1508 learning_steps 75
(55, 30, 48, 0)
(30, 30, 47, 0)
(49, 30, 46, 0)
(39, 30, 45, 0)
(75, -9, 44, 1)
episode 251 score -39.0 avg score 12942.0 time_steps 1513 learning_steps 75
(68, 81, 48, 0)
(64, 81, 47, 0)
(26, 81, 46, 0)
(47, 81, 45, 0)
(75, 81, 44, 0)
(35, 81, 43, 0)
(83, 81, 42, 0)
(84, -2, 41, 1)
episode 252 score -83.0 avg score 12941.3 time_steps 1521 learning_steps 76
(98, 73, 48, 1)
(96, -25, 47, 2)
episode 253 score -98.0 avg score 12940.9 time_steps 1523 learning_steps 76
(33, 81, 48, 0)
(93, 81, 47, 0)
(71, 81, 46, 0)
(78, 81, 45, 0)
(73, 81, 44, 0)
(50, 81, 43, 0)
(71, 81, 42, 0)
(71, 81, 41, 0)
(34, 10, 40, 1)
(73, 10, 39, 1)
(16, 10, 38, 1)
(91, -6, 37, 2)
episode 254 score -16.0 avg score 12941.5 time_steps 1535 learning_

  advantage[t] = a_t


(83, 19, 44, 0)
(28, 19, 43, 0)
(20, 19, 42, 0)
(31, 19, 41, 0)
(34, 19, 40, 0)
(2, 19, 39, 0)
(57, 19, 38, 0)
(97, -38, 37, 1)
episode 259 score -57.0 avg score 12942.2 time_steps 1568 learning_steps 78
(71, 28, 48, 1)
(11, 28, 47, 1)
(22, 28, 46, 1)
(78, 28, 45, 1)
(50, 28, 44, 1)
(73, 28, 43, 1)
(72, 28, 42, 1)
(9, 28, 41, 1)
(81, 28, 40, 1)
(70, 28, 39, 1)
(84, 28, 38, 1)
(44, 28, 37, 1)
(64, 28, 36, 1)
(12, 28, 35, 1)
(32, 16, 34, 2)
(56, 16, 33, 2)
(69, 16, 32, 2)
(83, -53, 31, 3)
episode 260 score -69.0 avg score 12942.2 time_steps 1586 learning_steps 79
(8, 8, 48, 0)
(54, 0, 47, 1)
... saving models ...
episode 261 score 1000000.0 avg score 22942.8 time_steps 1588 learning_steps 79
(4, 36, 48, 0)
(28, 32, 47, 1)
(78, 4, 46, 2)
(71, -74, 45, 3)
episode 262 score -78.0 avg score 22942.8 time_steps 1592 learning_steps 79
(89, 63, 48, 0)
(70, -26, 47, 1)
episode 263 score -89.0 avg score 22942.5 time_steps 1594 learning_steps 79
(74, 11, 48, 0)
(60, -63, 47, 1)
episode 264 score -7

  advantage[t] = a_t


(47, 7, 44, 1)
(7, -40, 43, 2)
episode 265 score -47.0 avg score 22942.6 time_steps 1602 learning_steps 80
(66, 42, 48, 0)
(7, 42, 47, 0)
(14, 42, 46, 0)
(69, 42, 45, 0)
(97, 42, 44, 0)
(38, 42, 43, 0)
(32, 4, 42, 1)
(53, 4, 41, 1)
(20, 4, 40, 1)
(49, 4, 39, 1)
(89, -45, 38, 2)
episode 266 score -49.0 avg score 22942.6 time_steps 1613 learning_steps 80
(78, 44, 48, 1)
(46, 44, 47, 1)
(52, 44, 46, 1)
(62, 44, 45, 1)
(47, 44, 44, 1)
(39, 44, 43, 1)
(87, 44, 42, 1)
(78, 44, 41, 1)
(23, 44, 40, 1)
(27, 44, 39, 1)
(11, 44, 38, 1)
(29, 44, 37, 1)
(45, 15, 36, 2)
(76, -30, 35, 3)
... saving models ...
episode 267 score -45.0 avg score 22942.9 time_steps 1627 learning_steps 81
(37, 9, 48, 0)
(9, 9, 47, 0)
(97, 9, 46, 0)
(60, 9, 45, 0)
(66, 9, 44, 0)
(78, -57, 43, 1)
episode 268 score -66.0 avg score 22942.7 time_steps 1633 learning_steps 81
(64, 36, 48, 0)
(65, 36, 47, 0)
(16, 36, 46, 0)
(31, 20, 45, 1)
(66, -11, 44, 2)
... saving models ...
episode 269 score -31.0 avg score 22942.9 time_steps

  advantage[t] = a_t


(32, 46, 46, 0)
(91, 14, 45, 1)
(32, 14, 44, 1)
(52, 14, 43, 1)
(92, 14, 42, 1)
(60, -78, 41, 2)
... saving models ...
episode 270 score -92.0 avg score 22943.0 time_steps 1646 learning_steps 82
(47, 97, 48, 0)
(10, 97, 47, 0)
(25, 97, 46, 0)
(99, 97, 45, 0)
(73, 97, 44, 0)
(8, 24, 43, 1)
(79, 24, 42, 1)
(54, -55, 41, 2)
episode 271 score -79.0 avg score 22942.8 time_steps 1654 learning_steps 82
(74, 35, 48, 0)
(58, 35, 47, 0)
(73, 35, 46, 0)
(70, 35, 45, 0)
(14, -35, 44, 1)
episode 272 score -70.0 avg score 22942.8 time_steps 1659 learning_steps 82
(73, 74, 48, 0)


  advantage[t] = a_t


(60, 74, 47, 0)
(31, 14, 46, 1)
(21, 14, 45, 1)
(50, 14, 44, 1)
(1, 14, 43, 1)
(68, 14, 42, 1)
(99, 14, 41, 1)
(45, -85, 40, 2)
episode 273 score -99.0 avg score 22942.2 time_steps 1668 learning_steps 83
(75, 82, 48, 0)
(95, 7, 47, 1)
(33, 7, 46, 1)
(33, 7, 45, 1)
(16, 7, 44, 1)
(23, 7, 43, 1)
(24, 7, 42, 1)
(22, 7, 41, 1)
(42, 7, 40, 1)
(87, 7, 39, 1)
(40, 7, 38, 1)
(89, -33, 37, 2)
episode 274 score -40.0 avg score 22942.5 time_steps 1680 learning_steps 84
(92, 88, 48, 0)
(76, -4, 47, 1)
episode 275 score -92.0 avg score 22942.3 time_steps 1682 learning_steps 84
(100, -57, 48, 1)
episode 276 score -60.0 avg score 22942.4 time_steps 1683 learning_steps 84
(38, 79, 48, 0)
(99, 41, 47, 1)
(99, 41, 46, 1)
(71, 41, 45, 1)
(96, 41, 44, 1)
(55, 41, 43, 1)
(88, 41, 42, 1)
(52, -47, 41, 2)
episode 277 score -88.0 avg score 22942.4 time_steps 1691 learning_steps 84
(27, 7, 48, 1)
(80, -20, 47, 2)
episode 278 score -27.0 avg score 22943.0 time_steps 1693 learning_steps 84
(72, 91, 48, 0)
(47, 9