# Enable GPU

In [None]:
import torch
# Pick the first CUDA device when one is usable, otherwise fall back to the CPU.
# NOTE: is_available must be *called*; the bare function object is always truthy,
# which would select 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Actor Critic Shared Network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCritic_Net(nn.Module):
  """Actor and critic heads on top of one shared hidden layer.

  Args:
    input_dims: size of the state vector fed to the shared layer.
    output_dims: number of outputs of the actor head (one per action).
    fc1_dims: width of the shared hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super(ActorCritic_Net , self).__init__()
    self.fc1 = nn.Linear(input_dims,fc1_dims)
    self.actor = nn.Linear(fc1_dims, output_dims)
    self.critic = nn.Linear(fc1_dims,1)

  def forward(self, state):
    """Return ``(pi, value)`` for ``state``.

    ``pi`` is the softmax of the actor head taken over the last dimension and
    ``value`` is the raw output of the critic head.
    """
    x = F.relu(self.fc1(state))
    # Normalise over the last (action) dimension: identical to dim = 1 for a
    # (batch, features) input, and also valid for a single unbatched state.
    pi = F.softmax(self.actor(x), dim = -1)
    value = self.critic(x)
    return (pi, value)



# Actor Critic Separate Net

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Actor_Net(nn.Module):
  """Policy network: one hidden ReLU layer followed by a softmax output.

  Both linear layers have their weights initialised with Xavier-uniform.

  Args:
    input_dims: size of the state vector.
    output_dims: number of outputs (one probability per action).
    fc1_dims: width of the hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super(Actor_Net,self).__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)
    self.out = nn.Linear(fc1_dims, output_dims)
    torch.nn.init.xavier_uniform_(self.fc1.weight)
    torch.nn.init.xavier_uniform_(self.out.weight)

  def forward(self, state):
    """Return the action probabilities for ``state``."""
    x = F.relu(self.fc1(state))
    # Softmax over the last (action) dimension: same result as dim = 1 for a
    # (batch, features) input, and also works for a single unbatched state.
    x = F.softmax(self.out(x), dim = -1)

    return x

class Critic_Net(nn.Module):
  """State-value network: one hidden ReLU layer and a single linear output.

  ``output_dims`` is accepted so the constructor mirrors ``Actor_Net``; it is
  not used, because the value head always has exactly one output unit.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super().__init__()
    hidden_layer = nn.Linear(input_dims, fc1_dims)
    value_head = nn.Linear(fc1_dims, 1)
    self.fc1 = hidden_layer
    self.out = value_head
    # Xavier-uniform weights for both layers (biases keep their defaults).
    for layer in (self.fc1, self.out):
      nn.init.xavier_uniform_(layer.weight)

  def forward(self, state):
    """Return the value estimate produced for ``state``."""
    hidden = torch.relu(self.fc1(state))
    return self.out(hidden)

# REINFORCE with Baseline Agent

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical 
import numpy as np

class ActorCritic(nn.Module):
  """REINFORCE-with-baseline agent built from a separate actor and critic.

  The actor (``Actor_Net``) outputs action probabilities and the critic
  (``Critic_Net``) outputs a state-value baseline. Each network has its own
  optimizer. Three update rules are provided:

  * ``learn_mean``     - one update per episode on the mean loss of all steps.
  * ``learn_forward``  - one update per step, first step of the episode first.
  * ``learn_backward`` - one update per step, last step of the episode first.

  Args:
    input_dims: size of the state vector.
    output_dims: number of discrete actions.
    fc1_dims: hidden width passed to both networks.
    gamma: discount factor used for the returns.
    ac_lr: learning rate of the actor optimizer.
    cr_lr: learning rate of the critic optimizer.
    optimizer: ``'RMSprop'`` selects RMSprop; any other value selects Adam.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128, gamma = 0.99, ac_lr = 1e-3, cr_lr = 1e-2, optimizer = 'RMSprop'):
    super(ActorCritic, self).__init__()
    self.actor_net = Actor_Net(input_dims= input_dims, output_dims= output_dims, fc1_dims= fc1_dims)
    self.critic_net = Critic_Net(input_dims= input_dims, output_dims = output_dims, fc1_dims= fc1_dims)
    optim_cls = optim.RMSprop if optimizer == 'RMSprop' else optim.Adam
    self.actor_optim = optim_cls(params = self.actor_net.parameters(), lr = ac_lr)
    self.critic_optim = optim_cls(params = self.critic_net.parameters(), lr = cr_lr)

    self.gamma = gamma

    # Per-episode buffers. log_probs is filled by get_action for inspection
    # only (its entries carry no gradient); values is never filled here and is
    # kept for backward compatibility. Both are reset by every learn_* call.
    self.log_probs = []
    self.values = []

  def get_action(self, state):
    """Sample an action for ``state`` from the current policy.

    Sampling runs under ``torch.no_grad()``, so the log-probability appended
    to ``self.log_probs`` cannot be back-propagated through; the ``learn_*``
    methods recompute log-probabilities from the stored states and actions.

    Returns:
      The sampled action as a tensor.
    """
    with torch.no_grad():
      pi = self.actor_net(state)
      distribution = Categorical(probs = pi)
      action = distribution.sample()
      self.log_probs.append(distribution.log_prob(action))

    return action

  def _device(self):
    """Return the device the agent's parameters currently live on."""
    return next(self.parameters()).device

  def _discounted_returns(self, rewards, return_norm):
    """Return the discounted returns G_t as a float32 tensor in time order.

    When ``return_norm`` is true the returns are standardised to zero mean and
    unit standard deviation. Episodes with fewer than two steps are left
    unnormalised because the standard deviation of one value is NaN.
    """
    returns = []
    G = 0.0
    # Accumulate from the last step backwards, then restore time order.
    for reward in reversed(list(rewards)):
      G = float(reward) + self.gamma * G
      returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype = torch.float32, device = self._device())

    if return_norm and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def _reset_memory(self):
    """Clear the per-episode buffers."""
    self.values = []
    self.log_probs = []

  def _step_update(self, state, action, G):
    """Apply one critic update and one actor update for a single time step.

    The advantage is computed from the critic's value before its update and is
    detached, so the actor loss only produces gradients for the actor.
    """
    state = state.unsqueeze(0)
    v = self.critic_net(state).reshape(1)
    log_prob = Categorical(probs = self.actor_net(state)).log_prob(action)

    target = G.detach().reshape(1)
    advantage = (target - v).detach()
    critic_loss = F.smooth_l1_loss(v, target)
    actor_loss = (-log_prob * advantage).sum()

    self.critic_optim.zero_grad()
    critic_loss.backward()
    self.critic_optim.step()
    self.actor_optim.zero_grad()
    actor_loss.backward()
    self.actor_optim.step()

  def learn_mean(self, rewards ,states, actions, return_norm = True):
    """Update both networks once, using the mean loss over the whole episode.

    Args:
      rewards: per-step rewards of the episode, in time order.
      states: list of state tensors, concatenated along dim 0.
      actions: list of the integer actions that were taken.
      return_norm: standardise the returns before computing the losses.
    """
    dev = self._device()
    states = torch.cat(states, dim = 0).to(dev)
    actions = torch.tensor(actions).to(dev)
    returns = self._discounted_returns(rewards, return_norm)

    values = self.critic_net(states).squeeze(-1)
    # Recompute the log-probabilities with gradients enabled: the ones stored
    # by get_action were created under no_grad and cannot train the actor.
    log_probs = Categorical(probs = self.actor_net(states)).log_prob(actions)

    advantages = returns - values.detach()  # baseline must not receive actor gradients
    actor_loss = (-log_probs * advantages).mean()
    critic_loss = F.smooth_l1_loss(values, returns)

    self.critic_optim.zero_grad()
    self.actor_optim.zero_grad()
    critic_loss.backward()
    actor_loss.backward()
    self.critic_optim.step()
    self.actor_optim.step()

    self._reset_memory()

  def learn_forward(self, rewards, states, actions, return_norm = True):
    """Update both networks after every step, from the first step to the last.

    Arguments are the same as for ``learn_mean``.
    """
    dev = self._device()
    states = torch.cat(states, dim = 0).to(dev)
    actions = torch.tensor(actions).to(dev)
    returns = self._discounted_returns(rewards, return_norm)

    for G, state, action in zip(returns, states, actions):
      self._step_update(state, action, G)

    self._reset_memory()

  def learn_backward(self, rewards, states, actions, return_norm = True):
    """Update both networks after every step, from the last step to the first.

    Arguments are the same as for ``learn_mean``.
    """
    dev = self._device()
    # Reverse states, actions and returns together so they stay aligned.
    states = torch.cat(states, dim = 0).to(dev).flip(dims = [0])
    actions = torch.tensor(actions).to(dev).flip(dims = [0])
    returns = self._discounted_returns(rewards, return_norm).flip(dims = [0])

    for G, state, action in zip(returns, states, actions):
      self._step_update(state, action, G)

    self._reset_memory()


# Without Wandb

In [None]:
import gym
import torch
import time
import pdb

def train():
  """Train an ``ActorCritic`` agent on CartPole-v1 and print its progress.

  Runs 1000 episodes. After every episode the agent is updated with
  ``learn_backward``, and every 100 episodes the exponentially smoothed score
  and the elapsed wall-clock time are printed.
  """

  start = time.time()

  env = gym.make('CartPole-v1')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # is_available must be called; the bare function object is always truthy and
  # would select CUDA even when no GPU is present.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, ac_lr = 1e-3, cr_lr = 0.01, optimizer= 'Adam').to(device)
  num_ep = 1000
  print_interval = 100
  running_score = 10

  for ep in range(num_ep):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []

    while not done:
      # Build the (1, state_dim) float tensor directly from the observation
      # instead of going through a Python list of arrays.
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())
      # update score and state
      score += reward
      state = next_state

    # calculating score and running score
    running_score = 0.05 * score + (1 - 0.05) * running_score

    # train the agent (the leftover pdb breakpoint that paused every episode
    # has been removed)
    agent.learn_backward(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))
  

In [None]:
# Run the training loop defined by train().
train()


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.7/bdb.py", line 332, in set_trace
    sys.settrace(self.trace_dispatch)



> <ipython-input-5-51800f8a14b5>(53)train()
-> agent.learn_backward(rewards, states, actions, return_norm = True)
(Pdb) s
--Call--
> <ipython-input-4-8a54cfed6727>(124)learn_backward()
-> def learn_backward(self, rewards, states, actions, return_norm = True):
(Pdb) n
> <ipython-input-4-8a54cfed6727>(126)learn_backward()
-> returns = []
(Pdb) n
> <ipython-input-4-8a54cfed6727>(127)learn_backward()
-> states = torch.cat(states, dim = 0).to(device).flip(dims = [0])
(Pdb) list
122  	    self.log_probs = []
123  	
124  	  def learn_backward(self, rewards, states, actions, return_norm = True):
125  	
126  	    returns = []
127  ->	    states = torch.cat(states, dim = 0).to(device).flip(dims = [0])
128  	    actions = torch.tensor(actions).to(device).flip(dims = [0])
129  	    rewards = torch.tensor(rewards).to(device).flip(dims= [0])
130  	
131  	    # Calculate returns
132  	    G = 0
(Pdb) n
> <ipython-input-4-8a54cfed6727>(128)learn_backward()
-> actions = torch.tensor(actions).to(device)


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.7/bdb.py", line 357, in set_quit
    sys.settrace(None)



BdbQuit: ignored

# With Wandb

In [None]:
!pip install wandb
!wandb login

Collecting wandb
  Downloading wandb-0.12.4-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 8.3 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 73.8 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.1-py3-none-any.whl (7.5 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.4.3-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 65.1 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 8.0 MB/s 
Colle

In [None]:
import wandb
# Grid sweep: every combination of update rule, actor learning rate and critic
# learning rate is tried, with 'running_score' as the metric to maximise.
sweep_config = {
  'method': 'grid',
  'metric': {'name': 'running_score', 'goal': 'maximize'},
  'parameters': {
    'learning': {'values': ['learn_mean', 'learn_forward', 'learn_backward']},
    'actor_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'critic_learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'num_neurons': {'value': 128},
    'optimizer': {'values': ['Adam']},
  },
}

# Register the sweep and keep its id for the agent call.
sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_Baseline_seperate_net')

Create sweep with ID: n7voo9d6
Sweep URL: https://wandb.ai/ko120/REINFORCE_Baseline_seperate_net/sweeps/n7voo9d6


In [None]:
import gym 
import torch
import time
import wandb

def train():
  """Train an ``ActorCritic`` agent on CartPole-v1 for one wandb run.

  Hyperparameters are read from ``wandb.config``: ``actor_learning_rate``,
  ``critic_learning_rate``, ``optimizer`` and ``learning`` (the name of the
  update rule: 'learn_mean', 'learn_forward' or 'learn_backward'). The
  smoothed score is logged after every episode, and after the last episode
  both networks are exported as ONNX and state-dict files and saved to wandb.

  Raises:
    ValueError: if ``config.learning`` is not one of the three update rules.
  """
  # NOTE(review): the keys 'algorithm:' and 'num_laeyrs' look misspelt; they are
  # left unchanged so new runs stay comparable with already logged ones.
  wandb.init(config = {'env':'CartPole-v1','algorithm:': 'REINFORCE_Baseline','architecture': 'seperate','num_laeyrs':'2'}, project = 'REINFORCE_Baseline_seperate_net',group = 'Cart_REINFORCE_Baseline_with_128_seperate')
  config = wandb.config
  start = time.time()

  env = gym.make('CartPole-v1')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  # is_available must be called; the bare function object is always truthy and
  # would select CUDA even when no GPU is present.
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # NOTE(review): the sweep's 'num_neurons' value is not forwarded here, so the
  # agent always uses its default hidden width - confirm this is intended.
  agent = ActorCritic(input_dims = state_dim,  output_dims = action_dim, ac_lr = config.actor_learning_rate, cr_lr = config.critic_learning_rate, optimizer = config.optimizer).to(device)

  # Resolve the update rule once and fail fast on an unknown name instead of
  # silently running every episode without training.
  learners = {
    'learn_mean': agent.learn_mean,
    'learn_forward': agent.learn_forward,
    'learn_backward': agent.learn_backward,
  }
  if config.learning not in learners:
    raise ValueError('unknown learning method: {!r}'.format(config.learning))
  learn = learners[config.learning]

  num_ep = 1000
  print_interval = 100
  running_score = 10

  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    states = []
    actions = []
    while not done:
      # Build the (1, state_dim) float tensor directly from the observation
      # instead of going through a Python list of arrays.
      state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
      action = agent.get_action(state)
      next_state, reward, done, _ = env.step(action.item())

      # saving episode
      rewards.append(reward)
      states.append(state)
      actions.append(action.item())

      # update score and state
      score += reward
      state = next_state

    # calculating score and running score
    running_score = 0.05 * score + (1 - 0.05) * running_score
    wandb.log({'episode': ep, 'running_score': running_score})

    # train the agent
    learn(rewards, states, actions, return_norm = True)

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

  # Export the final networks. The dummy input uses the environment's state
  # size rather than a hard-coded 4.
  dummy_input = torch.rand(1, state_dim).to(device)
  torch.onnx.export(agent.actor_net,dummy_input,'final_actor.onnx')
  torch.onnx.export(agent.critic_net,dummy_input, 'final_critic.onnx')
  wandb.save('final_actor.onnx')
  wandb.save('final_critic.onnx')
  torch.save(agent.actor_net.state_dict(),'final_actor.pt')
  wandb.save('final_actor.pt')
  torch.save(agent.critic_net.state_dict(),'final_critic.pt')
  wandb.save('final_critic.pt')
    

In [None]:
# Start a wandb agent for the sweep, passing train as the function to run.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: vrruefd7 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam
[34m[1mwandb[0m: Currently logged in as: [33mko120[0m (use `wandb login --relogin` to force relogin)


episode 100 average reward 213.25002155086145, ended at 26.9
episode 200 average reward 396.24426920202717, ended at 94.6
episode 300 average reward 481.78605037226305, ended at 175.1
episode 400 average reward 481.5427352977531, ended at 243.2
episode 500 average reward 499.2185522422537, ended at 324.6
episode 600 average reward 373.40313621388196, ended at 388.5
episode 700 average reward 495.62879093313165, ended at 468.3
episode 800 average reward 499.97412012899093, ended at 550.7
episode 900 average reward 499.9838170355653, ended at 632.4
episode 1000 average reward 499.9999041882858, ended at 715.7


VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,499.9999


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▄▆▇▆▇███▇▆▇▇██████▅▆▇███████████████


[34m[1mwandb[0m: Agent Starting Run: 60r7grr7 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.322517879723577, ended at 3.8
episode 200 average reward 9.571770296810373, ended at 8.2
episode 300 average reward 9.45952177884975, ended at 12.2
episode 400 average reward 9.267446914215702, ended at 15.9
episode 500 average reward 9.350072635557495, ended at 19.5
episode 600 average reward 9.419099381701852, ended at 23.1
episode 700 average reward 9.462435083397038, ended at 26.9
episode 800 average reward 9.259286901607013, ended at 30.6
episode 900 average reward 9.110490440818939, ended at 34.5
episode 1000 average reward 9.545656879526401, ended at 38.2


VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▄▂▂▂▂▂█▄▂▂▂▂▂▂▁▁▂▂▂▂▂▂▂▃▂▂▁▂▁▁▁▂▂▂▂▁▁▁▁▂


[34m[1mwandb[0m: Agent Starting Run: a64liyrz with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 59.69553352371371, ended at 16.9
episode 200 average reward 15.121253096237892, ended at 31.0
episode 300 average reward 9.492377653921498, ended at 34.9
episode 400 average reward 9.267641438384123, ended at 38.9
episode 500 average reward 9.35007378724352, ended at 42.7
episode 600 average reward 9.419099388520442, ended at 46.8
episode 700 average reward 9.462435083437407, ended at 50.7
episode 800 average reward 9.259286901607252, ended at 54.5
episode 900 average reward 9.110490440818943, ended at 58.3
episode 1000 average reward 9.545656879526401, ended at 62.3


VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃▂▆▆█▆▇▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: kzru83hd with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 85.10167119653354, ended at 7.9
episode 200 average reward 375.03589108515945, ended at 57.6
episode 300 average reward 444.75972407939497, ended at 129.9
episode 400 average reward 496.7245581802775, ended at 212.3
episode 500 average reward 498.2931466344466, ended at 294.3
episode 600 average reward 499.989894524774, ended at 377.0
episode 700 average reward 448.6332244145113, ended at 457.8
episode 800 average reward 497.0558404580649, ended at 537.3
episode 900 average reward 478.0769030531915, ended at 615.4
episode 1000 average reward 427.0607899023673, ended at 671.4


VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,427.06079


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▂▃▅▆▆▆▇▆█████████████████▇████▇█▇▄▅▇


[34m[1mwandb[0m: Agent Starting Run: xejrdgoa with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.346888661013052, ended at 4.0
episode 200 average reward 9.279327084739474, ended at 8.1
episode 300 average reward 9.457790360267394, ended at 12.1
episode 400 average reward 9.267436663301396, ended at 15.7
episode 500 average reward 9.350072574866658, ended at 19.6
episode 600 average reward 9.419099381342528, ended at 23.2
episode 700 average reward 9.462435083394912, ended at 26.9
episode 800 average reward 9.259286901606998, ended at 30.5
episode 900 average reward 9.110490440818939, ended at 34.2
episode 1000 average reward 9.545656879526401, ended at 37.9


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▅█▅▃▂▃▂▂▂▂▂▃▂▂▁▂▂▂▃▂▂▃▃▃▂▂▂▂▂▂▁▂▂▃▂▁▂▂▂▃


[34m[1mwandb[0m: Agent Starting Run: myopvmdr with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 41.384501052726286, ended at 13.2
episode 200 average reward 30.10521664955143, ended at 28.2
episode 300 average reward 20.52375217991972, ended at 38.8
episode 400 average reward 9.347893101249573, ended at 43.0
episode 500 average reward 9.274125565179196, ended at 47.0
episode 600 average reward 9.327892341028315, ended at 50.8
episode 700 average reward 9.333155882643707, ended at 54.6
episode 800 average reward 9.523549421228681, ended at 58.6
episode 900 average reward 9.673708625666526, ended at 62.8
episode 1000 average reward 9.171220645574454, ended at 66.8


VBox(children=(Label(value=' 0.12MB of 0.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▅▅▅█▇▅█▄▄▅▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: dxt1ilae with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 192.58669977685597, ended at 13.1
episode 200 average reward 469.4668446959825, ended at 87.4
episode 300 average reward 475.17647647544663, ended at 164.0
episode 400 average reward 499.3220665771224, ended at 246.6
episode 500 average reward 499.99598627536005, ended at 329.5
episode 600 average reward 499.5273311234092, ended at 411.4
episode 700 average reward 499.9972015501042, ended at 493.4
episode 800 average reward 499.99998343169517, ended at 576.3
episode 900 average reward 499.9999999019065, ended at 659.6
episode 1000 average reward 488.47634487374404, ended at 736.9


VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,488.47634


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▄▆▇▇█▇▇██████████████████████████▇▇█


[34m[1mwandb[0m: Agent Starting Run: 49jei20x with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.503175513917999, ended at 4.6
episode 200 average reward 9.28025238561885, ended at 8.6
episode 300 average reward 9.457795838538287, ended at 12.5
episode 400 average reward 9.267436695735658, ended at 16.2
episode 500 average reward 9.350072575058684, ended at 20.0
episode 600 average reward 9.419099381343667, ended at 23.8
episode 700 average reward 9.462435083394912, ended at 27.6
episode 800 average reward 9.259286901606998, ended at 31.4
episode 900 average reward 9.110490440818939, ended at 35.1
episode 1000 average reward 9.545656879526401, ended at 39.2


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: u07f42ok with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 41.072570426407225, ended at 20.1
episode 200 average reward 15.37380393489714, ended at 25.9
episode 300 average reward 20.06836454039016, ended at 33.3
episode 400 average reward 14.49696762948873, ended at 40.0
episode 500 average reward 14.687133006036525, ended at 45.7
episode 600 average reward 20.070984474057717, ended at 53.0
episode 700 average reward 19.831476496197137, ended at 61.5
episode 800 average reward 14.732999032020622, ended at 69.3
episode 900 average reward 15.724782164344246, ended at 75.7
episode 1000 average reward 14.277034791170502, ended at 81.7


VBox(children=(Label(value=' 0.15MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,14.27703


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▆█▇▄▂▁▁▁▁▂▂▂▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: a9lnpv63 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 34.60357165447812, ended at 3.9
episode 200 average reward 31.86084503212698, ended at 11.8
episode 300 average reward 70.79124249967424, ended at 20.6
episode 400 average reward 418.5213972102402, ended at 61.6
episode 500 average reward 284.8852775620713, ended at 134.1
episode 600 average reward 482.1913434535629, ended at 191.9
episode 700 average reward 482.89749402167524, ended at 273.8
episode 800 average reward 322.1575960706421, ended at 343.7
episode 900 average reward 474.03308170845844, ended at 396.2
episode 1000 average reward 497.50058536469294, ended at 477.4


VBox(children=(Label(value=' 0.16MB of 0.16MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,497.50059


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▁▁▁▂▂▂▃▃▅▇▇█▇▄▄▆████████▅▄▃▆█████


[34m[1mwandb[0m: Agent Starting Run: zeybcgko with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.333089459216534, ended at 3.5
episode 200 average reward 9.500804592032203, ended at 7.4
episode 300 average reward 9.300743129962136, ended at 11.2
episode 400 average reward 9.277204483143562, ended at 15.2
episode 500 average reward 9.273707051150158, ended at 18.7
episode 600 average reward 9.327889863203778, ended at 22.8
episode 700 average reward 9.333155867973675, ended at 26.8
episode 800 average reward 9.523549421141825, ended at 30.7
episode 900 average reward 9.673708625666013, ended at 34.4
episode 1000 average reward 9.17122064557445, ended at 38.2


VBox(children=(Label(value=' 0.17MB of 0.17MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▇▃▃▂▂▄▃▄▃▂▂▃▃▄▃▃▂▁▂▁▂▂▁▃▃▄▃▄▄▅▅▄▂▃▆▃▃▄▁


[34m[1mwandb[0m: Agent Starting Run: t9d50kig with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 12.893334967249356, ended at 10.3
episode 200 average reward 9.521883129594075, ended at 14.4
episode 300 average reward 9.319142357687909, ended at 18.3
episode 400 average reward 9.277313416308946, ended at 22.2
episode 500 average reward 10.66777084956975, ended at 26.4
episode 600 average reward 9.706142623298804, ended at 30.6
episode 700 average reward 9.37801103247987, ended at 34.6
episode 800 average reward 9.523814987453964, ended at 38.5
episode 900 average reward 9.673710197959123, ended at 42.4
episode 1000 average reward 9.171220654883259, ended at 46.3


VBox(children=(Label(value=' 0.18MB of 0.18MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▅█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: slxm6ux9 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 56.89365867225397, ended at 6.5
episode 200 average reward 223.30983711091238, ended at 25.1
episode 300 average reward 492.4088425879189, ended at 102.2
episode 400 average reward 264.3433331411042, ended at 162.1
episode 500 average reward 285.95924170042474, ended at 216.4
episode 600 average reward 317.2330316745393, ended at 285.0
episode 700 average reward 396.1105594683263, ended at 353.4
episode 800 average reward 499.38491953164765, ended at 435.3
episode 900 average reward 235.90839297799414, ended at 503.1
episode 1000 average reward 485.5447567600625, ended at 561.7


VBox(children=(Label(value=' 0.20MB of 0.20MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,485.54476


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▂▂▃▄▇▇███▇▄▆▇▅▃▆▇█▅▇█▇▅▇██████▅▃▄▇█


[34m[1mwandb[0m: Agent Starting Run: djgtk0b1 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.500381438073829, ended at 5.0
episode 200 average reward 9.501795049081355, ended at 8.7
episode 300 average reward 9.300748993992038, ended at 12.5
episode 400 average reward 9.27720451786172, ended at 16.2
episode 500 average reward 9.273707051355705, ended at 19.8
episode 600 average reward 9.327889863204994, ended at 23.7
episode 700 average reward 9.333155867973684, ended at 27.9
episode 800 average reward 9.523549421141825, ended at 31.9
episode 900 average reward 9.673708625666013, ended at 35.8
episode 1000 average reward 9.17122064557445, ended at 39.8


VBox(children=(Label(value=' 0.21MB of 0.21MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▂▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: kug71qdx with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.523190242239878, ended at 5.1
episode 200 average reward 9.280370883402718, ended at 8.8
episode 300 average reward 9.457796540107879, ended at 12.7
episode 400 average reward 9.267436699889322, ended at 16.5
episode 500 average reward 9.350072575083274, ended at 20.1
episode 600 average reward 9.41909938134381, ended at 24.0
episode 700 average reward 9.462435083394913, ended at 27.7
episode 800 average reward 9.259286901606998, ended at 31.5
episode 900 average reward 9.110490440818939, ended at 35.2
episode 1000 average reward 9.545656879526401, ended at 39.2


VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: qpvg26xr with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 34.67965769865406, ended at 4.4
episode 200 average reward 49.61300443950531, ended at 11.5
episode 300 average reward 86.17515787012829, ended at 23.3
episode 400 average reward 206.58413452061168, ended at 49.3
episode 500 average reward 288.4915491079931, ended at 91.1
episode 600 average reward 370.7402011790107, ended at 146.4
episode 700 average reward 409.77566538410707, ended at 210.5
episode 800 average reward 483.96544571768766, ended at 288.3
episode 900 average reward 418.47903327345904, ended at 363.2
episode 1000 average reward 468.2820774586913, ended at 438.7


VBox(children=(Label(value=' 0.23MB of 0.23MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,468.28208


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▄▄▄▄▅▅▅▅▆▇▇▆▆███████▇▇▇██


[34m[1mwandb[0m: Agent Starting Run: wierexrm with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 46.25827293772249, ended at 16.2
episode 200 average reward 69.18364313519693, ended at 38.6
episode 300 average reward 79.91433047475194, ended at 71.6
episode 400 average reward 71.49007409605287, ended at 99.3
episode 500 average reward 22.89796709220868, ended at 117.6
episode 600 average reward 143.0764632694281, ended at 147.1
episode 700 average reward 105.10234763568131, ended at 171.0
episode 800 average reward 122.89886688418632, ended at 216.7
episode 900 average reward 70.14804588827496, ended at 247.5
episode 1000 average reward 68.49377202788752, ended at 281.9


VBox(children=(Label(value=' 0.24MB of 0.24MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,68.49377


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▃▃▄▃▄▄▅▅▄▅▄▃▅▅▄▂▂▁▁▄▆▇▃▁▄██▅▇▆▅▅▄▄▅▆▆


[34m[1mwandb[0m: Agent Starting Run: echj6i1r with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 122.01027883560441, ended at 34.6
episode 200 average reward 231.1507216096989, ended at 122.9
episode 300 average reward 197.9541862851296, ended at 158.3
episode 400 average reward 445.50367159051683, ended at 313.9
episode 500 average reward 329.4029745750011, ended at 417.1
episode 600 average reward 276.77482427168155, ended at 532.7
episode 700 average reward 80.78337786535678, ended at 576.5
episode 800 average reward 179.49133566408642, ended at 638.8
episode 900 average reward 221.92884686753195, ended at 730.5
episode 1000 average reward 64.2698435706412, ended at 757.6


VBox(children=(Label(value=' 0.25MB of 0.25MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,64.26984


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▃▄▄▅▄▂▁▁▇▇▇██▄█▅▅▅█▇▅▄▂▂▂▃▅▄▅▄▆▅▃▂▂▂


[34m[1mwandb[0m: Agent Starting Run: wq9q9j3x with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 30.38211429919399, ended at 3.8
episode 200 average reward 41.79590444258998, ended at 10.7
episode 300 average reward 72.43792488560705, ended at 21.7
episode 400 average reward 188.96625255646362, ended at 44.4
episode 500 average reward 186.51809577511565, ended at 73.5
episode 600 average reward 328.11205554390955, ended at 118.7
episode 700 average reward 335.209929190339, ended at 170.7
episode 800 average reward 455.6372946369706, ended at 228.2
episode 900 average reward 464.92469327786944, ended at 303.6
episode 1000 average reward 465.9622649427583, ended at 378.7


VBox(children=(Label(value=' 0.26MB of 0.26MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,465.96226


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▄▄▃▄▄▄▅▆▅▆▆▆▆▄▆▇████████


[34m[1mwandb[0m: Agent Starting Run: b8dhy77p with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 69.83664481509662, ended at 17.6
episode 200 average reward 119.30240193485693, ended at 63.0
episode 300 average reward 9.954445194763743, ended at 66.8
episode 400 average reward 9.288194532386932, ended at 70.8
episode 500 average reward 77.13526064525891, ended at 89.4
episode 600 average reward 22.21044066855543, ended at 121.4
episode 700 average reward 31.41750996519075, ended at 139.0
episode 800 average reward 127.98294448618935, ended at 175.0
episode 900 average reward 75.2586321135853, ended at 218.4
episode 1000 average reward 121.73107409099914, ended at 258.9


VBox(children=(Label(value=' 0.27MB of 0.27MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,121.73107


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▄▄▆▇▄▂▁▁▁▁▁▁▁▁▂▄▅▇▆▂▁▆▃▂▄▄▄▆█▇▆▄▄▅▅▆


[34m[1mwandb[0m: Agent Starting Run: n34xi4z6 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 46.63757869893002, ended at 17.6
episode 200 average reward 40.87746532975265, ended at 34.8
episode 300 average reward 248.89288246229162, ended at 135.1
episode 400 average reward 100.79177021743128, ended at 198.7
episode 500 average reward 66.622660205376, ended at 238.5
episode 600 average reward 213.56919457388125, ended at 289.3
episode 700 average reward 22.261323892390603, ended at 335.6
episode 800 average reward 9.335063096788463, ended at 339.4
episode 900 average reward 21.79471754363323, ended at 344.5
episode 1000 average reward 23.67594848106239, ended at 353.3


VBox(children=(Label(value=' 0.29MB of 0.29MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,23.67595


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▂▂▁▂▂▂▅█▅▆▄▆▅▂▃▄▃▂▂▃▆▆▆▃▂▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: fl9lokyj with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.5769989731767, ended at 3.4
episode 200 average reward 44.94425009217182, ended at 10.4
episode 300 average reward 73.04511659343743, ended at 20.3
episode 400 average reward 133.6858702331559, ended at 36.5
episode 500 average reward 227.2939008291899, ended at 69.2
episode 600 average reward 289.0243245706378, ended at 111.7
episode 700 average reward 377.6838756095947, ended at 169.3
episode 800 average reward 473.30581420493905, ended at 235.8
episode 900 average reward 453.25082668224326, ended at 310.0
episode 1000 average reward 482.6653971632261, ended at 386.9


VBox(children=(Label(value=' 0.30MB of 0.30MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,482.6654


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▄▄▄▄▄▅▅▅▆▇▆▆▇█▇▇██▇███


[34m[1mwandb[0m: Agent Starting Run: v0eyfic6 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 37.09965980312827, ended at 11.9
episode 200 average reward 42.68431260978861, ended at 28.6
episode 300 average reward 56.63013547076453, ended at 51.4
episode 400 average reward 51.585652585848905, ended at 73.4
episode 500 average reward 40.50234251782031, ended at 92.6
episode 600 average reward 63.030504443875714, ended at 110.0
episode 700 average reward 41.90582290789107, ended at 136.4
episode 800 average reward 9.451368927228556, ended at 140.1
episode 900 average reward 41.056238563016855, ended at 146.9
episode 1000 average reward 10.957688025478687, ended at 158.2


VBox(children=(Label(value=' 0.31MB of 0.31MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,10.95769


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▃▃▄▃▄▄▅▅▆▆▆▅▅▆▄▆▄▅▄▃▃▆█▇▇▇▂▁▁▁▁▁▁▃▇▃▂▁


[34m[1mwandb[0m: Agent Starting Run: e361vslw with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 113.70539469948703, ended at 25.3
episode 200 average reward 149.2658043092191, ended at 81.9
episode 300 average reward 10.128225894767711, ended at 86.0
episode 400 average reward 9.28210361903192, ended at 90.1
episode 500 average reward 9.273736056627337, ended at 94.1
episode 600 average reward 9.32789003493155, ended at 97.9
episode 700 average reward 9.333155868990394, ended at 101.8
episode 800 average reward 9.523549421147841, ended at 105.6
episode 900 average reward 9.673708625666047, ended at 109.4
episode 1000 average reward 9.17122064557445, ended at 113.3


VBox(children=(Label(value=' 0.32MB of 0.32MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▄▄▅█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: kfee9cgm with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 27.60171015832284, ended at 3.6
episode 200 average reward 48.314349591324486, ended at 10.8
episode 300 average reward 85.79787215521289, ended at 23.0
episode 400 average reward 171.57134602457216, ended at 46.4
episode 500 average reward 263.887313577537, ended at 84.2
episode 600 average reward 316.0244314417844, ended at 134.1
episode 700 average reward 318.9136223863668, ended at 184.6
episode 800 average reward 440.2199026209458, ended at 256.2
episode 900 average reward 444.35364687751263, ended at 330.5
episode 1000 average reward 472.276700588211, ended at 407.8


VBox(children=(Label(value=' 0.33MB of 0.33MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,472.2767


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▅▅▆▅▅▆▆▆▆▇▇▇█▇███████


[34m[1mwandb[0m: Agent Starting Run: 8awgc3zw with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 47.715767590985884, ended at 14.9
episode 200 average reward 171.47923328719796, ended at 54.0
episode 300 average reward 10.951760016342579, ended at 65.3
episode 400 average reward 9.28697937686264, ended at 69.3
episode 500 average reward 9.273764923694044, ended at 73.1
episode 600 average reward 9.32789020583986, ended at 76.9
episode 700 average reward 9.333155870002264, ended at 80.7
episode 800 average reward 9.523549421153833, ended at 84.8
episode 900 average reward 9.673708625666084, ended at 88.5
episode 1000 average reward 9.171220645574452, ended at 92.2


VBox(children=(Label(value=' 0.34MB of 0.34MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.17122


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▃▂▃▃▅█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: 4wu7hbko with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 125.09546608056392, ended at 29.9
episode 200 average reward 35.54004973859629, ended at 57.5
episode 300 average reward 9.61326773608664, ended at 61.5
episode 400 average reward 9.268357171648033, ended at 65.6
episode 500 average reward 9.350078024763222, ended at 69.6
episode 600 average reward 9.419099413608802, ended at 73.5
episode 700 average reward 9.462435083585943, ended at 77.4
episode 800 average reward 9.25928690160813, ended at 81.2
episode 900 average reward 9.110490440818948, ended at 84.9
episode 1000 average reward 9.545656879526401, ended at 88.7


VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.54566


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▄▆█▅▆▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: ufdxgh0v with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 17.210574874969442, ended at 3.2
episode 200 average reward 46.12642274130302, ended at 9.5
episode 300 average reward 58.001941068514945, ended at 18.8
episode 400 average reward 97.86579242952813, ended at 31.7
episode 500 average reward 195.5479709071277, ended at 60.1
episode 600 average reward 303.2644225272419, ended at 102.4
episode 700 average reward 337.10512808869197, ended at 153.2
episode 800 average reward 382.4008963052171, ended at 213.8
episode 900 average reward 450.81620199075525, ended at 286.2
episode 1000 average reward 474.72991254603903, ended at 358.5


VBox(children=(Label(value=' 0.37MB of 0.37MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,474.72991


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇██▆███


[34m[1mwandb[0m: Agent Starting Run: mq2eowpe with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 32.747246413469085, ended at 11.3
episode 200 average reward 43.69606097952905, ended at 28.2
episode 300 average reward 38.12724124639795, ended at 47.1
episode 400 average reward 68.0084489806531, ended at 67.5
episode 500 average reward 31.57480361939838, ended at 79.3
episode 600 average reward 45.05173409297923, ended at 93.5
episode 700 average reward 54.98299992313926, ended at 112.0
episode 800 average reward 67.10012976971754, ended at 137.8
episode 900 average reward 57.53844479195299, ended at 167.8
episode 1000 average reward 9.456390308153008, ended at 171.5


VBox(children=(Label(value=' 0.38MB of 0.38MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,9.45639


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▃▃▃▄▄▄▄▄▅▅▃▃▅▅▅▃▃▃▃▃▃▄▄▄▅▄▆▇▆▅▇▆█▆▂▁▁▁


[34m[1mwandb[0m: Agent Starting Run: 4tnpwvik with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 38.523661834179045, ended at 16.5
episode 200 average reward 60.601141992236805, ended at 42.1
episode 300 average reward 91.54610859431438, ended at 70.4
episode 400 average reward 25.545908499087634, ended at 95.7
episode 500 average reward 9.48795970324496, ended at 99.8
episode 600 average reward 9.40961182537226, ended at 103.7
episode 700 average reward 9.353320224449675, ended at 107.6
episode 800 average reward 9.523668804803547, ended at 111.5
episode 900 average reward 9.673709332480472, ended at 115.3
episode 1000 average reward 89.26889800049472, ended at 140.9


VBox(children=(Label(value=' 0.39MB of 0.39MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,89.2689


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▄▄▃▄▆▆▄▃▄▇▇▇▆▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇█


[34m[1mwandb[0m: Agent Starting Run: f4p75duf with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.050835135848303, ended at 3.7
episode 200 average reward 24.67468946471139, ended at 7.9
episode 300 average reward 23.803183447477178, ended at 11.8
episode 400 average reward 35.04639097414618, ended at 17.1
episode 500 average reward 26.373833500578225, ended at 22.2
episode 600 average reward 32.60948329601037, ended at 28.0
episode 700 average reward 29.717232130678326, ended at 33.9
episode 800 average reward 38.43679153461775, ended at 40.6
episode 900 average reward 33.47897763837417, ended at 46.6
episode 1000 average reward 41.34704296470216, ended at 53.1


VBox(children=(Label(value=' 0.40MB of 0.40MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,41.34704


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▃▃▃▃▄▃▃▃▄▄▄▆▆▆▆▄▄▄▅▅▄▅▇▅▅▅▇▆▆▇▅▆▅▆▆██


[34m[1mwandb[0m: Agent Starting Run: m8xqqan9 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 26.743953925601527, ended at 9.2
episode 200 average reward 36.64775153282794, ended at 23.1
episode 300 average reward 47.810457415685164, ended at 40.9
episode 400 average reward 53.343143358698335, ended at 61.1
episode 500 average reward 67.5828865413257, ended at 84.4
episode 600 average reward 84.31403781751814, ended at 114.8
episode 700 average reward 109.05464908484153, ended at 152.8
episode 800 average reward 168.63178670544028, ended at 209.8
episode 900 average reward 211.16905639873582, ended at 275.3
episode 1000 average reward 265.77129399306796, ended at 364.3


VBox(children=(Label(value=' 0.41MB of 0.41MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,265.77129


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▆▆▅▆▅▆▇███


[34m[1mwandb[0m: Agent Starting Run: 3g9whba1 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 38.15501592251266, ended at 11.1
episode 200 average reward 38.80373566993299, ended at 26.0
episode 300 average reward 61.68403684975861, ended at 51.0
episode 400 average reward 107.97630321196472, ended at 88.7
episode 500 average reward 278.66091705238347, ended at 180.1
episode 600 average reward 436.4912556319816, ended at 348.4
episode 700 average reward 450.0793813060592, ended at 512.3
episode 800 average reward 454.918264215987, ended at 686.4
episode 900 average reward 428.7678070555241, ended at 856.9
episode 1000 average reward 408.32894673075447, ended at 1023.5


VBox(children=(Label(value=' 0.42MB of 0.42MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,408.32895


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▅▆▆▇▇█▆▇▇▇▇▇████▆▇██▇▆


[34m[1mwandb[0m: Agent Starting Run: n5w5cu5z with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.266360583006353, ended at 3.6
episode 200 average reward 23.259316371670185, ended at 7.5
episode 300 average reward 24.839955436715613, ended at 11.9
episode 400 average reward 32.531364679994255, ended at 17.0
episode 500 average reward 28.861407932234933, ended at 21.9
episode 600 average reward 35.749750692359086, ended at 27.6
episode 700 average reward 30.888139437811102, ended at 32.7
episode 800 average reward 37.85783497507355, ended at 38.6
episode 900 average reward 36.526866319008235, ended at 44.6
episode 1000 average reward 35.26691964299096, ended at 50.5


VBox(children=(Label(value=' 0.43MB of 0.43MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,35.26692


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▃▄▃▄▄▄▅▄▅▅▆▇▅▅▅▅▆▆▆▆▄▆▅▆▇▇█▇▇▆▆▆▇▅█


[34m[1mwandb[0m: Agent Starting Run: xph4ijov with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 31.96830647653793, ended at 9.8
episode 200 average reward 40.86802447626798, ended at 24.3
episode 300 average reward 60.61839075154811, ended at 46.3
episode 400 average reward 99.21883755827737, ended at 79.3
episode 500 average reward 202.39739196388223, ended at 147.8
episode 600 average reward 228.049808319213, ended at 241.2
episode 700 average reward 279.8747962995883, ended at 366.3
episode 800 average reward 409.8470830043461, ended at 508.8
episode 900 average reward 340.9568965016593, ended at 647.5
episode 1000 average reward 492.05256803061064, ended at 830.4


VBox(children=(Label(value=' 0.44MB of 0.44MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,492.05257


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▄▄▅▄▄▄▆▆▆▅▆▆▇▇▇▆▅▇▇██


[34m[1mwandb[0m: Agent Starting Run: pxtyyr7m with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 27.091863087856204, ended at 9.9
episode 200 average reward 41.86861030768875, ended at 26.0
episode 300 average reward 84.6521880378273, ended at 53.9
episode 400 average reward 211.1939078159952, ended at 125.1
episode 500 average reward 441.60179895759387, ended at 271.8
episode 600 average reward 401.70923625216017, ended at 440.1
episode 700 average reward 482.657161324009, ended at 623.2
episode 800 average reward 469.539416691509, ended at 808.1
episode 900 average reward 427.4278091357393, ended at 976.8
episode 1000 average reward 489.6837032483525, ended at 1163.4


VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,489.6837


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▂▂▂▂▂▃▄▄▄▅▆▇█▇▇▇▇▇██████▇█▇▇██▇█


[34m[1mwandb[0m: Agent Starting Run: dvhn0ehq with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.542333028719188, ended at 3.7
episode 200 average reward 26.237669758165765, ended at 7.6
episode 300 average reward 23.65969986110769, ended at 11.7
episode 400 average reward 26.70549353031354, ended at 15.9
episode 500 average reward 25.747886086938035, ended at 20.1
episode 600 average reward 39.64394628908527, ended at 25.7
episode 700 average reward 27.38927729294732, ended at 30.8
episode 800 average reward 38.831532296301766, ended at 36.9
episode 900 average reward 37.43033153325631, ended at 42.5
episode 1000 average reward 33.40789797289935, ended at 48.8


VBox(children=(Label(value=' 0.47MB of 0.47MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,33.4079


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▂▃▃▃▄▄▄▅▄▄▄▄▄▅▄▅▃▄▄▆▆▆▇▆▄▅▆▆▇▇▇▆▆▇▇▆█▇


[34m[1mwandb[0m: Agent Starting Run: 83dwtnrx with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 33.64972047680542, ended at 10.5
episode 200 average reward 44.062011607441995, ended at 26.9
episode 300 average reward 49.417426365225346, ended at 45.3
episode 400 average reward 89.22727215414679, ended at 74.9
episode 500 average reward 240.69109261302455, ended at 143.6
episode 600 average reward 402.6024567502007, ended at 273.3
episode 700 average reward 406.0134825247458, ended at 420.1
episode 800 average reward 462.2042199305548, ended at 591.3
episode 900 average reward 467.7719575477086, ended at 760.1
episode 1000 average reward 467.0712729521209, ended at 937.7


VBox(children=(Label(value=' 0.48MB of 0.48MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,467.07127


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▄▄▅▄▆▆▆▆▇▇▇▇▇█▇▇█████▇


[34m[1mwandb[0m: Agent Starting Run: ntnfz7xt with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.380093864348993, ended at 8.4
episode 200 average reward 38.02889573053363, ended at 22.9
episode 300 average reward 54.85984844045243, ended at 41.9
episode 400 average reward 65.67039889996427, ended at 64.7
episode 500 average reward 172.48350905776056, ended at 117.2
episode 600 average reward 352.13346450419806, ended at 232.9
episode 700 average reward 481.3391669195163, ended at 406.7
episode 800 average reward 466.45633167765607, ended at 581.8
episode 900 average reward 422.7838349263696, ended at 759.6
episode 1000 average reward 426.0686570169458, ended at 942.8


VBox(children=(Label(value=' 0.49MB of 0.49MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,426.06866


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▅▆▆▇███▇▇████▇████


[34m[1mwandb[0m: Agent Starting Run: fx3ukzta with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.40851986739077, ended at 3.7
episode 200 average reward 26.997387765420026, ended at 7.8
episode 300 average reward 24.014236933673153, ended at 11.7
episode 400 average reward 29.581428796193087, ended at 16.7
episode 500 average reward 30.180769814859154, ended at 21.8
episode 600 average reward 35.69548490458764, ended at 27.4
episode 700 average reward 29.103082298995734, ended at 32.6
episode 800 average reward 32.694730029929715, ended at 38.6
episode 900 average reward 35.51744481520338, ended at 44.4
episode 1000 average reward 33.5195432679721, ended at 50.4


VBox(children=(Label(value=' 0.50MB of 0.50MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,33.51954


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▃▃▂▃▄▄▄▃▃▃▄▄▅▅▄▅▅▅▅▆▅▅▅▄▅▅▅▆▆▅▅▅▆▅▅█▆


[34m[1mwandb[0m: Agent Starting Run: ec2pcwtm with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 28.68342878481664, ended at 9.2
episode 200 average reward 36.794668210074526, ended at 23.6
episode 300 average reward 49.599988280606105, ended at 42.9
episode 400 average reward 103.23273232646788, ended at 74.4
episode 500 average reward 258.68165835724915, ended at 158.2
episode 600 average reward 251.55904580619531, ended at 262.1
episode 700 average reward 342.38723553754267, ended at 397.5
episode 800 average reward 422.30584292162604, ended at 551.6
episode 900 average reward 368.0249967749201, ended at 711.4
episode 1000 average reward 422.840181708476, ended at 884.5


VBox(children=(Label(value=' 0.51MB of 0.51MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,422.84018


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▄▄▅▄▅▅▅▄▆▇▇▅▇▇▇▇█▆▆███▇


[34m[1mwandb[0m: Agent Starting Run: bewylazi with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 31.98851680037106, ended at 9.9
episode 200 average reward 41.02703581637749, ended at 25.9
episode 300 average reward 64.90889579093653, ended at 48.3
episode 400 average reward 198.36820635201104, ended at 105.4
episode 500 average reward 369.2181272494577, ended at 224.5
episode 600 average reward 428.481977806825, ended at 394.2
episode 700 average reward 462.1323922529584, ended at 568.4
episode 800 average reward 477.52342086742266, ended at 763.6
episode 900 average reward 475.8789488685598, ended at 954.7
episode 1000 average reward 495.4699537696739, ended at 1144.7


VBox(children=(Label(value=' 0.52MB of 0.52MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,495.46995


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▄▄▅▅▇▇▇▇▇▆▇▇████████████


[34m[1mwandb[0m: Agent Starting Run: fkdid3kv with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.835746228419364, ended at 3.6
episode 200 average reward 21.564464392209693, ended at 7.3
episode 300 average reward 24.740427697846215, ended at 11.7
episode 400 average reward 28.563564878976926, ended at 16.4
episode 500 average reward 29.106458620803394, ended at 21.3
episode 600 average reward 34.51319052553222, ended at 27.1
episode 700 average reward 24.089056983963673, ended at 32.5
episode 800 average reward 30.0090317152531, ended at 37.7
episode 900 average reward 29.533120857465097, ended at 42.8
episode 1000 average reward 27.10184993102777, ended at 47.7


VBox(children=(Label(value=' 0.53MB of 0.53MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,27.10185


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▃▄▄▄▄▅▄▅▅▄▄▆▆▅▆▅▅▇█▆▇▆▅▅▆▆▆▅▅▇▆▆▆▇▆


[34m[1mwandb[0m: Agent Starting Run: oqwln2dv with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 36.1912659821589, ended at 11.0
episode 200 average reward 38.235719968948565, ended at 26.4
episode 300 average reward 52.354097243599874, ended at 44.6
episode 400 average reward 66.18701478761909, ended at 68.3
episode 500 average reward 157.2369763598375, ended at 117.4
episode 600 average reward 255.50123008643942, ended at 214.1
episode 700 average reward 392.045336894232, ended at 334.0
episode 800 average reward 443.5959385723148, ended at 488.9
episode 900 average reward 429.91257657319363, ended at 656.0
episode 1000 average reward 473.97592501118675, ended at 826.3


VBox(children=(Label(value=' 0.55MB of 0.55MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,473.97593


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▁▂▂▂▂▂▂▃▃▃▄▅▄▄▅▅▇▅▇▇▇▇▇█▇▇▇██


[34m[1mwandb[0m: Agent Starting Run: hq1okiqd with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 28.31874494572189, ended at 9.4
episode 200 average reward 35.63748914012515, ended at 22.4
episode 300 average reward 54.23400565792511, ended at 38.6
episode 400 average reward 59.23403156483906, ended at 59.7
episode 500 average reward 131.78719864216941, ended at 96.5
episode 600 average reward 267.9075446734763, ended at 174.1
episode 700 average reward 325.76508819944974, ended at 292.6
episode 800 average reward 390.6451226860015, ended at 447.3
episode 900 average reward 411.02290586108955, ended at 620.0
episode 1000 average reward 436.736953291413, ended at 787.4


VBox(children=(Label(value=' 0.56MB of 0.56MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,436.73695


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▅▅▅▅▆▇▇▇▇██▇█▇▇▇


[34m[1mwandb[0m: Agent Starting Run: 5s2xkx66 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.40166162151358, ended at 4.0
episode 200 average reward 35.257330867170815, ended at 9.4
episode 300 average reward 33.38825057097358, ended at 15.0
episode 400 average reward 43.05699158748565, ended at 21.6
episode 500 average reward 54.68050619669835, ended at 29.6
episode 600 average reward 54.641564764248486, ended at 38.6
episode 700 average reward 56.12632904069829, ended at 48.2
episode 800 average reward 73.42676270881765, ended at 58.7
episode 900 average reward 93.71616368607027, ended at 72.9
episode 1000 average reward 114.04393665519116, ended at 89.4


VBox(children=(Label(value=' 0.57MB of 0.57MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,114.04394


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▂▂▂▃▃▂▃▃▃▃▃▃▃▃▄▄▃▄▄▄▄▄▄▅▄▄▅▆▅▅▆▇▇▇██


[34m[1mwandb[0m: Agent Starting Run: jrv5woac with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 42.9364368627635, ended at 12.6
episode 200 average reward 45.81773243896867, ended at 29.8
episode 300 average reward 75.07302239150545, ended at 55.4
episode 400 average reward 77.5363170789726, ended at 85.7
episode 500 average reward 61.786272190713554, ended at 112.8
episode 600 average reward 106.67017440499467, ended at 151.9
episode 700 average reward 168.0467155354046, ended at 214.0
episode 800 average reward 196.20462825948917, ended at 309.7
episode 900 average reward 323.7048800038901, ended at 411.0
episode 1000 average reward 92.91590550510058, ended at 493.9


VBox(children=(Label(value=' 0.58MB of 0.58MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,92.91591


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▄▃▄▅▅▇▅▄▆▅▆██▄▃


[34m[1mwandb[0m: Agent Starting Run: 1dt1gxyl with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 42.450099722994175, ended at 12.8
episode 200 average reward 270.99631533971, ended at 62.4
episode 300 average reward 220.87054449492592, ended at 200.7
episode 400 average reward 286.1130729388295, ended at 270.8
episode 500 average reward 279.22203232911716, ended at 370.0
episode 600 average reward 395.77857077388524, ended at 489.6
episode 700 average reward 497.8778947003645, ended at 676.6
episode 800 average reward 488.13245402922695, ended at 823.7
episode 900 average reward 209.05470522152223, ended at 978.4
episode 1000 average reward 469.9778532520557, ended at 1146.6


VBox(children=(Label(value=' 0.59MB of 0.59MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,469.97785


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▃▅▆▇▇▃▃▃▃▆▄▄▅▆▃▅▆▇▇██▅▅▇███▇▄▅▇▇▇


[34m[1mwandb[0m: Agent Starting Run: bfxhpkyk with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.420808955703997, ended at 3.7
episode 200 average reward 25.040423933529688, ended at 8.0
episode 300 average reward 40.03281128066349, ended at 13.9
episode 400 average reward 40.121438735026906, ended at 20.6
episode 500 average reward 42.26586061575219, ended at 28.2
episode 600 average reward 48.23607639099737, ended at 35.6
episode 700 average reward 51.347497704012206, ended at 43.7
episode 800 average reward 52.94295905839381, ended at 52.3
episode 900 average reward 56.57020943904042, ended at 63.1
episode 1000 average reward 66.3405571062789, ended at 74.2


VBox(children=(Label(value=' 0.60MB of 0.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,66.34056


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▂▂▃▃▃▃▃▄▄▅▅▄▅▅▅▅▅▄▅▅▄▅▆▆▆▅▇▆▇▇█▇▇███


[34m[1mwandb[0m: Agent Starting Run: d51i0h2q with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 40.795411789311004, ended at 12.9
episode 200 average reward 161.78871181509413, ended at 49.9
episode 300 average reward 246.66747500799258, ended at 136.1
episode 400 average reward 241.05403644283948, ended at 237.5
episode 500 average reward 277.44267764972665, ended at 346.5
episode 600 average reward 223.52120768676897, ended at 456.2
episode 700 average reward 101.26110993036758, ended at 544.8
episode 800 average reward 325.587449830936, ended at 641.5
episode 900 average reward 228.74303929997114, ended at 689.6
episode 1000 average reward 290.7403224983433, ended at 773.2


VBox(children=(Label(value=' 0.61MB of 0.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,290.74032


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▂▂▃▅▄▅▆▆█▆▅▇▅▇▆▇█▇▆▆▇▇▅▂▅▆█▄▂▄▆▅▄▅▆


[34m[1mwandb[0m: Agent Starting Run: elk2ibvu with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 42.799593693704956, ended at 14.1
episode 200 average reward 241.98456874570994, ended at 89.6
episode 300 average reward 342.4845515430995, ended at 220.1
episode 400 average reward 414.7689448633455, ended at 398.3
episode 500 average reward 223.1935134892997, ended at 513.7
episode 600 average reward 482.5658232809221, ended at 685.4
episode 700 average reward 313.12201867852275, ended at 803.6
episode 800 average reward 437.6908404493459, ended at 972.3
episode 900 average reward 487.3425573791827, ended at 1152.9
episode 1000 average reward 393.4026333750314, ended at 1307.5


VBox(children=(Label(value=' 0.63MB of 0.63MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,393.40263


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▄▄▅▅▅▇▇▇██▇▆▅▅▅▆██▇▆▆▄▇▆██▇▇▇█▇█▆▆


[34m[1mwandb[0m: Agent Starting Run: nu1c6sx0 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.105988768533493, ended at 3.7
episode 200 average reward 30.54297363634793, ended at 8.9
episode 300 average reward 35.35343767401745, ended at 14.5
episode 400 average reward 42.19032784337175, ended at 21.3
episode 500 average reward 37.838601084909605, ended at 28.3
episode 600 average reward 48.08426569028521, ended at 35.9
episode 700 average reward 46.63161622244637, ended at 44.3
episode 800 average reward 49.23942591872784, ended at 52.6
episode 900 average reward 57.52952782787245, ended at 63.3
episode 1000 average reward 92.42913285800026, ended at 75.1


VBox(children=(Label(value=' 0.64MB of 0.64MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,92.42913


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▃▄▄▄▅▄▅▅▅▄▅▅▆▆▆▅▆▆▅█


[34m[1mwandb[0m: Agent Starting Run: hbxh3xbi with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 40.498619910654796, ended at 14.5
episode 200 average reward 163.04675908579583, ended at 57.5
episode 300 average reward 81.25234238984358, ended at 107.6
episode 400 average reward 35.606845490299676, ended at 134.8
episode 500 average reward 9.61764315043371, ended at 138.9
episode 600 average reward 9.433030505149507, ended at 142.8
episode 700 average reward 10.548500918758908, ended at 147.0
episode 800 average reward 10.417836690733862, ended at 153.5
episode 900 average reward 11.499910570019278, ended at 159.4
episode 1000 average reward 12.023892533015617, ended at 163.6


VBox(children=(Label(value=' 0.65MB of 0.65MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,12.02389


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▂▂▄▆▆█▃▅▄▃▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Agent Starting Run: u653n4hy with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 41.83415586101572, ended at 13.6
episode 200 average reward 168.3917690718862, ended at 53.9
episode 300 average reward 357.6366397058744, ended at 169.6
episode 400 average reward 283.526893464888, ended at 292.9
episode 500 average reward 399.32657948962304, ended at 444.9
episode 600 average reward 280.61892206148013, ended at 595.1
episode 700 average reward 435.0367332512819, ended at 784.1
episode 800 average reward 475.7599882409037, ended at 973.4
episode 900 average reward 484.9400534165943, ended at 1167.1
episode 1000 average reward 411.72279552406286, ended at 1308.4


VBox(children=(Label(value=' 0.66MB of 0.66MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,411.7228


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▂▄▅▄▅▆▅▅▆▅▅▆▅▇██▄▇▇███▇█▇████▆▅▆▇


[34m[1mwandb[0m: Agent Starting Run: y1hirlbj with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.598346636496373, ended at 3.8
episode 200 average reward 33.33105491215636, ended at 8.6
episode 300 average reward 28.270492333612722, ended at 14.3
episode 400 average reward 37.403144162184674, ended at 20.9
episode 500 average reward 40.527145026056175, ended at 27.6
episode 600 average reward 42.305708705143076, ended at 35.1
episode 700 average reward 52.901163673429735, ended at 43.2
episode 800 average reward 52.49691648284391, ended at 51.7
episode 900 average reward 50.45363565473805, ended at 61.1
episode 1000 average reward 61.93420522856127, ended at 71.4


VBox(children=(Label(value=' 0.67MB of 0.67MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,61.93421


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▂▂▃▄▄▄▃▄▃▅▅▄▅▆▄▅▅▆▆▆▅▇▆▆▆▆▇▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ev3od3of with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 44.35171042731204, ended at 14.2
episode 200 average reward 170.35213253670358, ended at 66.7
episode 300 average reward 30.04216076313636, ended at 105.8
episode 400 average reward 186.5816327706037, ended at 174.5
episode 500 average reward 12.038419307702453, ended at 191.7
episode 600 average reward 9.435343823887694, ended at 195.5
episode 700 average reward 9.464719574543546, ended at 199.7
episode 800 average reward 9.259300427003597, ended at 203.5
episode 900 average reward 9.257430598824103, ended at 207.4
episode 1000 average reward 182.18721844648124, ended at 273.9


VBox(children=(Label(value=' 0.68MB of 0.68MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,182.18722


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▂▂▃▅▇▇▆▅▂▂▄█▇▇▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▇█


[34m[1mwandb[0m: Agent Starting Run: 9ashjg2j with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 41.44986409284626, ended at 13.2
episode 200 average reward 156.94093263444944, ended at 55.3
episode 300 average reward 253.39998355552981, ended at 153.1
episode 400 average reward 413.1393474766169, ended at 307.8
episode 500 average reward 491.0743437772966, ended at 497.7
episode 600 average reward 431.2627806645897, ended at 643.7
episode 700 average reward 472.49987568106434, ended at 779.1
episode 800 average reward 474.3257330183216, ended at 968.5
episode 900 average reward 484.10862893943823, ended at 1149.9
episode 1000 average reward 453.579861597036, ended at 1341.5


VBox(children=(Label(value=' 0.69MB of 0.69MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,453.57986


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▃▃▅▅▃▆▅▅▇▇████▇▄▇▅▄▆▇████▇▇▇█████


[34m[1mwandb[0m: Agent Starting Run: vfp1mo59 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.79077052684506, ended at 3.7
episode 200 average reward 29.282135558207216, ended at 8.7
episode 300 average reward 30.294178147047347, ended at 13.6
episode 400 average reward 30.766700527129466, ended at 19.2
episode 500 average reward 37.84699860875484, ended at 25.9
episode 600 average reward 44.11621425631471, ended at 33.0
episode 700 average reward 45.87303903343643, ended at 40.4
episode 800 average reward 58.33337397477186, ended at 49.5
episode 900 average reward 54.6658304246369, ended at 58.7
episode 1000 average reward 63.897200078864365, ended at 69.4


VBox(children=(Label(value=' 0.70MB of 0.70MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,63.8972


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▂▂▃▃▃▂▃▃▃▃▄▃▃▄▄▄▅▅▅▅▅▄▆▆▅▅▆▆▆▅▅▆▇▇█▇


[34m[1mwandb[0m: Agent Starting Run: 8ktx081k with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 42.898814271476276, ended at 15.7
episode 200 average reward 72.64900070131847, ended at 40.7
episode 300 average reward 185.80044956013705, ended at 91.9
episode 400 average reward 158.89370383331647, ended at 148.1
episode 500 average reward 140.82665438241247, ended at 177.7
episode 600 average reward 20.807708257351777, ended at 208.9
episode 700 average reward 33.17528687435159, ended at 217.0
episode 800 average reward 57.010949426516845, ended at 249.8
episode 900 average reward 9.51147261913468, ended at 253.8
episode 1000 average reward 120.89999529845227, ended at 310.1


VBox(children=(Label(value=' 0.72MB of 0.72MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,120.9


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▂▂▂▂▂▃▃▄▄▅▄▂▆▃▂▂▃▆▃▂▁▁▁▁▁▂▃▃▃▁▁▁▁▁▁█▅


[34m[1mwandb[0m: Agent Starting Run: in67o2pi with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 43.254833243019014, ended at 13.9
episode 200 average reward 129.3957481479408, ended at 54.0
episode 300 average reward 190.52540465953513, ended at 139.2
episode 400 average reward 207.14632256054358, ended at 211.7
episode 500 average reward 185.8649158954531, ended at 289.1
episode 600 average reward 229.94071004729634, ended at 387.2
episode 700 average reward 470.9095792608399, ended at 546.6
episode 800 average reward 375.6920798327207, ended at 702.6
episode 900 average reward 130.50035408209285, ended at 742.5
episode 1000 average reward 133.19863375297868, ended at 820.7


VBox(children=(Label(value=' 0.73MB of 0.73MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,133.19863


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▃▃▄▅▄▄▂▃▅▄▃▄▄▃▄▆▄▅▆▆▇█▇▆▇▄▂▂▃▅▅▃▃


[34m[1mwandb[0m: Agent Starting Run: ei3e1egg with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.48515725792664, ended at 4.1
episode 200 average reward 23.100567082392992, ended at 8.0
episode 300 average reward 21.30208711111309, ended at 12.0
episode 400 average reward 23.972219149802292, ended at 16.3
episode 500 average reward 21.309510547160706, ended at 20.2
episode 600 average reward 23.777047617785147, ended at 24.0
episode 700 average reward 27.978847167239206, ended at 28.7
episode 800 average reward 25.620942660794828, ended at 32.6
episode 900 average reward 26.416948570036585, ended at 36.8
episode 1000 average reward 27.262353939466216, ended at 41.1


VBox(children=(Label(value=' 0.74MB of 0.74MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,27.26235


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▅▅▆▆▆▅▆▅▅▄▄▅▆▄▅▆▄▄▅▅▅▄▄▅▅▆█▆▅▅▅▅▆▆▆▆▇▆▆


[34m[1mwandb[0m: Agent Starting Run: az4lg9q5 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.596019144102552, ended at 8.2
episode 200 average reward 26.897009074706226, ended at 18.0
episode 300 average reward 27.822036552557524, ended at 28.5
episode 400 average reward 27.065444119832975, ended at 38.8
episode 500 average reward 24.359989485867946, ended at 49.1
episode 600 average reward 32.11447252531298, ended at 60.8
episode 700 average reward 31.724396548868704, ended at 72.6
episode 800 average reward 35.830028210186505, ended at 84.7
episode 900 average reward 30.504526950616654, ended at 97.0
episode 1000 average reward 36.561511008585654, ended at 110.7


VBox(children=(Label(value=' 0.75MB of 0.75MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,36.56151


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▃▃▅▆▅▄▄▅▅▄▅▅▄▅▅▅▆▅▆▆▅▇▇▅▆▆▆▆▆▇▇▇▇▆█


[34m[1mwandb[0m: Agent Starting Run: ydgbgo00 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.173701670904638, ended at 8.0
episode 200 average reward 25.971801284183833, ended at 18.0
episode 300 average reward 25.88189243663077, ended at 28.4
episode 400 average reward 31.462216516157714, ended at 39.7
episode 500 average reward 29.61926673028498, ended at 50.9
episode 600 average reward 35.372143895526925, ended at 64.7
episode 700 average reward 29.489571881846995, ended at 76.6
episode 800 average reward 33.9600792627704, ended at 90.8
episode 900 average reward 46.8446961109924, ended at 106.6
episode 1000 average reward 38.51481422216756, ended at 122.3


VBox(children=(Label(value=' 0.76MB of 0.76MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,38.51481


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▂▃▃▃▄▄▃▃▃▄▄▄▅▄▅▄▄▅▆▆▆▅▅▄▄▅▇▆▆▆▅▇█▇▇█▇


[34m[1mwandb[0m: Agent Starting Run: npshjsfe with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.48515725792664, ended at 4.1
episode 200 average reward 20.96412267474305, ended at 8.1
episode 300 average reward 23.299971185517595, ended at 12.1
episode 400 average reward 25.56643901189631, ended at 16.9
episode 500 average reward 21.185982598309625, ended at 20.9
episode 600 average reward 22.269245308578707, ended at 25.2
episode 700 average reward 24.13456784410985, ended at 29.4
episode 800 average reward 21.014840182828856, ended at 33.0
episode 900 average reward 24.56808227687218, ended at 37.2
episode 1000 average reward 29.601827037275225, ended at 41.7


VBox(children=(Label(value=' 0.77MB of 0.77MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,29.60183


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▅▅▆▆▆▆▅▅▅▄▆▅▆▇█▆▅▄▅▄▅▅▅▅▆▅▆▄▅▄▅▅▅▇▆▅▆▆█


[34m[1mwandb[0m: Agent Starting Run: n23wjm5s with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.42015756927228, ended at 8.4
episode 200 average reward 24.786149909130753, ended at 18.1
episode 300 average reward 21.657676691929918, ended at 26.8
episode 400 average reward 30.017138174390883, ended at 38.3
episode 500 average reward 29.109269687813878, ended at 49.6
episode 600 average reward 32.07955057683534, ended at 62.9
episode 700 average reward 30.08781300058016, ended at 75.0
episode 800 average reward 37.16993142780567, ended at 88.8
episode 900 average reward 37.01577844147952, ended at 103.2
episode 1000 average reward 39.34127584414019, ended at 118.1


VBox(children=(Label(value=' 0.78MB of 0.78MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,39.34128


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▄▄▄▄▄▄▄▄▅▄▅▅▅▅▅▆▆▇▆▆▆▅▆▆▇▆██▆▇▇▇▇█▇


[34m[1mwandb[0m: Agent Starting Run: wj931bd9 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.735597321929493, ended at 8.6
episode 200 average reward 27.572853590415576, ended at 20.1
episode 300 average reward 22.99718859938913, ended at 29.8
episode 400 average reward 29.423661941430243, ended at 41.1
episode 500 average reward 37.37785176882031, ended at 54.0
episode 600 average reward 33.36831233801055, ended at 67.2
episode 700 average reward 40.30938806591333, ended at 81.7
episode 800 average reward 31.46005364976239, ended at 96.9
episode 900 average reward 37.51461714277432, ended at 113.1
episode 1000 average reward 40.701723852219786, ended at 127.8


VBox(children=(Label(value=' 0.79MB of 0.79MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,40.70172


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▂▂▃▄▄▆▄▄▄▅▄▄▄▅▅▅▅▆▅▆▆▆▆▅▆▇▇▇█▆▇▇█▇▇▇▇▇


[34m[1mwandb[0m: Agent Starting Run: 9oh778hs with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.48515725792664, ended at 4.1
episode 200 average reward 20.96412267474305, ended at 8.2
episode 300 average reward 23.312111994399093, ended at 12.2
episode 400 average reward 21.051231041452596, ended at 16.0
episode 500 average reward 21.614719960537375, ended at 20.1
episode 600 average reward 21.018537301269262, ended at 24.0
episode 700 average reward 23.302179933361412, ended at 27.8
episode 800 average reward 22.63927243269886, ended at 31.9
episode 900 average reward 24.66898278728141, ended at 36.7
episode 1000 average reward 23.666864577512733, ended at 40.7


VBox(children=(Label(value=' 0.81MB of 0.81MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,23.66686


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▅▅▆▆▆▆▄▅▅▄▆▅▄▄▅▆▆▄▅▅▅▄▃▄▅▅▅▆▅▇▅█▆█▆▆▇▅▅


[34m[1mwandb[0m: Agent Starting Run: vsvcgbai with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.390917129432715, ended at 8.3
episode 200 average reward 23.01289888274801, ended at 17.0
episode 300 average reward 25.70761252459187, ended at 27.2
episode 400 average reward 26.25650942614992, ended at 37.6
episode 500 average reward 27.129276481152896, ended at 48.9
episode 600 average reward 25.621595386386918, ended at 59.1
episode 700 average reward 31.35132976459469, ended at 71.1
episode 800 average reward 31.988246161858758, ended at 83.0
episode 900 average reward 37.26884675444663, ended at 96.5
episode 1000 average reward 34.605687115871454, ended at 109.4


VBox(children=(Label(value=' 0.82MB of 0.82MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,34.60569


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▄▄▃▄▄▅▄▄▄▆▅▅▄▅▆▇▅▄▅▆▅▆▅█▆▇▆▇▇▇███▇▆▆


[34m[1mwandb[0m: Agent Starting Run: ff7zdbtn with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.390917129432715, ended at 9.1
episode 200 average reward 21.846203407743634, ended at 18.1
episode 300 average reward 27.182691014401236, ended at 28.4
episode 400 average reward 30.010212993184094, ended at 39.2
episode 500 average reward 26.157261975213675, ended at 50.8
episode 600 average reward 32.197702268775906, ended at 63.3
episode 700 average reward 28.646937064203463, ended at 75.0
episode 800 average reward 35.9575302932245, ended at 88.8
episode 900 average reward 30.91525616561081, ended at 100.6
episode 1000 average reward 33.98612278210142, ended at 115.2


VBox(children=(Label(value=' 0.83MB of 0.83MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,33.98612


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▃▃▄▃▅▄▄▄▄▄▅▆▆▅▆▅▆▅▇▇▄▆▆▅▆▇▆▆▆▅▆▇███


[34m[1mwandb[0m: Agent Starting Run: ruuby1p5 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.48515725792664, ended at 4.5
episode 200 average reward 20.96412267474305, ended at 8.5
episode 300 average reward 23.312111994399093, ended at 12.5
episode 400 average reward 19.831391004588195, ended at 16.2
episode 500 average reward 23.420234623949156, ended at 20.1
episode 600 average reward 19.913956517260605, ended at 23.7
episode 700 average reward 21.668800948803288, ended at 28.0
episode 800 average reward 22.62960196469291, ended at 32.1
episode 900 average reward 24.668961877276562, ended at 36.6
episode 1000 average reward 23.15590204358393, ended at 41.3


VBox(children=(Label(value=' 0.84MB of 0.84MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,23.1559


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▅▅▆▆▆▆▄▅▅▄▆▅▅▅▅▃▅▅▆▅▅▅▄▅▅▆▆▆▅▇▄█▆█▆▆▆▆▆


[34m[1mwandb[0m: Agent Starting Run: nw55qewg with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.390917129432715, ended at 8.3
episode 200 average reward 29.27770400118239, ended at 17.6
episode 300 average reward 22.518905358351933, ended at 27.1
episode 400 average reward 29.78548799640095, ended at 37.3
episode 500 average reward 27.32020771960047, ended at 48.2
episode 600 average reward 29.77530514017501, ended at 59.8
episode 700 average reward 30.436379585485824, ended at 71.6
episode 800 average reward 36.884063807908134, ended at 84.9
episode 900 average reward 30.37698643525052, ended at 97.2
episode 1000 average reward 35.898972839000315, ended at 110.6


VBox(children=(Label(value=' 0.85MB of 0.85MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,35.89897


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▃▃▃▅▅▄▄▃▅▄▄▄▅▅▆▅▄▆▆▆▆▆▅▅▇▆█▆█▇▇▅██▆▆


[34m[1mwandb[0m: Agent Starting Run: gcb317zc with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.380439232996046, ended at 9.1
episode 200 average reward 25.553984765192144, ended at 18.0
episode 300 average reward 21.8071099440486, ended at 27.4
episode 400 average reward 31.097718712024434, ended at 37.8
episode 500 average reward 29.38776374646283, ended at 49.5
episode 600 average reward 29.114997937605466, ended at 61.4
episode 700 average reward 30.459020421007242, ended at 73.2
episode 800 average reward 35.34184343353664, ended at 87.2
episode 900 average reward 37.37671066314375, ended at 101.5
episode 1000 average reward 36.767288772538485, ended at 116.4


VBox(children=(Label(value=' 0.86MB of 0.86MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,36.76729


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▂▃▃▃▃▄▅▄▃▄▄▄▅▅▅▄▆▅▆▆▆▄▅▅▅▆▆▇▇▇▆▇▇▇█▆▇▇


[34m[1mwandb[0m: Agent Starting Run: dgkwv973 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 24.48515725792664, ended at 4.3
episode 200 average reward 20.96412267474305, ended at 8.3
episode 300 average reward 21.58961093252197, ended at 12.4
episode 400 average reward 20.033937765302007, ended at 16.1
episode 500 average reward 25.48199208163279, ended at 20.3
episode 600 average reward 21.027670453458132, ended at 24.3
episode 700 average reward 24.1503463316438, ended at 28.5
episode 800 average reward 21.226000651526153, ended at 32.1
episode 900 average reward 29.40274707077755, ended at 36.6
episode 1000 average reward 20.781438263636538, ended at 40.7


VBox(children=(Label(value=' 0.87MB of 0.87MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,20.78144


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▄▄▆▆▅▅▄▄▅▄▆▅▄▃▄▄▄▄▆▆▅▆▄▅▅▅▅▄▄▅▄▅▅▆█▆▅▅▄


[34m[1mwandb[0m: Agent Starting Run: y0uu8z2o with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.380439232996046, ended at 8.5
episode 200 average reward 29.318730320788553, ended at 18.3
episode 300 average reward 27.18494408681256, ended at 28.7
episode 400 average reward 28.566844449082673, ended at 39.3
episode 500 average reward 29.474417699594404, ended at 50.1
episode 600 average reward 27.407775186256494, ended at 60.9
episode 700 average reward 22.676826936788416, ended at 71.2
episode 800 average reward 28.962099815702572, ended at 83.3
episode 900 average reward 30.126960052757905, ended at 96.2
episode 1000 average reward 25.32554201042151, ended at 108.1


VBox(children=(Label(value=' 0.89MB of 0.89MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,25.32554


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▃▄▃▃▅▅▆▅▄▅▅▄▅▆▄▆▅▅▅▅▅▅▅▆▄▄▅▇▆▆▇▇▇▇█▆▅


[34m[1mwandb[0m: Agent Starting Run: p97sg03i with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.380439232996046, ended at 8.7
episode 200 average reward 21.88722972734979, ended at 17.2
episode 300 average reward 25.204921095514962, ended at 26.5
episode 400 average reward 27.633225014829705, ended at 37.3
episode 500 average reward 27.073700744034298, ended at 48.5
episode 600 average reward 30.218037355586013, ended at 60.2
episode 700 average reward 29.95104737262775, ended at 72.5
episode 800 average reward 29.433896847502538, ended at 84.0
episode 900 average reward 27.892396970174044, ended at 96.1
episode 1000 average reward 30.014613245992546, ended at 107.4


VBox(children=(Label(value=' 0.90MB of 0.90MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,1000.0
running_score,30.01461


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▄▃▃▄▄▄▅▄▅▄▅▅▄▆▅▆▇▆▆▅▆█▆▇▇▆▆▆▅▆▇█▇▇▆▇▆▇█


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# You can see the results here!
[Report Link](https://wandb.ai/ko120/REINFORCE_Baseline/reports/REINFORCE-with-Baseline-forward-and-backward--Vmlldzo4NzM4ODE)