In [1]:
import pdb
import math
import random

import gym
import numpy as np
import reward as tr
import reward.utils as U

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from collections import defaultdict

import reward as tr

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Step budget handed to `batcher.get_batches` in the training cell (kept as a float literal).
MAX_STEPS  = 4e6
# Max-norm argument passed to `clip_grad_norm_` for the Q, V and policy networks in `soft_q_update`.
CLIP_GRAD = 0.5

<h2>Use CUDA</h2>

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [5]:
def plot(frame_idx, rewards):
    """Redraw the reward curve in the current notebook output cell.

    Args:
        frame_idx: Frame/step counter shown in the figure title.
        rewards: Non-empty sequence of rewards; the last entry is shown in the title.
    """
    # Wipe the previous figure first so the cell shows a single, live-updating plot.
    clear_output(True)
    figure = plt.figure(figsize=(20, 5))
    # Left-most panel of a 1x3 grid, matching the original layout.
    axes = figure.add_subplot(1, 3, 1)
    axes.set_title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    axes.plot(rewards)
    plt.show()

In [6]:
def smooth(y, box_pts):
    """Apply a moving-average (box) filter to a 1-D sequence.

    Args:
        y: 1-D sequence of numbers to smooth.
        box_pts: Width of the averaging window, in samples.

    Returns:
        The result of ``np.convolve(..., mode='same')`` with a uniform window
        whose weights sum to one.
    """
    # Uniform window: every tap weighs 1 / box_pts.
    window = np.divide(np.ones(box_pts), box_pts)
    return np.convolve(y, window, mode='same')

<h1>Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor</h1>
<h2><a href="https://arxiv.org/abs/1801.01290">Arxiv</a></h2>

In [7]:
class ValueNetwork(nn.Module):
    """State-value approximator: two ReLU hidden layers followed by a scalar head.

    Args:
        state_dim: Size of the state vector fed to the first layer.
        hidden_dim: Width of both hidden layers.
        init_w: Half-width of the uniform range used to initialise the output layer.
    """

    def __init__(self, state_dim, hidden_dim, init_w=3e-3):
        super(ValueNetwork, self).__init__()

        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Small uniform init on the head only (weight first, then bias).
        for head_param in (self.linear3.weight, self.linear3.bias):
            head_param.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return the value estimate for ``state``; the last dimension of the output is 1."""
        hidden = state
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)
        
        
class SoftQNetwork(nn.Module):
    """Action-value approximator over concatenated (state, action) inputs.

    Args:
        num_inputs: Size of the state vector.
        num_actions: Size of the action vector.
        hidden_size: Width of both hidden layers.
        init_w: Half-width of the uniform range used to initialise the output layer.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(SoftQNetwork, self).__init__()

        joint_dim = num_inputs + num_actions
        self.linear1 = nn.Linear(joint_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Small uniform init on the head only (weight first, then bias).
        for head_param in (self.linear3.weight, self.linear3.bias):
            head_param.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return the Q estimate for each (state, action) row; output has a trailing dimension of 1."""
        # State and action are joined along dim 1, so both must be 2-D with matching batch size.
        hidden = torch.cat((state, action), dim=1)
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)
        
        
class PolicyNetwork(nn.Module):
    """Gaussian policy whose samples are squashed through ``tanh``.

    A shared two-layer ReLU trunk feeds two linear heads: one for the mean and
    one for the log standard deviation of a diagonal Normal distribution.

    Args:
        num_inputs: Size of the state vector.
        num_actions: Size of the action vector.
        hidden_size: Width of both hidden layers.
        init_w: Half-width of the uniform range used to initialise both heads.
        log_std_min: Lower clamp applied to the predicted log-std.
        log_std_max: Upper clamp applied to the predicted log-std.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_min=-20, log_std_max=2):
        super(PolicyNetwork, self).__init__()
        
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        
        self.mean_linear = nn.Linear(hidden_size, num_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)
        
        self.log_std_linear = nn.Linear(hidden_size, num_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)
        
    def forward(self, state):
        """Return ``(mean, log_std)`` of the pre-squash Normal for ``state``.

        ``log_std`` is clamped to ``[log_std_min, log_std_max]``.
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        
        mean    = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
        
        return mean, log_std
    
    def evaluate(self, state, epsilon=1e-6):
        """Sample a squashed action and compute its log-probability.

        Args:
            state: Batch of states accepted by :meth:`forward`.
            epsilon: Constant added inside the log of the tanh correction to
                keep it finite when ``|action|`` approaches 1.

        Returns:
            Tuple ``(action, log_prob, z, mean, log_std)`` where ``z`` is the
            pre-squash sample, ``action = tanh(z)`` and ``log_prob`` is summed
            over the last dimension (kept as a size-1 dimension).
        """
        mean, log_std = self.forward(state)
        std = log_std.exp()
        
        normal = Normal(mean, std)
        # `sample()` (not `rsample()`) is used, so no gradient flows through z;
        # the policy parameters are reached only via `log_prob`'s dependence on mean/std.
        z = normal.sample()
        action = torch.tanh(z)
        
        # Change-of-variables correction for the tanh squashing.
        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        
        return action, log_prob, z, mean, log_std
        
    
    def get_action(self, state, step):
        """Sample an action for interacting with the environment.

        Args:
            state: Tensor of states accepted by :meth:`forward`.
            step: Accepted for compatibility with the caller's
                ``select_action_fn`` signature; currently unused.

        Returns:
            The tanh-squashed action as a NumPy array on the CPU.
        """
        # Acting never needs gradients: skip building an autograd graph that
        # would be thrown away immediately.
        with torch.no_grad():
            mean, log_std = self.forward(state)
            std = log_std.exp()

            normal = Normal(mean, std)
            z      = normal.sample()
            action = torch.tanh(z)

        return action.cpu().numpy()

In [8]:
def soft_q_update(batch, 
           gamma=0.99,
           mean_lambda=1e-3,
           std_lambda=1e-3,
           z_lambda=0.0,
           soft_tau=1e-2,
           frame_idx=0,
           logger=None,            
          ):
    """Run one Soft Actor-Critic optimisation step on a replay batch.

    Updates the module-level ``soft_q_net``, ``value_net`` and ``policy_net``
    with their respective optimizers (gradients clipped to ``CLIP_GRAD``), then
    moves ``target_value_net`` towards ``value_net`` by Polyak averaging.

    Args:
        batch: Object exposing ``state_t``, ``action``, ``reward``,
            ``state_tp1`` and ``done`` tensors.
            NOTE(review): ``reward`` and ``done`` are presumably shaped like the
            value output (trailing dimension of 1); otherwise the TD target
            would broadcast. Confirm against the batcher.
        gamma: Discount factor used in the Q target.
        mean_lambda: Weight of the squared-mean regulariser on the policy.
        std_lambda: Weight of the squared-log-std regulariser on the policy.
        z_lambda: Weight of the squared pre-squash-sample regulariser.
        soft_tau: Interpolation factor for the target value network update.
        frame_idx: Current step counter; metrics are recorded only when it is a
            multiple of 1000 (the default 0 therefore records on every call).
        logger: Object with ``add_log`` and ``add_histogram`` methods, or
            ``None`` to disable metric recording.
    """
    state, action, reward, next_state, done = batch.state_t, batch.action, batch.reward, batch.state_tp1, batch.done

    expected_q_value = soft_q_net(state, action)
    expected_value   = value_net(state)
    new_action, log_prob, z, mean, log_std = policy_net.evaluate(state)

    # Q target: bootstrapped from the slowly-moving target value network.
    target_value = target_value_net(next_state)
    next_q_value = reward + (1 - done) * gamma * target_value
    q_value_loss = soft_q_criterion(expected_q_value, next_q_value.detach())

    # V target: Q of a freshly sampled action minus its log-probability.
    expected_new_q_value = soft_q_net(state, new_action)
    next_value = expected_new_q_value - log_prob
    value_loss = value_criterion(expected_value, next_value.detach())

    # Policy loss: the bracketed term is detached, so the gradient reaches the
    # policy only through `log_prob`.
    log_prob_target = expected_new_q_value - expected_value
    policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

    # Regularisers on the policy's outputs.
    mean_loss = mean_lambda * mean.pow(2).mean()
    std_loss  = std_lambda  * log_std.pow(2).mean()
    z_loss    = z_lambda    * z.pow(2).sum(1).mean()

    policy_loss += mean_loss + std_loss + z_loss

    # clip_grad_norm_ returns the total gradient norm measured before clipping.
    soft_q_optimizer.zero_grad()
    q_value_loss.backward()
    q_grad = torch.nn.utils.clip_grad_norm_(soft_q_net.parameters(), CLIP_GRAD)
    soft_q_optimizer.step()

    value_optimizer.zero_grad()
    value_loss.backward()
    v_grad = torch.nn.utils.clip_grad_norm_(value_net.parameters(), CLIP_GRAD)
    value_optimizer.step()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    p_grad = torch.nn.utils.clip_grad_norm_(policy_net.parameters(), CLIP_GRAD)
    policy_optimizer.step()

    # Polyak (soft) update of the target value network.
    for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

    # `logger` defaults to None, so guard before using it; previously a call
    # with the default arguments raised AttributeError here.
    if logger is not None and frame_idx % 1000 == 0:
        # Store information
        logger.add_log('policy/loss', policy_loss)
        logger.add_log('v/loss', value_loss)
        logger.add_log('q/loss', q_value_loss)
        
        logger.add_log('policy/grad', p_grad)
        logger.add_log('v/grad', v_grad)
        logger.add_log('q/grad', q_grad)
        
        logger.add_histogram('policy/log_prob', log_prob)
        logger.add_histogram('V', expected_value)
        logger.add_histogram('Q', expected_q_value)

In [9]:
# Build the HalfCheetah environment through the `reward` library (imported as `tr`).
env = tr.envs.GymEnv("HalfCheetah-v2")
# NOTE(review): ActionBound presumably maps the policy's tanh-bounded actions onto the
# environment's action range — confirm against the reward library.
env = tr.envs.wrappers.ActionBound(env)
runner = tr.runners.SingleRunner(env)
batcher = tr.batchers.ReplayBatcher(
    runner=runner,
    batch_size=128,
    replay_buffer_maxlen=1e6,
    learning_freq=1,
    grad_steps_per_batch=1,
    # NOTE(review): looks like a fraction of replay_buffer_maxlen (500 of 1e6 transitions);
    # the cell output shows a 500-step buffer pre-fill — confirm against ReplayBatcher.
    init_replays=500 / 1e6,
)

# First entry of the action/state shapes; used as the network input/output sizes below.
action_dim = env.action_info.shape[0]
state_dim = env.state_info.shape[0]

Choosing the latest nvidia driver: /usr/lib/nvidia-390, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-390']
Choosing the latest nvidia driver: /usr/lib/nvidia-390, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-390']


100%|██████████| 500/500 [00:00<00:00, 4266.41it/s]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Populating Replay Buffer...





In [10]:
hidden_dim = 256

# Networks are created in a fixed order (V, target V, Q, policy) so that random
# initialisation consumes the RNG in the same sequence on every run.
value_net = ValueNetwork(state_dim, hidden_dim).to(device)
target_value_net = ValueNetwork(state_dim, hidden_dim).to(device)

soft_q_net = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

# Start the target network as an exact copy of the online value network.
target_value_net.load_state_dict(value_net.state_dict())

# Both critics are trained with a mean-squared-error objective.
value_criterion = nn.MSELoss()
soft_q_criterion = nn.MSELoss()

# One shared learning rate, exposed under three names so each can be tuned separately.
value_lr = soft_q_lr = policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer = optim.Adam(soft_q_net.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

In [11]:
# Logger from `reward.utils`; the cell output reports that logs are written to this path.
# NOTE(review): "grad0.5" in the run name presumably mirrors CLIP_GRAD — keep the two in sync.
logger = U.Logger('logs/halfcheetah/batcher-grad0.5-v0-1')

Writing logs to: logs/halfcheetah/batcher-grad0.5-v0-1


In [None]:
# Main training loop: the batcher collects experience with the current policy and
# yields one replay batch per iteration until MAX_STEPS is reached.
for batch in batcher.get_batches(MAX_STEPS, select_action_fn=policy_net.get_action):
    # Convert the sampled batch to tensors and merge its leading dimensions.
    batch = batch.to_tensor().concat_batch()
    # Optimization step. `frame_idx` must be forwarded: soft_q_update only records
    # metrics when `frame_idx % 1000 == 0`, and its default of 0 would otherwise
    # make it log scalars and histograms on every single gradient step.
    soft_q_update(batch, soft_tau=0.005, frame_idx=batcher.num_steps, logger=logger)
    
    # Write logs
    if (batcher.num_steps) % 1000 == 0:
        batcher.write_logs(logger)
        logger.log(step=batcher.num_steps)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  0%|          | 1.01k/4.00M [00:10<24:42:24, 45.0it/s]


--------------------------------------------------------------
policy/loss                                            |  1.47
v/loss                                                 |  1.40
q/loss                                                 |  0.45
policy/grad                                            |  3.04
v/grad                                                 | 13.58
q/grad                                                 |  3.43
Env/Reward/Episode (New)                               |   nan
Env/Length/Episode (New)                               |   nan
Env/Reward/Episode (Last 50)                           |   nan
Env/Length/Episode (Last 50)                           |   nan
--------------------------------------------------------------


  0%|          | 2.01k/4.00M [00:29<22:35:55, 49.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.47
v/loss                                               |    1.18
q/loss                                               |    1.19
policy/grad                                          |    2.62
v/grad                                               |   32.13
q/grad                                               |   18.48
Env/Reward/Episode (New)                             | -357.24
Env/Length/Episode (New)                             | 1500.00
Env/Reward/Episode (Last 50)                         | -357.24
Env/Length/Episode (Last 50)                         | 1500.00
--------------------------------------------------------------


  0%|          | 3.01k/4.00M [00:48<21:50:23, 50.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.35
v/loss                                               |    2.05
q/loss                                               |    1.98
policy/grad                                          |    3.08
v/grad                                               |   68.67
q/grad                                               |   41.43
Env/Reward/Episode (New)                             | -313.77
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -335.51
Env/Length/Episode (Last 50)                         | 1250.00
--------------------------------------------------------------


  0%|          | 4.00k/4.00M [01:07<19:14:25, 57.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.34
v/loss                                               |    3.21
q/loss                                               |    3.56
policy/grad                                          |    3.98
v/grad                                               |  115.66
q/grad                                               |   74.53
Env/Reward/Episode (New)                             | -487.78
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -386.26
Env/Length/Episode (Last 50)                         | 1166.67
--------------------------------------------------------------


  0%|          | 5.01k/4.00M [01:27<23:29:54, 47.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.31
v/loss                                               |    4.58
q/loss                                               |    5.44
policy/grad                                          |    5.14
v/grad                                               |  170.03
q/grad                                               |  109.21
Env/Reward/Episode (New)                             | -289.56
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -362.09
Env/Length/Episode (Last 50)                         | 1125.00
--------------------------------------------------------------


  0%|          | 6.01k/4.00M [01:46<20:53:23, 53.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.31
v/loss                                               |    5.71
q/loss                                               |    7.62
policy/grad                                          |    5.95
v/grad                                               |  226.79
q/grad                                               |  146.71
Env/Reward/Episode (New)                             | -333.23
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -356.31
Env/Length/Episode (Last 50)                         | 1100.00
--------------------------------------------------------------


  0%|          | 7.00k/4.00M [02:05<22:21:38, 49.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.29
v/loss                                               |    6.74
q/loss                                               |   10.26
policy/grad                                          |    6.41
v/grad                                               |  278.88
q/grad                                               |  187.38
Env/Reward/Episode (New)                             | -308.46
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -348.34
Env/Length/Episode (Last 50)                         | 1083.33
--------------------------------------------------------------


  0%|          | 8.01k/4.00M [02:24<21:52:00, 50.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    7.78
q/loss                                               |   12.26
policy/grad                                          |    7.11
v/grad                                               |  337.33
q/grad                                               |  228.48
Env/Reward/Episode (New)                             | -308.92
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -342.71
Env/Length/Episode (Last 50)                         | 1071.43
--------------------------------------------------------------


  0%|          | 9.01k/4.00M [02:43<23:08:56, 47.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.25
v/loss                                               |    9.30
q/loss                                               |   16.11
policy/grad                                          |    7.88
v/grad                                               |  419.37
q/grad                                               |  283.47
Env/Reward/Episode (New)                             | -224.74
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -327.96
Env/Length/Episode (Last 50)                         | 1062.50
--------------------------------------------------------------


  0%|          | 10.0k/4.00M [03:03<23:09:29, 47.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.30
v/loss                                               |    9.59
q/loss                                               |   18.47
policy/grad                                          |    7.90
v/grad                                               |  450.53
q/grad                                               |  312.66
Env/Reward/Episode (New)                             | -162.06
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -309.53
Env/Length/Episode (Last 50)                         | 1055.56
--------------------------------------------------------------


  0%|          | 11.0k/4.00M [03:22<22:46:59, 48.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |   10.07
q/loss                                               |   20.64
policy/grad                                          |    8.04
v/grad                                               |  505.80
q/grad                                               |  343.94
Env/Reward/Episode (New)                             | -281.84
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -306.76
Env/Length/Episode (Last 50)                         | 1050.00
--------------------------------------------------------------


  0%|          | 12.0k/4.00M [03:43<23:46:16, 46.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.29
v/loss                                               |   11.55
q/loss                                               |   26.65
policy/grad                                          |    8.93
v/grad                                               |  584.25
q/grad                                               |  402.49
Env/Reward/Episode (New)                             | -185.89
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -295.77
Env/Length/Episode (Last 50)                         | 1045.45
--------------------------------------------------------------


  0%|          | 13.0k/4.00M [04:03<21:29:53, 51.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |   11.54
q/loss                                               |   27.43
policy/grad                                          |    8.83
v/grad                                               |  625.31
q/grad                                               |  425.34
Env/Reward/Episode (New)                             | -201.24
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -287.89
Env/Length/Episode (Last 50)                         | 1041.67
--------------------------------------------------------------


  0%|          | 14.0k/4.00M [04:23<23:03:20, 48.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.23
v/loss                                               |   11.89
q/loss                                               |   31.05
policy/grad                                          |    9.09
v/grad                                               |  673.89
q/grad                                               |  465.31
Env/Reward/Episode (New)                             | -147.47
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -277.09
Env/Length/Episode (Last 50)                         | 1038.46
--------------------------------------------------------------


  0%|          | 15.0k/4.00M [04:42<19:44:56, 56.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |   11.97
q/loss                                               |   31.06
policy/grad                                          |    9.15
v/grad                                               |  704.11
q/grad                                               |  483.09
Env/Reward/Episode (New)                             | -196.49
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -271.33
Env/Length/Episode (Last 50)                         | 1035.71
--------------------------------------------------------------


  0%|          | 16.0k/4.00M [05:01<21:29:10, 51.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |   11.95
q/loss                                               |   36.40
policy/grad                                          |    8.66
v/grad                                               |  723.94
q/grad                                               |  514.98
Env/Reward/Episode (New)                             | -267.54
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -271.08
Env/Length/Episode (Last 50)                         | 1033.33
--------------------------------------------------------------


  0%|          | 17.0k/4.00M [05:20<20:41:13, 53.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.26
v/loss                                               |   12.12
q/loss                                               |   37.93
policy/grad                                          |    8.77
v/grad                                               |  780.61
q/grad                                               |  535.17
Env/Reward/Episode (New)                             | -236.64
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -268.93
Env/Length/Episode (Last 50)                         | 1031.25
--------------------------------------------------------------


  0%|          | 18.0k/4.00M [05:39<23:33:27, 47.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.23
v/loss                                               |   12.73
q/loss                                               |   41.19
policy/grad                                          |    8.83
v/grad                                               |  837.44
q/grad                                               |  579.60
Env/Reward/Episode (New)                             |  -93.05
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.58
Env/Length/Episode (Last 50)                         | 1029.41
--------------------------------------------------------------


  0%|          | 19.0k/4.00M [05:59<23:05:22, 47.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.31
v/loss                                               |   13.35
q/loss                                               |   45.91
policy/grad                                          |    8.71
v/grad                                               |  896.05
q/grad                                               |  624.15
Env/Reward/Episode (New)                             | -276.05
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.55
Env/Length/Episode (Last 50)                         | 1027.78
--------------------------------------------------------------


  1%|          | 20.0k/4.00M [06:18<23:12:00, 47.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |   12.44
q/loss                                               |   44.48
policy/grad                                          |    8.36
v/grad                                               |  887.60
q/grad                                               |  603.51
Env/Reward/Episode (New)                             | -230.66
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.03
Env/Length/Episode (Last 50)                         | 1026.32
--------------------------------------------------------------


  1%|          | 21.0k/4.00M [06:37<20:07:00, 54.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |   12.50
q/loss                                               |   42.17
policy/grad                                          |    8.38
v/grad                                               |  909.52
q/grad                                               |  620.16
Env/Reward/Episode (New)                             | -268.07
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.53
Env/Length/Episode (Last 50)                         | 1025.00
--------------------------------------------------------------


  1%|          | 22.0k/4.00M [06:56<19:47:48, 55.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |   12.26
q/loss                                               |   49.49
policy/grad                                          |    8.22
v/grad                                               |  952.95
q/grad                                               |  656.34
Env/Reward/Episode (New)                             | -238.89
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -257.60
Env/Length/Episode (Last 50)                         | 1023.81
--------------------------------------------------------------


  1%|          | 23.0k/4.00M [07:16<23:06:18, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |   12.29
q/loss                                               |   55.03
policy/grad                                          |    8.12
v/grad                                               |  975.88
q/grad                                               |  690.49
Env/Reward/Episode (New)                             | -265.45
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -257.96
Env/Length/Episode (Last 50)                         | 1022.73
--------------------------------------------------------------


  1%|          | 24.0k/4.00M [07:35<23:10:52, 47.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.25
v/loss                                               |   12.53
q/loss                                               |   58.77
policy/grad                                          |    7.94
v/grad                                               | 1028.42
q/grad                                               |  722.32
Env/Reward/Episode (New)                             | -286.12
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.18
Env/Length/Episode (Last 50)                         | 1021.74
--------------------------------------------------------------


  1%|          | 25.0k/4.00M [07:54<20:41:26, 53.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |   12.49
q/loss                                               |   61.56
policy/grad                                          |    7.65
v/grad                                               | 1066.95
q/grad                                               |  757.19
Env/Reward/Episode (New)                             | -258.89
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.17
Env/Length/Episode (Last 50)                         | 1020.83
--------------------------------------------------------------


  1%|          | 26.0k/4.00M [08:15<24:10:06, 45.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |   11.84
q/loss                                               |   54.58
policy/grad                                          |    7.25
v/grad                                               | 1036.86
q/grad                                               |  712.29
Env/Reward/Episode (New)                             | -225.34
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -257.82
Env/Length/Episode (Last 50)                         | 1020.00
--------------------------------------------------------------


  1%|          | 27.0k/4.00M [08:34<24:27:49, 45.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |   11.49
q/loss                                               |   57.01
policy/grad                                          |    6.86
v/grad                                               | 1056.94
q/grad                                               |  725.15
Env/Reward/Episode (New)                             | -235.14
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -256.94
Env/Length/Episode (Last 50)                         | 1019.23
--------------------------------------------------------------


  1%|          | 28.0k/4.00M [08:54<21:56:34, 50.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |   11.30
q/loss                                               |   67.47
policy/grad                                          |    6.64
v/grad                                               | 1069.33
q/grad                                               |  772.83
Env/Reward/Episode (New)                             | -290.61
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.19
Env/Length/Episode (Last 50)                         | 1018.52
--------------------------------------------------------------


  1%|          | 29.0k/4.00M [09:14<22:22:56, 49.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |   10.70
q/loss                                               |   58.28
policy/grad                                          |    6.23
v/grad                                               | 1037.15
q/grad                                               |  730.71
Env/Reward/Episode (New)                             | -302.24
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.76
Env/Length/Episode (Last 50)                         | 1017.86
--------------------------------------------------------------


  1%|          | 30.0k/4.00M [09:34<23:33:38, 46.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |   11.33
q/loss                                               |   73.20
policy/grad                                          |    6.51
v/grad                                               | 1117.53
q/grad                                               |  813.60
Env/Reward/Episode (New)                             | -285.73
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -260.66
Env/Length/Episode (Last 50)                         | 1017.24
--------------------------------------------------------------


  1%|          | 31.0k/4.00M [09:54<22:31:25, 48.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.22
v/loss                                               |   11.73
q/loss                                               |   82.88
policy/grad                                          |    6.44
v/grad                                               | 1160.81
q/grad                                               |  881.63
Env/Reward/Episode (New)                             | -241.75
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -260.03
Env/Length/Episode (Last 50)                         | 1016.67
--------------------------------------------------------------


  1%|          | 32.0k/4.00M [10:12<17:30:39, 62.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |   12.05
q/loss                                               |   85.20
policy/grad                                          |    6.45
v/grad                                               | 1211.78
q/grad                                               |  893.74
Env/Reward/Episode (New)                             | -252.97
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.80
Env/Length/Episode (Last 50)                         | 1016.13
--------------------------------------------------------------


  1%|          | 33.0k/4.00M [10:31<22:44:30, 48.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |   11.39
q/loss                                               |   78.91
policy/grad                                          |    6.19
v/grad                                               | 1224.95
q/grad                                               |  888.48
Env/Reward/Episode (New)                             | -242.21
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -259.25
Env/Length/Episode (Last 50)                         | 1015.62
--------------------------------------------------------------


  1%|          | 34.0k/4.00M [10:50<21:17:34, 51.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |   10.78
q/loss                                               |   75.37
policy/grad                                          |    5.85
v/grad                                               | 1174.79
q/grad                                               |  840.75
Env/Reward/Episode (New)                             | -246.12
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.85
Env/Length/Episode (Last 50)                         | 1015.15
--------------------------------------------------------------


  1%|          | 35.0k/4.00M [11:08<20:50:52, 52.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.31
v/loss                                               |   10.44
q/loss                                               |   71.44
policy/grad                                          |    5.70
v/grad                                               | 1133.48
q/grad                                               |  830.55
Env/Reward/Episode (New)                             | -239.42
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.28
Env/Length/Episode (Last 50)                         | 1014.71
--------------------------------------------------------------


  1%|          | 36.0k/4.00M [11:28<22:32:02, 48.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.23
v/loss                                               |    9.97
q/loss                                               |   81.01
policy/grad                                          |    5.67
v/grad                                               | 1162.93
q/grad                                               |  860.08
Env/Reward/Episode (New)                             | -271.65
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -258.66
Env/Length/Episode (Last 50)                         | 1014.29
--------------------------------------------------------------


  1%|          | 37.0k/4.00M [11:48<23:17:00, 47.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    9.95
q/loss                                               |   80.13
policy/grad                                          |    5.40
v/grad                                               | 1153.53
q/grad                                               |  893.38
Env/Reward/Episode (New)                             | -161.89
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -255.98
Env/Length/Episode (Last 50)                         | 1013.89
--------------------------------------------------------------


  1%|          | 38.0k/4.00M [12:07<21:50:39, 50.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.35
v/loss                                               |   10.51
q/loss                                               |   88.86
policy/grad                                          |    5.51
v/grad                                               | 1225.77
q/grad                                               |  921.37
Env/Reward/Episode (New)                             | -169.91
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -253.65
Env/Length/Episode (Last 50)                         | 1013.51
--------------------------------------------------------------


  1%|          | 39.0k/4.00M [12:27<22:31:56, 48.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    9.84
q/loss                                               |  102.57
policy/grad                                          |    5.16
v/grad                                               | 1181.95
q/grad                                               |  959.46
Env/Reward/Episode (New)                             | -244.72
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -253.41
Env/Length/Episode (Last 50)                         | 1013.16
--------------------------------------------------------------


  1%|          | 40.0k/4.00M [12:47<23:27:37, 46.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    9.14
q/loss                                               |   73.77
policy/grad                                          |    4.98
v/grad                                               | 1145.03
q/grad                                               |  823.72
Env/Reward/Episode (New)                             | -210.92
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -252.32
Env/Length/Episode (Last 50)                         | 1012.82
--------------------------------------------------------------


  1%|          | 41.0k/4.00M [13:07<23:00:55, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |   10.09
q/loss                                               |  100.63
policy/grad                                          |    5.04
v/grad                                               | 1237.14
q/grad                                               |  985.01
Env/Reward/Episode (New)                             | -237.65
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -251.96
Env/Length/Episode (Last 50)                         | 1012.50
--------------------------------------------------------------


  1%|          | 42.0k/4.00M [13:27<23:55:04, 46.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |    8.74
q/loss                                               |   91.18
policy/grad                                          |    4.77
v/grad                                               | 1123.79
q/grad                                               |  892.58
Env/Reward/Episode (New)                             | -257.01
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -252.08
Env/Length/Episode (Last 50)                         | 1012.20
--------------------------------------------------------------


  1%|          | 43.0k/4.00M [13:46<23:25:26, 46.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.22
v/loss                                               |   10.15
q/loss                                               |  107.01
policy/grad                                          |    4.93
v/grad                                               | 1259.33
q/grad                                               | 1027.23
Env/Reward/Episode (New)                             | -222.81
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -251.38
Env/Length/Episode (Last 50)                         | 1011.90
--------------------------------------------------------------


  1%|          | 44.0k/4.00M [14:06<23:12:34, 47.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    8.92
q/loss                                               |   91.56
policy/grad                                          |    4.45
v/grad                                               | 1174.62
q/grad                                               |  924.14
Env/Reward/Episode (New)                             | -101.15
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -247.89
Env/Length/Episode (Last 50)                         | 1011.63
--------------------------------------------------------------


  1%|          | 45.0k/4.00M [14:26<22:46:15, 48.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    9.18
q/loss                                               |   91.82
policy/grad                                          |    4.53
v/grad                                               | 1202.79
q/grad                                               |  920.34
Env/Reward/Episode (New)                             | -201.70
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -246.84
Env/Length/Episode (Last 50)                         | 1011.36
--------------------------------------------------------------


  1%|          | 46.0k/4.00M [14:45<22:21:22, 49.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |    8.53
q/loss                                               |   99.65
policy/grad                                          |    4.36
v/grad                                               | 1198.43
q/grad                                               |  965.61
Env/Reward/Episode (New)                             | -362.44
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -249.41
Env/Length/Episode (Last 50)                         | 1011.11
--------------------------------------------------------------


  1%|          | 47.0k/4.00M [15:05<23:12:29, 47.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |    9.08
q/loss                                               |  107.26
policy/grad                                          |    4.40
v/grad                                               | 1247.85
q/grad                                               |  997.11
Env/Reward/Episode (New)                             |  -80.44
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -245.74
Env/Length/Episode (Last 50)                         | 1010.87
--------------------------------------------------------------


  1%|          | 48.0k/4.00M [15:24<23:23:44, 46.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.21
v/loss                                               |    8.75
q/loss                                               |  101.13
policy/grad                                          |    4.21
v/grad                                               | 1192.74
q/grad                                               |  967.44
Env/Reward/Episode (New)                             | -307.10
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -247.04
Env/Length/Episode (Last 50)                         | 1010.64
--------------------------------------------------------------


  1%|          | 49.0k/4.00M [15:44<21:55:12, 50.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.29
v/loss                                               |    8.17
q/loss                                               |   91.38
policy/grad                                          |    4.14
v/grad                                               | 1212.71
q/grad                                               |  907.71
Env/Reward/Episode (New)                             | -285.25
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -247.84
Env/Length/Episode (Last 50)                         | 1010.42
--------------------------------------------------------------


  1%|▏         | 50.0k/4.00M [16:04<22:57:29, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |    8.58
q/loss                                               |  105.57
policy/grad                                          |    4.03
v/grad                                               | 1216.34
q/grad                                               | 1005.30
Env/Reward/Episode (New)                             | -107.99
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -244.98
Env/Length/Episode (Last 50)                         | 1010.20
--------------------------------------------------------------


  1%|▏         | 51.0k/4.00M [16:24<22:28:52, 48.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    8.75
q/loss                                               |  120.36
policy/grad                                          |    4.04
v/grad                                               | 1264.41
q/grad                                               | 1075.27
Env/Reward/Episode (New)                             | -208.35
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -244.25
Env/Length/Episode (Last 50)                         | 1010.00
--------------------------------------------------------------


  1%|▏         | 52.0k/4.00M [16:44<21:59:13, 49.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.28
v/loss                                               |    8.58
q/loss                                               |  112.07
policy/grad                                          |    4.05
v/grad                                               | 1275.00
q/grad                                               | 1007.94
Env/Reward/Episode (New)                             | -299.62
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -243.10
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 53.0k/4.00M [17:04<25:09:14, 43.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    7.55
q/loss                                               |   93.97
policy/grad                                          |    3.73
v/grad                                               | 1174.01
q/grad                                               |  904.61
Env/Reward/Episode (New)                             | -339.37
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -243.61
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 54.0k/4.00M [17:24<22:09:20, 49.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |    8.02
q/loss                                               |  106.15
policy/grad                                          |    3.72
v/grad                                               | 1198.29
q/grad                                               |  984.99
Env/Reward/Episode (New)                             | -262.43
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -239.10
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 55.0k/4.00M [17:45<24:00:44, 45.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |    7.49
q/loss                                               |  102.92
policy/grad                                          |    3.61
v/grad                                               | 1175.03
q/grad                                               |  943.79
Env/Reward/Episode (New)                             | -239.03
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -238.09
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 56.0k/4.00M [18:05<23:27:37, 46.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.28
v/loss                                               |    8.29
q/loss                                               |  111.66
policy/grad                                          |    3.69
v/grad                                               | 1230.59
q/grad                                               | 1012.56
Env/Reward/Episode (New)                             | -259.26
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -236.61
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 57.0k/4.00M [18:25<22:35:48, 48.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.29
v/loss                                               |    7.85
q/loss                                               |  116.80
policy/grad                                          |    3.84
v/grad                                               | 1226.66
q/grad                                               | 1042.55
Env/Reward/Episode (New)                             | -227.31
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -234.99
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 58.0k/4.00M [18:46<21:24:56, 51.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.25
v/loss                                               |    8.02
q/loss                                               |  119.97
policy/grad                                          |    3.63
v/grad                                               | 1270.50
q/grad                                               | 1061.20
Env/Reward/Episode (New)                             | -236.75
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.55
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  1%|▏         | 59.0k/4.00M [19:06<24:37:20, 44.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |    7.56
q/loss                                               |  105.10
policy/grad                                          |    3.59
v/grad                                               | 1228.75
q/grad                                               |  988.10
Env/Reward/Episode (New)                             | -205.55
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.16
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 60.0k/4.00M [19:26<24:02:52, 45.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    8.21
q/loss                                               |  101.54
policy/grad                                          |    3.86
v/grad                                               | 1286.73
q/grad                                               |  974.87
Env/Reward/Episode (New)                             | -366.72
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -237.26
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 61.0k/4.00M [19:47<24:11:36, 45.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |    7.26
q/loss                                               |  107.49
policy/grad                                          |    3.53
v/grad                                               | 1186.21
q/grad                                               |  983.79
Env/Reward/Episode (New)                             | -111.23
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.84
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 62.0k/4.00M [20:07<22:42:09, 48.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    7.37
q/loss                                               |  121.92
policy/grad                                          |    3.48
v/grad                                               | 1229.90
q/grad                                               | 1041.75
Env/Reward/Episode (New)                             |  -98.04
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -232.09
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 63.0k/4.00M [20:28<22:06:44, 49.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    6.35
q/loss                                               |  112.11
policy/grad                                          |    3.19
v/grad                                               | 1134.07
q/grad                                               |  971.50
Env/Reward/Episode (New)                             | -200.62
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -232.07
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 64.0k/4.00M [20:48<22:45:27, 48.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.21
v/loss                                               |    6.14
q/loss                                               |  107.39
policy/grad                                          |    3.15
v/grad                                               | 1092.29
q/grad                                               |  940.94
Env/Reward/Episode (New)                             | -212.29
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.37
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 65.0k/4.00M [21:08<23:47:11, 46.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |    6.67
q/loss                                               |  109.87
policy/grad                                          |    3.10
v/grad                                               | 1151.54
q/grad                                               |  956.76
Env/Reward/Episode (New)                             | -228.86
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -234.02
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 66.0k/4.00M [21:27<22:34:17, 48.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    6.42
q/loss                                               |  113.79
policy/grad                                          |    3.05
v/grad                                               | 1153.48
q/grad                                               | 1015.33
Env/Reward/Episode (New)                             | -215.76
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -232.98
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 67.0k/4.00M [21:47<22:32:03, 48.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.28
v/loss                                               |    6.34
q/loss                                               |  106.82
policy/grad                                          |    3.08
v/grad                                               | 1147.48
q/grad                                               |  960.50
Env/Reward/Episode (New)                             | -240.09
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.05
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 68.0k/4.00M [22:07<23:47:10, 45.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.21
v/loss                                               |    6.38
q/loss                                               |  121.30
policy/grad                                          |    2.93
v/grad                                               | 1122.88
q/grad                                               | 1012.08
Env/Reward/Episode (New)                             | -167.80
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -234.55
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 69.0k/4.00M [22:27<22:56:42, 47.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.30
v/loss                                               |    7.68
q/loss                                               |  134.76
policy/grad                                          |    3.20
v/grad                                               | 1232.63
q/grad                                               | 1102.78
Env/Reward/Episode (New)                             | -208.31
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.19
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 70.0k/4.00M [22:47<21:17:04, 51.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    5.92
q/loss                                               |  107.33
policy/grad                                          |    2.82
v/grad                                               | 1091.24
q/grad                                               |  949.35
Env/Reward/Episode (New)                             | -252.62
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -233.63
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 71.0k/4.00M [23:07<23:17:32, 46.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    6.04
q/loss                                               |  110.99
policy/grad                                          |    2.84
v/grad                                               | 1101.02
q/grad                                               |  922.70
Env/Reward/Episode (New)                             | -153.60
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -231.34
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 72.0k/4.00M [23:28<22:36:07, 48.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.26
v/loss                                               |    6.49
q/loss                                               |  126.89
policy/grad                                          |    2.88
v/grad                                               | 1149.76
q/grad                                               | 1044.51
Env/Reward/Episode (New)                             | -285.45
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -232.27
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 73.0k/4.00M [23:47<22:59:03, 47.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    6.29
q/loss                                               |  121.05
policy/grad                                          |    2.98
v/grad                                               | 1174.76
q/grad                                               | 1065.30
Env/Reward/Episode (New)                             | -188.53
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -230.73
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 74.0k/4.00M [24:07<22:48:44, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.21
v/loss                                               |    7.15
q/loss                                               |  134.26
policy/grad                                          |    3.04
v/grad                                               | 1181.95
q/grad                                               | 1085.63
Env/Reward/Episode (New)                             | -192.37
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -228.86
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 75.0k/4.00M [24:27<22:59:24, 47.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |    5.83
q/loss                                               |  117.06
policy/grad                                          |    2.89
v/grad                                               | 1135.38
q/grad                                               | 1004.51
Env/Reward/Episode (New)                             | -209.82
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.88
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 76.0k/4.00M [24:47<22:23:19, 48.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    5.75
q/loss                                               |  113.49
policy/grad                                          |    2.94
v/grad                                               | 1104.90
q/grad                                               |  981.90
Env/Reward/Episode (New)                             | -190.53
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.18
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 77.0k/4.00M [25:06<23:29:59, 46.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |    6.90
q/loss                                               |  148.68
policy/grad                                          |    3.16
v/grad                                               | 1160.85
q/grad                                               | 1142.97
Env/Reward/Episode (New)                             | -271.61
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.91
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 78.0k/4.00M [25:25<22:23:47, 48.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.26
v/loss                                               |    6.11
q/loss                                               |  130.36
policy/grad                                          |    2.99
v/grad                                               | 1117.46
q/grad                                               | 1046.20
Env/Reward/Episode (New)                             | -235.70
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -226.81
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 79.0k/4.00M [25:44<22:47:22, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.20
v/loss                                               |    5.19
q/loss                                               |  124.21
policy/grad                                          |    2.85
v/grad                                               | 1046.87
q/grad                                               |  964.90
Env/Reward/Episode (New)                             | -288.67
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -226.54
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 80.0k/4.00M [26:05<22:32:17, 48.3it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    5.76
q/loss                                               |  118.74
policy/grad                                          |    2.89
v/grad                                               | 1093.38
q/grad                                               |  988.83
Env/Reward/Episode (New)                             | -215.41
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -225.14
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 81.0k/4.00M [26:25<24:05:00, 45.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    6.37
q/loss                                               |  130.70
policy/grad                                          |    3.05
v/grad                                               | 1176.33
q/grad                                               | 1082.83
Env/Reward/Episode (New)                             | -241.45
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -225.13
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 82.0k/4.00M [26:46<23:36:04, 46.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    5.52
q/loss                                               |  137.18
policy/grad                                          |    2.92
v/grad                                               | 1092.70
q/grad                                               | 1047.05
Env/Reward/Episode (New)                             | -256.74
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -225.21
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 83.0k/4.00M [27:06<22:16:48, 48.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.13
v/loss                                               |    5.74
q/loss                                               |  137.28
policy/grad                                          |    2.99
v/grad                                               | 1124.17
q/grad                                               | 1079.73
Env/Reward/Episode (New)                             | -269.20
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -225.74
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 84.0k/4.00M [27:25<24:09:23, 45.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.03
v/loss                                               |    5.19
q/loss                                               |  108.22
policy/grad                                          |    2.83
v/grad                                               | 1073.16
q/grad                                               |  928.31
Env/Reward/Episode (New)                             | -177.71
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -224.38
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 85.0k/4.00M [27:45<19:51:33, 54.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |    6.27
q/loss                                               |  118.66
policy/grad                                          |    2.98
v/grad                                               | 1146.23
q/grad                                               |  993.51
Env/Reward/Episode (New)                             | -260.16
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -224.79
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 86.0k/4.00M [28:05<23:50:53, 45.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    5.18
q/loss                                               |  128.40
policy/grad                                          |    2.80
v/grad                                               | 1057.12
q/grad                                               |  980.71
Env/Reward/Episode (New)                             | -311.64
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -225.59
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 87.0k/4.00M [28:24<22:50:23, 47.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |    5.77
q/loss                                               |  126.86
policy/grad                                          |    2.86
v/grad                                               | 1108.22
q/grad                                               | 1017.47
Env/Reward/Episode (New)                             | -193.47
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -226.22
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 88.0k/4.00M [28:44<20:05:49, 54.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    5.27
q/loss                                               |  131.33
policy/grad                                          |    2.74
v/grad                                               | 1079.93
q/grad                                               | 1027.08
Env/Reward/Episode (New)                             | -251.76
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.86
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 89.0k/4.00M [29:03<22:42:18, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    5.02
q/loss                                               |  109.48
policy/grad                                          |    2.71
v/grad                                               | 1015.51
q/grad                                               |  911.00
Env/Reward/Episode (New)                             | -274.52
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -228.46
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 90.0k/4.00M [29:22<22:34:45, 48.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.26
v/loss                                               |    4.77
q/loss                                               |  126.62
policy/grad                                          |    2.61
v/grad                                               | 1023.66
q/grad                                               |  981.86
Env/Reward/Episode (New)                             | -291.49
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -230.07
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 91.0k/4.00M [29:42<21:40:23, 50.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.13
v/loss                                               |    5.66
q/loss                                               |  140.58
policy/grad                                          |    2.82
v/grad                                               | 1102.75
q/grad                                               | 1078.47
Env/Reward/Episode (New)                             | -125.12
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.82
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 92.0k/4.00M [30:01<22:41:44, 47.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |    6.41
q/loss                                               |  129.34
policy/grad                                          |    2.97
v/grad                                               | 1154.52
q/grad                                               | 1048.05
Env/Reward/Episode (New)                             | -176.19
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -226.20
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 93.0k/4.00M [30:21<22:32:09, 48.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.09
v/loss                                               |    5.40
q/loss                                               |  148.98
policy/grad                                          |    2.84
v/grad                                               | 1103.90
q/grad                                               | 1072.53
Env/Reward/Episode (New)                             | -134.06
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -224.43
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 94.0k/4.00M [30:41<23:07:33, 46.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.03
v/loss                                               |    5.07
q/loss                                               |  135.61
policy/grad                                          |    2.77
v/grad                                               | 1058.41
q/grad                                               | 1040.53
Env/Reward/Episode (New)                             | -238.76
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -227.18
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 95.0k/4.00M [31:00<24:01:37, 45.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    5.21
q/loss                                               |  121.44
policy/grad                                          |    2.74
v/grad                                               | 1050.07
q/grad                                               |  937.61
Env/Reward/Episode (New)                             | -148.52
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -226.11
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 96.0k/4.00M [31:19<20:32:55, 52.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    5.65
q/loss                                               |  125.40
policy/grad                                          |    2.82
v/grad                                               | 1095.53
q/grad                                               |  990.82
Env/Reward/Episode (New)                             | -133.30
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -221.53
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 97.0k/4.00M [31:38<19:11:59, 56.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.22
v/loss                                               |    5.14
q/loss                                               |  124.10
policy/grad                                          |    2.72
v/grad                                               | 1090.84
q/grad                                               | 1011.71
Env/Reward/Episode (New)                             | -121.01
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -222.34
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 98.0k/4.00M [31:57<21:46:37, 49.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.27
v/loss                                               |    4.93
q/loss                                               |  126.04
policy/grad                                          |    2.75
v/grad                                               | 1022.70
q/grad                                               | 1003.55
Env/Reward/Episode (New)                             | -174.13
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -219.68
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  2%|▏         | 99.0k/4.00M [32:17<20:49:28, 52.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |    4.97
q/loss                                               |  118.59
policy/grad                                          |    2.78
v/grad                                               | 1062.46
q/grad                                               |  993.34
Env/Reward/Episode (New)                             | -234.54
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -218.67
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 100k/4.00M [32:36<19:47:04, 54.8it/s] 


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    4.97
q/loss                                               |  144.16
policy/grad                                          |    2.77
v/grad                                               | 1053.00
q/grad                                               | 1056.28
Env/Reward/Episode (New)                             | -180.22
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -220.11
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 101k/4.00M [32:55<20:15:38, 53.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.24
v/loss                                               |    5.26
q/loss                                               |  134.09
policy/grad                                          |    2.86
v/grad                                               | 1079.89
q/grad                                               | 1005.82
Env/Reward/Episode (New)                             |  -87.15
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -217.69
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 102k/4.00M [33:15<22:57:52, 47.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.22
v/loss                                               |    5.67
q/loss                                               |  135.86
policy/grad                                          |    2.94
v/grad                                               | 1114.98
q/grad                                               | 1026.62
Env/Reward/Episode (New)                             | -189.95
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -215.50
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 103k/4.00M [33:34<20:10:42, 53.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    4.74
q/loss                                               |  127.95
policy/grad                                          |    2.77
v/grad                                               | 1029.35
q/grad                                               |  998.58
Env/Reward/Episode (New)                             | -140.65
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -211.52
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 104k/4.00M [33:54<21:23:20, 50.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    4.76
q/loss                                               |  123.63
policy/grad                                          |    2.74
v/grad                                               | 1041.99
q/grad                                               |  984.83
Env/Reward/Episode (New)                             | -141.87
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -209.11
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 105k/4.00M [34:13<20:39:13, 52.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    4.79
q/loss                                               |  128.06
policy/grad                                          |    2.72
v/grad                                               |  998.48
q/grad                                               |  972.89
Env/Reward/Episode (New)                             | -226.81
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -208.87
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 106k/4.00M [34:33<23:41:37, 45.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |    6.14
q/loss                                               |  159.33
policy/grad                                          |    3.05
v/grad                                               | 1156.18
q/grad                                               | 1168.17
Env/Reward/Episode (New)                             | -121.47
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -206.11
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 107k/4.00M [34:52<23:26:22, 46.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    4.87
q/loss                                               |  140.43
policy/grad                                          |    2.79
v/grad                                               | 1034.16
q/grad                                               | 1035.85
Env/Reward/Episode (New)                             | -141.01
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -204.38
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 108k/4.00M [35:12<23:10:29, 46.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    5.22
q/loss                                               |  136.06
policy/grad                                          |    2.92
v/grad                                               | 1065.23
q/grad                                               | 1033.43
Env/Reward/Episode (New)                             | -165.88
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -202.97
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 109k/4.00M [35:32<23:28:28, 46.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    5.30
q/loss                                               |  161.04
policy/grad                                          |    2.96
v/grad                                               | 1106.55
q/grad                                               | 1154.69
Env/Reward/Episode (New)                             | -143.58
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -201.73
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 110k/4.00M [35:53<25:44:55, 42.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.20
v/loss                                               |    4.71
q/loss                                               |  125.25
policy/grad                                          |    2.82
v/grad                                               | 1037.99
q/grad                                               |  981.47
Env/Reward/Episode (New)                             | -176.05
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -197.91
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 111k/4.00M [36:14<23:12:06, 46.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.07
v/loss                                               |    4.59
q/loss                                               |  131.12
policy/grad                                          |    2.71
v/grad                                               | 1030.52
q/grad                                               | 1028.05
Env/Reward/Episode (New)                             | -201.85
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -199.73
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 112k/4.00M [36:34<23:23:09, 46.2it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    5.12
q/loss                                               |  137.88
policy/grad                                          |    2.82
v/grad                                               | 1047.47
q/grad                                               | 1038.13
Env/Reward/Episode (New)                             | -210.14
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -201.97
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 113k/4.00M [36:54<22:27:18, 48.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.16
v/loss                                               |    4.98
q/loss                                               |  124.16
policy/grad                                          |    2.80
v/grad                                               | 1021.03
q/grad                                               |  948.51
Env/Reward/Episode (New)                             | -196.94
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -201.90
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 114k/4.00M [37:15<22:43:39, 47.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |    4.54
q/loss                                               |  131.19
policy/grad                                          |    2.77
v/grad                                               | 1024.90
q/grad                                               | 1003.56
Env/Reward/Episode (New)                             | -111.33
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -199.88
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 115k/4.00M [37:34<22:13:32, 48.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.25
v/loss                                               |    4.84
q/loss                                               |  137.29
policy/grad                                          |    2.96
v/grad                                               | 1058.80
q/grad                                               | 1037.11
Env/Reward/Episode (New)                             | -185.68
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -199.01
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 116k/4.00M [37:55<22:27:07, 48.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    5.43
q/loss                                               |  138.38
policy/grad                                          |    3.09
v/grad                                               | 1140.60
q/grad                                               | 1075.51
Env/Reward/Episode (New)                             |  -27.56
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -195.25
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 117k/4.00M [38:15<24:31:19, 44.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |    4.48
q/loss                                               |  123.37
policy/grad                                          |    2.86
v/grad                                               | 1010.06
q/grad                                               |  966.42
Env/Reward/Episode (New)                             |  -81.70
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -192.08
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 118k/4.00M [38:35<22:56:11, 47.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.19
v/loss                                               |    4.66
q/loss                                               |  126.20
policy/grad                                          |    2.96
v/grad                                               | 1040.09
q/grad                                               | 1019.26
Env/Reward/Episode (New)                             | -233.17
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -193.39
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 119k/4.00M [38:56<23:44:00, 45.4it/s]


--------------------------------------------------------------
policy/loss                                          |    0.15
v/loss                                               |    5.00
q/loss                                               |  131.82
policy/grad                                          |    3.10
v/grad                                               | 1062.61
q/grad                                               | 1040.83
Env/Reward/Episode (New)                             | -133.79
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -191.90
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 120k/4.00M [39:16<21:35:42, 49.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.07
v/loss                                               |    5.29
q/loss                                               |  148.59
policy/grad                                          |    3.21
v/grad                                               | 1132.93
q/grad                                               | 1078.60
Env/Reward/Episode (New)                             | -217.11
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -191.19
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 121k/4.00M [39:36<21:04:25, 51.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.20
v/loss                                               |    4.69
q/loss                                               |  134.82
policy/grad                                          |    3.04
v/grad                                               | 1063.20
q/grad                                               | 1045.94
Env/Reward/Episode (New)                             | -171.39
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -191.54
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 122k/4.00M [39:56<23:28:34, 45.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.10
v/loss                                               |    4.90
q/loss                                               |  134.15
policy/grad                                          |    3.06
v/grad                                               | 1056.71
q/grad                                               | 1000.61
Env/Reward/Episode (New)                             | -208.76
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -190.01
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 123k/4.00M [40:17<23:38:23, 45.6it/s]


--------------------------------------------------------------
policy/loss                                          |    0.12
v/loss                                               |    5.43
q/loss                                               |  137.81
policy/grad                                          |    3.21
v/grad                                               | 1143.13
q/grad                                               | 1066.00
Env/Reward/Episode (New)                             |  -51.59
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -187.27
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 124k/4.00M [40:37<22:22:57, 48.1it/s]


--------------------------------------------------------------
policy/loss                                          |    0.14
v/loss                                               |    4.62
q/loss                                               |  136.36
policy/grad                                          |    3.06
v/grad                                               | 1056.24
q/grad                                               | 1019.70
Env/Reward/Episode (New)                             | -108.74
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -185.60
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 125k/4.00M [40:57<22:11:01, 48.5it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    4.30
q/loss                                               |  115.23
policy/grad                                          |    2.93
v/grad                                               |  993.79
q/grad                                               |  904.59
Env/Reward/Episode (New)                             | -177.07
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -184.94
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 126k/4.00M [41:16<21:33:35, 49.9it/s]


--------------------------------------------------------------
policy/loss                                          |    0.17
v/loss                                               |    4.95
q/loss                                               |  122.92
policy/grad                                          |    3.15
v/grad                                               | 1080.89
q/grad                                               |  980.08
Env/Reward/Episode (New)                             |  -66.35
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -182.46
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 127k/4.00M [41:37<23:23:44, 46.0it/s]


--------------------------------------------------------------
policy/loss                                          |    0.28
v/loss                                               |    4.60
q/loss                                               |  115.20
policy/grad                                          |    3.11
v/grad                                               | 1008.49
q/grad                                               |  910.23
Env/Reward/Episode (New)                             | -182.26
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -180.67
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 128k/4.00M [41:58<24:02:52, 44.7it/s]


--------------------------------------------------------------
policy/loss                                          |    0.18
v/loss                                               |    4.72
q/loss                                               |  134.30
policy/grad                                          |    3.04
v/grad                                               | 1073.80
q/grad                                               | 1033.11
Env/Reward/Episode (New)                             |  -92.23
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -177.80
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 129k/4.00M [42:19<24:00:47, 44.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.11
v/loss                                               |    4.71
q/loss                                               |  136.80
policy/grad                                          |    3.16
v/grad                                               | 1025.14
q/grad                                               | 1029.48
Env/Reward/Episode (New)                             | -197.11
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -175.97
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 130k/4.00M [42:40<22:00:50, 48.8it/s]


--------------------------------------------------------------
policy/loss                                          |    0.21
v/loss                                               |    4.13
q/loss                                               |  139.09
policy/grad                                          |    2.93
v/grad                                               |  990.05
q/grad                                               | 1007.75
Env/Reward/Episode (New)                             | -159.45
Env/Length/Episode (New)                             | 1000.00
Env/Reward/Episode (Last 50)                         | -174.85
Env/Length/Episode (Last 50)                         | 1000.00
--------------------------------------------------------------


  3%|▎         | 131k/4.00M [42:51<25:20:46, 42.4it/s]