In [1]:
import pdb
import gym
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import reward as rw
import reward.utils as U



In [2]:
# ----- Experiment configuration (module-level constants read by the cells below) -----
# Gym environment id handed to rw.envs.GymEnv.
ENV = 'Humanoid-v2'
# Directory handed to U.Logger.
LOG_DIR = 'logs/humanoid/paper-v0-1'
# True: draw policy actions with rsample_with_pre() and use the pathwise policy loss;
# False: draw them with sample_with_pre() and use the log-prob weighted loss.
REPAR = True
# Constant passed to the RewardConstScaler batch transform.
REWARD_SCALE = 20.
# Max-norm for gradient clipping; only referenced by the commented-out
# clip_grad_norm_ calls in the training loop, so clipping is currently off.
CLIP_GRAD = float('inf')
# Discount factor passed to U.estimators.td_target.
GAMMA = 0.99
# `weight` passed to U.copy_weights when refreshing the target value network each step
# (presumably a Polyak/soft-update coefficient -- confirm against U.copy_weights).
TARGET_UP_WEIGHT = 0.005
# Intended replay mini-batch size (the value the batcher's batch_size should use).
BATCH_SIZE = 256
# Step budget passed to batcher.get_batches.
MAX_STEPS = 40e6

In [3]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [4]:
# Build the environment -> runner -> replay batcher pipeline.
env = rw.envs.GymEnv(ENV)
# NOTE(review): ActionBound presumably maps the policy's bounded actions onto the
# environment's action range -- confirm against the wrapper's implementation.
env = rw.envs.wrappers.ActionBound(env)
runner = rw.runners.SingleRunner(env)
batcher = rw.batchers.ReplayBatcher(
    runner=runner,
    # Use the config constant instead of a duplicated literal so that editing
    # BATCH_SIZE actually changes the sampled batch size.
    batch_size=BATCH_SIZE,
    replay_buffer_maxlen=1e6,
    learning_freq=1,
    grad_steps_per_batch=1,
    transforms=[
        # State normalisation is intentionally left disabled for this run:
        # rw.batchers.transforms.StateRunNorm(),
        rw.batchers.transforms.RewardConstScaler(REWARD_SCALE),
    ],
)

# Input/output sizes for the networks, taken from the first dimension of the
# state and action descriptions reported by the batcher.
state_features = batcher.get_state_info().shape[0]
num_actions = batcher.get_action_info().shape[0]

Choosing the latest nvidia driver: /usr/lib/nvidia-390, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-390']
Choosing the latest nvidia driver: /usr/lib/nvidia-390, among ['/usr/lib/nvidia-375', '/usr/lib/nvidia-390']
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
class PolicyNN(nn.Module):
    """Two-hidden-layer MLP with separate linear heads for mean and log-std.

    Args:
        num_inputs: Size of the input feature vector.
        num_outputs: Size of each head's output.
        hidden_units: Width of both hidden layers.
        activation: Module class instantiated after each hidden linear layer.
        log_std_range: ``(low, high)`` bounds the log-std output is clamped to.
    """

    def __init__(self, num_inputs, num_outputs, hidden_units=256,
                 activation=nn.ReLU, log_std_range=(-20, 2)):
        super().__init__()
        self.log_std_range = log_std_range

        # Shared trunk feeding both heads.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, hidden_units),
            activation(),
            nn.Linear(hidden_units, hidden_units),
            activation(),
        )

        # Both heads start with weights and biases drawn from U(-3e-3, 3e-3).
        self.mean = self._small_init(nn.Linear(hidden_units, num_outputs))
        self.log_std = self._small_init(nn.Linear(hidden_units, num_outputs))

    @staticmethod
    def _small_init(linear):
        """Re-draw a linear layer's weight and bias from U(-3e-3, 3e-3); return it."""
        nn.init.uniform_(linear.weight, -3e-3, 3e-3)
        nn.init.uniform_(linear.bias, -3e-3, 3e-3)
        return linear

    def forward(self, x):
        """Return ``(mean, log_std)`` for ``x``, with ``log_std`` clamped to the configured range."""
        features = self.layers(x)
        low, high = self.log_std_range
        raw_log_std = self.log_std(features)
        return self.mean(features), torch.clamp(raw_log_std, low, high)

In [6]:
class ValueNN(nn.Module):
    """Two-hidden-layer MLP that maps an input vector to a single scalar output.

    Args:
        num_inputs: Size of the input feature vector.
        hidden_units: Width of both hidden layers.
        activation: Module class instantiated after each hidden linear layer.
    """

    def __init__(self, num_inputs, hidden_units=256, activation=nn.ReLU):
        super().__init__()

        hidden = [
            nn.Linear(num_inputs, hidden_units), activation(),
            nn.Linear(hidden_units, hidden_units), activation(),
        ]

        # Output layer starts with weight and bias drawn from U(-3e-3, 3e-3).
        head = nn.Linear(hidden_units, 1)
        for param in (head.weight, head.bias):
            nn.init.uniform_(param, -3e-3, 3e-3)

        self.layers = nn.Sequential(*hidden, head)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension of size 1)."""
        return self.layers(x)

In [7]:
class QValueNN(nn.Module):
    """Two-hidden-layer MLP that scores a (state, action) pair with a single scalar.

    Args:
        num_inputs: Size of the state feature vector.
        num_actions: Size of the action vector.
        hidden_units: Width of both hidden layers.
        activation: Module class instantiated after each hidden linear layer.
    """

    def __init__(self, num_inputs, num_actions, hidden_units=256, activation=nn.ReLU):
        super().__init__()

        # The first layer consumes the state and action concatenated together.
        joint_features = num_inputs + num_actions
        hidden = [
            nn.Linear(joint_features, hidden_units), activation(),
            nn.Linear(hidden_units, hidden_units), activation(),
        ]

        # Output layer starts with weight and bias drawn from U(-3e-3, 3e-3).
        head = nn.Linear(hidden_units, 1)
        nn.init.uniform_(head.weight, -3e-3, 3e-3)
        nn.init.uniform_(head.bias, -3e-3, 3e-3)

        self.layers = nn.Sequential(*hidden, head)

    def forward(self, x):
        """Take ``x = (state, action)``, join them along dim 1 and return the network output."""
        state, action = x
        return self.layers(torch.cat((state, action), dim=1))

In [8]:
class TanhNormalPolicy(rw.policy.BasePolicy):
    """Policy whose action distribution is a ``TanhNormal`` parameterised by ``self.nn``.

    ``self.nn`` is expected to return a ``(mean, log_std)`` pair for a batch of states.
    """

    def create_dist(self, state):
        """Build the ``TanhNormal`` distribution for ``state`` (scale is ``exp(log_std)``)."""
        mean, log_std = self.nn(state)
        std = log_std.exp()
        return rw.distributions.TanhNormal(loc=mean, scale=std)

    def get_action(self, state, step):
        """Sample an action for ``state`` and return it as a NumPy array.

        ``step`` is accepted but not used here. Raises ``AssertionError`` if the
        sampled action contains NaN.
        """
        sampled = self.create_dist(state=state).sample()
        action = U.to_np(sampled)
        assert not np.isnan(action).any()
        return action

In [9]:
# Instantiate every network on `device`: the policy, the state-value net plus a
# separate target copy (put in eval mode), and two independent Q critics.
# The construction order determines the random initial weights, so keep it stable.
p_nn = PolicyNN(num_inputs=state_features, num_outputs=num_actions).to(device)
v_nn = ValueNN(num_inputs=state_features).to(device)
v_nn_target = ValueNN(num_inputs=state_features).to(device).eval()
q1_nn = QValueNN(num_inputs=state_features, num_actions=num_actions).to(device)
q2_nn = QValueNN(num_inputs=state_features, num_actions=num_actions).to(device)

In [10]:
# Initialise the target value network from v_nn before training starts
# (weight=1. presumably means a full, hard copy -- confirm against U.copy_weights).
U.copy_weights(from_nn=v_nn, to_nn=v_nn_target, weight=1.)

In [11]:
# Wrap the policy network; `policy` is used both for acting (get_action) and for
# building action distributions inside the training loop (create_dist).
policy = TanhNormalPolicy(nn=p_nn)

In [12]:
# One Adam optimizer per trainable network, all with the same learning rate.
# The target value network has no optimizer; it is only updated via U.copy_weights.
p_opt, v_opt, q1_opt, q2_opt = [
    torch.optim.Adam(net.parameters(), lr=3e-4)
    for net in (p_nn, v_nn, q1_nn, q2_nn)
]

In [13]:
# Logger that receives the scalars and histograms emitted by the training loop;
# it is pointed at LOG_DIR.
logger = U.Logger(LOG_DIR)

Writing logs to: logs/humanoid/paper-v0-1


In [14]:
# Pre-fill the replay buffer before any gradient step, acting with the still
# untrained policy (n=1000, presumably environment steps -- confirm against ReplayBatcher).
batcher.populate(n=1000, get_action_fn=policy.get_action)

Populating Replay Buffer...


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [None]:
# Soft Actor-Critic style training loop. For every replay batch: compute the losses
# of the two Q critics, the state-value net and the policy, take one optimizer step
# on each, then move the target value network towards v_nn.
for batch in batcher.get_batches(MAX_STEPS, policy.get_action):
    batch = batch.to_tensor().concat_batch()

    ##### Calculate losses ######
    # Predictions for the transitions stored in the replay buffer.
    q1_batch = q1_nn((batch.state_t, batch.action))
    q2_batch = q2_nn((batch.state_t, batch.action))
    v_batch = v_nn(batch.state_t)

    # Fresh actions from the current policy; rsample keeps them differentiable
    # with respect to the policy parameters, sample does not.
    dist = policy.create_dist(batch.state_t)
    if REPAR:
        action, pre_tanh_action = dist.rsample_with_pre()
    else:
        action, pre_tanh_action = dist.sample_with_pre()
    log_prob = dist.log_prob_pre(pre_tanh_action).sum(-1, keepdim=True)

    # Q loss: regress both critics on the TD target built from the target value net.
    v_target_tp1 = v_nn_target(batch.state_tp1)
    q_value_tp1 = U.estimators.td_target(rewards=batch.reward, dones=batch.done,
                                         v_tp1=v_target_tp1, gamma=GAMMA)
    q1_loss = F.mse_loss(q1_batch, q_value_tp1.detach())
    q2_loss = F.mse_loss(q2_batch, q_value_tp1.detach())

    # V loss: regress v_nn on min(Q1, Q2) - log_prob evaluated at the fresh actions.
    q1_new_t = q1_nn((batch.state_t, action))
    q2_new_t = q2_nn((batch.state_t, action))
    q_new_t = torch.min(q1_new_t, q2_new_t)
    next_value = q_new_t - log_prob
    v_loss = F.mse_loss(v_batch, next_value.detach())

    # Policy loss
    if REPAR:
        # Pathwise gradient: flows through `action` into the critics and the policy.
        p_loss = (log_prob - q_new_t).mean()
    else:
        # Log-prob weighted form: the weighting term is detached, so gradients
        # only flow through log_prob.
        next_log_prob = q_new_t - v_batch
        p_loss = (log_prob * (log_prob - next_log_prob).detach()).mean()
    # Policy regularization losses (dist.scale is the std, so .log() recovers log_std).
    mean_loss = 1e-3 * dist.loc.pow(2).mean()
    log_std_loss = 1e-3 * dist.scale.log().pow(2).mean()
    # The pre-tanh penalty is currently switched off by its zero coefficient.
    pre_tanh_loss = 0 * pre_tanh_action.pow(2).sum(1).mean()
    # Combine all losses
    p_loss += mean_loss + log_std_loss + pre_tanh_loss

    ###### Optimize ######
    # Every backward() runs before any optimizer.step(). step() updates parameters
    # in place, and with REPAR set p_loss back-propagates through q1_nn/q2_nn, so
    # stepping the critics first would invalidate the graph p_loss still needs
    # (recent PyTorch versions raise a RuntimeError in that case).
    # The policy goes first because its backward pass also leaves gradients on the
    # critics' parameters; the critics' zero_grad() calls below discard them.
    # Gradient clipping is disabled; to enable it, call
    # torch.nn.utils.clip_grad_norm_(<net>.parameters(), CLIP_GRAD) after a backward().
    p_opt.zero_grad()
    p_loss.backward()
    p_grad = U.mean_grad(p_nn)

    q1_opt.zero_grad()
    q1_loss.backward()
    q1_grad = U.mean_grad(q1_nn)

    q2_opt.zero_grad()
    q2_loss.backward()
    q2_grad = U.mean_grad(q2_nn)

    v_opt.zero_grad()
    v_loss.backward()
    v_grad = U.mean_grad(v_nn)

    for optimizer in (q1_opt, q2_opt, v_opt, p_opt):
        optimizer.step()

    ###### Update target value network ######
    U.copy_weights(from_nn=v_nn, to_nn=v_nn_target, weight=TARGET_UP_WEIGHT)

    ###### Write logs ######
    if batcher.num_steps % 4000 == 0 and batcher.runner.rewards:
        batcher.write_logs(logger)

        logger.add_log('policy/loss', p_loss)
        logger.add_log('v/loss', v_loss)
        logger.add_log('q1/loss', q1_loss)
        logger.add_log('q2/loss', q2_loss)

        logger.add_log('policy/grad', p_grad)
        logger.add_log('v/grad', v_grad)
        logger.add_log('q1/grad', q1_grad)
        logger.add_log('q2/grad', q2_grad)

        logger.add_histogram('policy/log_prob', log_prob)
        logger.add_histogram('policy/mean', dist.loc)
        # dist.scale already is the standard deviation (exp(log_std)); log it directly.
        logger.add_histogram('policy/std', dist.scale)
        logger.add_histogram('v/value', v_batch)
        logger.add_histogram('q1/value', q1_batch)
        logger.add_histogram('q2/value', q2_batch)

        logger.log(step=batcher.num_steps)

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=40000000), HTML(value='')), layout=Layout(dis…


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   121.18
Env/Length/Episode (New)                            |    24.49
Env/Reward/Episode (Last 50)                        |   178.31
Env/Length/Episode (Last 50)                        |    34.74
policy/loss                                         | -1071.56
v/loss                                              |  3609.77
q1/loss                                             |  8477.33
q2/loss                                             |  8528.21
policy/grad                                         |     0.01
v/grad                                              |  -375.73
q1/grad                                             |   160.64
q2/grad                                             |    48.89
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   379.81
Env/Length/Episode (New)                            |    78.71
Env/Reward/Episode (Last 50)                        |   376.54
Env/Length/Episode (Last 50)                        |    78.08
policy/loss                                         | -2135.46
v/loss                                              |  7274.07
q1/loss                                             |  7576.76
q2/loss                                             |  9372.70
policy/grad                                         |    -0.01
v/grad                                              |  1775.26
q1/grad                                             |    97.51
q2/grad                                             |  -486.52
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   437.22
Env/Length/Episode (New)                            |    88.09
Env/Reward/Episode (Last 50)                        |   440.59
Env/Length/Episode (Last 50)                        |    88.56
policy/loss                                         | -2538.03
v/loss                                              |  3858.49
q1/loss                                             | 12747.73
q2/loss                                             | 13180.04
policy/grad                                         |     0.01
v/grad                                              |  -270.66
q1/grad                                             |  -389.09
q2/grad                                             |  -182.70
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   547.01
Env/Length/Episode (New)                            |   110.11
Env/Reward/Episode (Last 50)                        |   530.44
Env/Length/Episode (Last 50)                        |   106.32
policy/loss                                         | -3025.18
v/loss                                              |  3673.85
q1/loss                                             |  7223.64
q2/loss                                             |  7452.43
policy/grad                                         |    -0.02
v/grad                                              |   261.32
q1/grad                                             |   748.79
q2/grad                                             |   395.08
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   472.94
Env/Length/Episode (New)                            |    98.44
Env/Reward/Episode (Last 50)                        |   477.25
Env/Length/Episode (Last 50)                        |    98.32
policy/loss                                         | -3178.41
v/loss                                              |  4266.91
q1/loss                                             |  6185.70
q2/loss                                             |  6122.18
policy/grad                                         |    -0.00
v/grad                                              |   921.86
q1/grad                                             |   -67.88
q2/grad                                             |   -75.42
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   644.77
Env/Length/Episode (New)                            |   131.70
Env/Reward/Episode (Last 50)                        |   654.50
Env/Length/Episode (Last 50)                        |   132.88
policy/loss                                         | -3017.27
v/loss                                              |  5098.58
q1/loss                                             | 31413.42
q2/loss                                             | 32794.40
policy/grad                                         |    -0.02
v/grad                                              |   642.67
q1/grad                                             |   703.30
q2/grad                                             |   441.52
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   814.56
Env/Length/Episode (New)                            |   163.68
Env/Reward/Episode (Last 50)                        |   707.26
Env/Length/Episode (Last 50)                        |   141.56
policy/loss                                         | -3353.17
v/loss                                              | 13482.08
q1/loss                                             | 11647.98
q2/loss                                             | 12810.77
policy/grad                                         |    -0.01
v/grad                                              | -1170.64
q1/grad                                             |  -643.57
q2/grad                                             |  -466.31
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   681.27
Env/Length/Episode (New)                            |   133.13
Env/Reward/Episode (Last 50)                        |   681.87
Env/Length/Episode (Last 50)                        |   133.08
policy/loss                                         | -3431.60
v/loss                                              |  4071.11
q1/loss                                             | 12548.01
q2/loss                                             | 12891.91
policy/grad                                         |     0.01
v/grad                                              |   659.92
q1/grad                                             |  -690.16
q2/grad                                             |  -190.21
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   714.61
Env/Length/Episode (New)                            |   141.21
Env/Reward/Episode (Last 50)                        |   780.09
Env/Length/Episode (Last 50)                        |   154.58
policy/loss                                         | -3556.01
v/loss                                              |  5495.24
q1/loss                                             | 10303.03
q2/loss                                             | 11762.28
policy/grad                                         |    -0.01
v/grad                                              |  1616.88
q1/grad                                             | -1371.30
q2/grad                                             | -1363.77
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   849.47
Env/Length/Episode (New)                            |   171.25
Env/Reward/Episode (Last 50)                        |   866.10
Env/Length/Episode (Last 50)                        |   172.20
policy/loss                                         | -3977.36
v/loss                                              |  6099.03
q1/loss                                             | 11234.90
q2/loss                                             | 10989.29
policy/grad                                         |    -0.02
v/grad                                              |  1159.71
q1/grad                                             |   252.03
q2/grad                                             |   128.97
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |   793.22
Env/Length/Episode (New)                            |   158.23
Env/Reward/Episode (Last 50)                        |   957.31
Env/Length/Episode (Last 50)                        |   191.08
policy/loss                                         | -3882.23
v/loss                                              |  6116.76
q1/loss                                             | 10610.53
q2/loss                                             | 10264.12
policy/grad                                         |    -0.02
v/grad                                              |  1681.17
q1/grad                                             |  -337.65
q2/grad                                             |    35.44
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  1218.59
Env/Length/Episode (New)                            |   243.88
Env/Reward/Episode (Last 50)                        |  1268.07
Env/Length/Episode (Last 50)                        |   251.28
policy/loss                                         | -4298.20
v/loss                                              |  9957.43
q1/loss                                             | 18361.29
q2/loss                                             | 18449.90
policy/grad                                         |    -0.02
v/grad                                              | -2066.41
q1/grad                                             |   522.92
q2/grad                                             |   435.27
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  1408.62
Env/Length/Episode (New)                            |   282.00
Env/Reward/Episode (Last 50)                        |  1347.96
Env/Length/Episode (Last 50)                        |   271.76
policy/loss                                         | -4667.45
v/loss                                              | 25304.14
q1/loss                                             | 16575.69
q2/loss                                             | 16667.47
policy/grad                                         |     0.02
v/grad                                              | -6126.04
q1/grad                                             |  1706.17
q2/grad                                             |  1612.70
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  1245.08
Env/Length/Episode (New)                            |   243.17
Env/Reward/Episode (Last 50)                        |  1652.76
Env/Length/Episode (Last 50)                        |   328.50
policy/loss                                         | -5050.30
v/loss                                              |  6127.49
q1/loss                                             | 22809.15
q2/loss                                             | 24453.50
policy/grad                                         |     0.00
v/grad                                              |  -139.52
q1/grad                                             |   665.60
q2/grad                                             |   904.71
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2492.54
Env/Length/Episode (New)                            |   498.38
Env/Reward/Episode (Last 50)                        |  2243.95
Env/Length/Episode (Last 50)                        |   445.12
policy/loss                                         | -5046.15
v/loss                                              |  6620.61
q1/loss                                             | 16226.08
q2/loss                                             | 15604.48
policy/grad                                         |    -0.01
v/grad                                              |  -987.27
q1/grad                                             |  1146.49
q2/grad                                             |   522.71
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  1926.96
Env/Length/Episode (New)                            |   379.50
Env/Reward/Episode (Last 50)                        |  2197.82
Env/Length/Episode (Last 50)                        |   435.72
policy/loss                                         | -5121.60
v/loss                                              |  9012.69
q1/loss                                             | 18162.49
q2/loss                                             | 20087.64
policy/grad                                         |     0.03
v/grad                                              | -1774.17
q1/grad                                             |  1284.40
q2/grad                                             |  1383.99
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2405.97
Env/Length/Episode (New)                            |   468.12
Env/Reward/Episode (Last 50)                        |  2839.10
Env/Length/Episode (Last 50)                        |   553.90
policy/loss                                         | -5358.92
v/loss                                              |  7203.07
q1/loss                                             | 16828.64
q2/loss                                             | 16899.02
policy/grad                                         |     0.02
v/grad                                              |   196.73
q1/grad                                             |    18.12
q2/grad                                             |   113.88
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  1949.36
Env/Length/Episode (New)                            |   375.82
Env/Reward/Episode (Last 50)                        |  2383.10
Env/Length/Episode (Last 50)                        |   460.16
policy/loss                                         | -5355.61
v/loss                                              |  5973.74
q1/loss                                             | 22047.38
q2/loss                                             | 17524.64
policy/grad                                         |    -0.03
v/grad                                              |   990.37
q1/grad                                             |   338.24
q2/grad                                             |   -32.56
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4011.98
Env/Length/Episode (New)                            |   771.33
Env/Reward/Episode (Last 50)                        |  3494.99
Env/Length/Episode (Last 50)                        |   675.82
policy/loss                                         | -5280.72
v/loss                                              | 12498.31
q1/loss                                             | 19994.70
q2/loss                                             | 23162.39
policy/grad                                         |    -0.05
v/grad                                              |   287.48
q1/grad                                             |   282.78
q2/grad                                             |  -206.08
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2587.43
Env/Length/Episode (New)                            |   491.38
Env/Reward/Episode (Last 50)                        |  2759.86
Env/Length/Episode (Last 50)                        |   526.56
policy/loss                                         | -5691.58
v/loss                                              | 13815.54
q1/loss                                             | 25457.74
q2/loss                                             | 24842.08
policy/grad                                         |     0.03
v/grad                                              | -3396.20
q1/grad                                             |  1250.62
q2/grad                                             |   820.66
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3438.47
Env/Length/Episode (New)                            |   648.00
Env/Reward/Episode (Last 50)                        |  2895.66
Env/Length/Episode (Last 50)                        |   554.32
policy/loss                                         | -5995.30
v/loss                                              | 15331.59
q1/loss                                             | 26886.15
q2/loss                                             | 27820.18
policy/grad                                         |     0.01
v/grad                                              |  1618.72
q1/grad                                             |  -597.55
q2/grad                                             | -1631.86
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3095.07
Env/Length/Episode (New)                            |   590.00
Env/Reward/Episode (Last 50)                        |  2915.02
Env/Length/Episode (Last 50)                        |   556.52
policy/loss                                         | -6351.99
v/loss                                              | 14832.57
q1/loss                                             | 16815.44
q2/loss                                             | 17957.47
policy/grad                                         |     0.05
v/grad                                              | -3519.86
q1/grad                                             |   641.85
q2/grad                                             |  1451.54
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2128.04
Env/Length/Episode (New)                            |   410.67
Env/Reward/Episode (Last 50)                        |  2650.61
Env/Length/Episode (Last 50)                        |   506.04
policy/loss                                         | -6383.82
v/loss                                              |  8801.83
q1/loss                                             | 19065.43
q2/loss                                             | 18971.91
policy/grad                                         |    -0.06
v/grad                                              | -2217.81
q1/grad                                             |  -342.48
q2/grad                                             |   455.42
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3246.39
Env/Length/Episode (New)                            |   634.33
Env/Reward/Episode (Last 50)                        |  3249.84
Env/Length/Episode (Last 50)                        |   618.94
policy/loss                                         | -6438.49
v/loss                                              | 12142.27
q1/loss                                             | 28490.59
q2/loss                                             | 23511.70
policy/grad                                         |    -0.04
v/grad                                              |  -738.04
q1/grad                                             |  -348.86
q2/grad                                             |  -231.05
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2696.42
Env/Length/Episode (New)                            |   509.25
Env/Reward/Episode (Last 50)                        |  3224.90
Env/Length/Episode (Last 50)                        |   609.98
policy/loss                                         | -6206.79
v/loss                                              |  9308.51
q1/loss                                             | 30053.04
q2/loss                                             | 24931.86
policy/grad                                         |     0.06
v/grad                                              |  2027.66
q1/grad                                             | -2237.97
q2/grad                                             | -1912.90
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3061.81
Env/Length/Episode (New)                            |   599.43
Env/Reward/Episode (Last 50)                        |  2901.51
Env/Length/Episode (Last 50)                        |   558.30
policy/loss                                         | -6533.31
v/loss                                              | 10564.31
q1/loss                                             | 24926.81
q2/loss                                             | 27154.28
policy/grad                                         |    -0.06
v/grad                                              |  2772.25
q1/grad                                             |  -760.99
q2/grad                                             |  -232.35
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2551.48
Env/Length/Episode (New)                            |   485.57
Env/Reward/Episode (Last 50)                        |  2863.44
Env/Length/Episode (Last 50)                        |   549.44
policy/loss                                         | -6815.43
v/loss                                              |  8519.67
q1/loss                                             | 18990.33
q2/loss                                             | 20331.84
policy/grad                                         |     0.00
v/grad                                              | -1090.52
q1/grad                                             |   502.16
q2/grad                                             |   416.58
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3057.44
Env/Length/Episode (New)                            |   585.67
Env/Reward/Episode (Last 50)                        |  3236.57
Env/Length/Episode (Last 50)                        |   626.56
policy/loss                                         | -7128.33
v/loss                                              | 11017.22
q1/loss                                             | 12843.30
q2/loss                                             | 11879.28
policy/grad                                         |    -0.02
v/grad                                              | -3602.15
q1/grad                                             |  1178.90
q2/grad                                             |   616.58
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3600.46
Env/Length/Episode (New)                            |   687.33
Env/Reward/Episode (Last 50)                        |  3204.52
Env/Length/Episode (Last 50)                        |   625.12
policy/loss                                         | -6764.71
v/loss                                              |  5535.52
q1/loss                                             | 13675.78
q2/loss                                             |  7797.74
policy/grad                                         |    -0.01
v/grad                                              |   186.41
q1/grad                                             | -1588.74
q2/grad                                             |  -367.78
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3719.83
Env/Length/Episode (New)                            |   695.43
Env/Reward/Episode (Last 50)                        |  3567.72
Env/Length/Episode (Last 50)                        |   677.00
policy/loss                                         | -7187.10
v/loss                                              | 16803.28
q1/loss                                             | 30738.44
q2/loss                                             | 31755.86
policy/grad                                         |    -0.07
v/grad                                              |  3305.35
q1/grad                                             |  -992.21
q2/grad                                             | -1506.09
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4328.57
Env/Length/Episode (New)                            |   818.60
Env/Reward/Episode (Last 50)                        |  3638.03
Env/Length/Episode (Last 50)                        |   688.26
policy/loss                                         | -7440.87
v/loss                                              |  6864.04
q1/loss                                             | 12729.04
q2/loss                                             | 14445.44
policy/grad                                         |    -0.01
v/grad                                              |   447.72
q1/grad                                             |  -102.01
q2/grad                                             |  -118.98
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  2416.73
Env/Length/Episode (New)                            |   461.14
Env/Reward/Episode (Last 50)                        |  3064.69
Env/Length/Episode (Last 50)                        |   583.74
policy/loss                                         | -7158.03
v/loss                                              |  8321.10
q1/loss                                             | 20766.14
q2/loss                                             | 18081.94
policy/grad                                         |    -0.04
v/grad                                              | -1620.71
q1/grad                                             |  1266.48
q2/grad                                             |  1015.41
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3826.07
Env/Length/Episode (New)                            |   726.80
Env/Reward/Episode (Last 50)                        |  3893.59
Env/Length/Episode (Last 50)                        |   735.22
policy/loss                                         | -7486.17
v/loss                                              | 16253.33
q1/loss                                             | 21257.12
q2/loss                                             | 21912.03
policy/grad                                         |    -0.02
v/grad                                              |  4368.38
q1/grad                                             | -1645.86
q2/grad                                             | -1660.93
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                           |   4286.28
Env/Length/Episode (New)                           |    820.40
Env/Reward/Episode (Last 50)                       |   4308.58
Env/Length/Episode (Last 50)                       |    813.70
policy/loss                                        |  -7216.17
v/loss                                             |   5283.92
q1/loss                                            | 588889.06
q2/loss                                            | 595656.38
policy/grad                                        |     -0.01
v/grad                                             |     77.68
q1/grad                                            |   2228.56
q2/grad                                            |   2355.81
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3103.88
Env/Length/Episode (New)                            |   589.57
Env/Reward/Episode (Last 50)                        |  4288.13
Env/Length/Episode (Last 50)                        |   809.06
policy/loss                                         | -7794.43
v/loss                                              |  4694.55
q1/loss                                             | 12240.68
q2/loss                                             |  9995.11
policy/grad                                         |    -0.03
v/grad                                              |  -580.85
q1/grad                                             |   113.98
q2/grad                                             |   550.02
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4866.63
Env/Length/Episode (New)                            |   921.80
Env/Reward/Episode (Last 50)                        |  4184.44
Env/Length/Episode (Last 50)                        |   792.18
policy/loss                                         | -7755.92
v/loss                                              |  5209.96
q1/loss                                             | 18079.68
q2/loss                                             | 14687.42
policy/grad                                         |     0.01
v/grad                                              | -1675.58
q1/grad                                             |   131.45
q2/grad                                             |   228.53
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4154.27
Env/Length/Episode (New)                            |   780.60
Env/Reward/Episode (Last 50)                        |  4018.09
Env/Length/Episode (Last 50)                        |   761.38
policy/loss                                         | -7728.66
v/loss                                              |  7760.10
q1/loss                                             | 15147.27
q2/loss                                             | 18380.06
policy/grad                                         |    -0.00
v/grad                                              |  -769.59
q1/grad                                             |  -516.97
q2/grad                                             |   465.03
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4273.89
Env/Length/Episode (New)                            |   817.80
Env/Reward/Episode (Last 50)                        |  3971.16
Env/Length/Episode (Last 50)                        |   745.76
policy/loss                                         | -7725.40
v/loss                                              |  8557.05
q1/loss                                             | 21025.47
q2/loss                                             | 17454.13
policy/grad                                         |     0.07
v/grad                                              |   912.15
q1/grad                                             |  -942.62
q2/grad                                             |  -686.44
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5102.72
Env/Length/Episode (New)                            |   923.25
Env/Reward/Episode (Last 50)                        |  4057.50
Env/Length/Episode (Last 50)                        |   746.62
policy/loss                                         | -7916.68
v/loss                                              |  6229.81
q1/loss                                             | 12909.37
q2/loss                                             | 11223.59
policy/grad                                         |    -0.01
v/grad                                              |   297.53
q1/grad                                             |  -501.25
q2/grad                                             |  -660.85
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4257.63
Env/Length/Episode (New)                            |   766.25
Env/Reward/Episode (Last 50)                        |  3646.05
Env/Length/Episode (Last 50)                        |   671.22
policy/loss                                         | -7714.76
v/loss                                              |  5521.52
q1/loss                                             | 14489.20
q2/loss                                             | 12516.44
policy/grad                                         |    -0.01
v/grad                                              |  -220.97
q1/grad                                             |  -413.99
q2/grad                                             |  -529.10
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3779.59
Env/Length/Episode (New)                            |   683.83
Env/Reward/Episode (Last 50)                        |  3738.24
Env/Length/Episode (Last 50)                        |   686.74
policy/loss                                         | -7996.53
v/loss                                              |  8356.35
q1/loss                                             | 12530.21
q2/loss                                             | 19124.89
policy/grad                                         |     0.02
v/grad                                              | -2249.61
q1/grad                                             |   742.45
q2/grad                                             |    93.98
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4058.86
Env/Length/Episode (New)                            |   741.40
Env/Reward/Episode (Last 50)                        |  3272.33
Env/Length/Episode (Last 50)                        |   602.14
policy/loss                                         | -7860.06
v/loss                                              |  8269.83
q1/loss                                             | 13219.56
q2/loss                                             | 13160.43
policy/grad                                         |    -0.01
v/grad                                              |   576.02
q1/grad                                             |  -479.91
q2/grad                                             |  -453.25
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4694.87
Env/Length/Episode (New)                            |   872.75
Env/Reward/Episode (Last 50)                        |  3807.83
Env/Length/Episode (Last 50)                        |   704.24
policy/loss                                         | -7992.64
v/loss                                              | 11265.33
q1/loss                                             | 13829.15
q2/loss                                             | 14521.29
policy/grad                                         |    -0.01
v/grad                                              |   820.30
q1/grad                                             |  -543.87
q2/grad                                             |  -463.77
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5271.24
Env/Length/Episode (New)                            |   960.25
Env/Reward/Episode (Last 50)                        |  4835.84
Env/Length/Episode (Last 50)                        |   886.00
policy/loss                                         | -7896.74
v/loss                                              |  3940.21
q1/loss                                             | 11905.53
q2/loss                                             | 12705.56
policy/grad                                         |    -0.07
v/grad                                              |  -239.74
q1/grad                                             | -1392.06
q2/grad                                             | -1826.62
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4866.45
Env/Length/Episode (New)                            |   885.75
Env/Reward/Episode (Last 50)                        |  4649.38
Env/Length/Episode (Last 50)                        |   841.08
policy/loss                                         | -7704.78
v/loss                                              | 10387.02
q1/loss                                             | 42761.13
q2/loss                                             | 37959.73
policy/grad                                         |     0.00
v/grad                                              | -2558.83
q1/grad                                             |  1002.41
q2/grad                                             |  1493.32
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4313.97
Env/Length/Episode (New)                            |   796.67
Env/Reward/Episode (Last 50)                        |  4552.64
Env/Length/Episode (Last 50)                        |   829.08
policy/loss                                         | -7977.40
v/loss                                              |  7959.07
q1/loss                                             | 15223.92
q2/loss                                             | 12455.34
policy/grad                                         |    -0.02
v/grad                                              |  -659.87
q1/grad                                             |  -131.92
q2/grad                                             |  -554.81
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3838.55
Env/Length/Episode (New)                            |   690.86
Env/Reward/Episode (Last 50)                        |  4533.84
Env/Length/Episode (Last 50)                        |   819.40
policy/loss                                         | -7957.25
v/loss                                              |  5610.64
q1/loss                                             | 14904.37
q2/loss                                             |  9819.08
policy/grad                                         |    -0.06
v/grad                                              | -1429.55
q1/grad                                             |   918.96
q2/grad                                             |    46.10
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5556.52
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  4823.65
Env/Length/Episode (Last 50)                        |   870.68
policy/loss                                         | -7949.09
v/loss                                              | 19548.39
q1/loss                                             | 13755.74
q2/loss                                             | 13611.50
policy/grad                                         |    -0.01
v/grad                                              | -4716.46
q1/grad                                             |  1309.72
q2/grad                                             |   805.74
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5590.69
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5121.01
Env/Length/Episode (Last 50)                        |   922.12
policy/loss                                         | -7862.24
v/loss                                              |  6354.64
q1/loss                                             | 14777.66
q2/loss                                             | 12596.85
policy/grad                                         |    -0.02
v/grad                                              | -1390.93
q1/grad                                             |  -861.80
q2/grad                                             | -1469.07
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5599.67
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5062.06
Env/Length/Episode (Last 50)                        |   908.28
policy/loss                                         | -7841.18
v/loss                                              | 11214.53
q1/loss                                             | 14554.54
q2/loss                                             | 16657.38
policy/grad                                         |     0.02
v/grad                                              |  -423.86
q1/grad                                             |   276.77
q2/grad                                             |   554.29
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3809.34
Env/Length/Episode (New)                            |   695.33
Env/Reward/Episode (Last 50)                        |  4571.70
Env/Length/Episode (Last 50)                        |   822.28
policy/loss                                         | -7822.12
v/loss                                              |  7602.89
q1/loss                                             | 10902.43
q2/loss                                             |  9335.22
policy/grad                                         |     0.05
v/grad                                              | -2176.44
q1/grad                                             |  -830.06
q2/grad                                             |  -460.28
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4234.01
Env/Length/Episode (New)                            |   759.00
Env/Reward/Episode (Last 50)                        |  4641.58
Env/Length/Episode (Last 50)                        |   835.04
policy/loss                                         | -7936.25
v/loss                                              |  9570.49
q1/loss                                             | 11732.00
q2/loss                                             | 14528.49
policy/grad                                         |    -0.03
v/grad                                              | -2042.60
q1/grad                                             |   640.14
q2/grad                                             |   225.62
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5001.04
Env/Length/Episode (New)                            |   888.50
Env/Reward/Episode (Last 50)                        |  4519.09
Env/Length/Episode (Last 50)                        |   808.82
policy/loss                                         | -8085.25
v/loss                                              | 12191.27
q1/loss                                             | 12293.03
q2/loss                                             | 13732.74
policy/grad                                         |     0.02
v/grad                                              | -1165.06
q1/grad                                             |    60.32
q2/grad                                             |   132.57
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5477.66
Env/Length/Episode (New)                            |   983.00
Env/Reward/Episode (Last 50)                        |  4763.54
Env/Length/Episode (Last 50)                        |   852.14
policy/loss                                         | -7996.01
v/loss                                              | 20483.45
q1/loss                                             | 15119.37
q2/loss                                             | 10093.69
policy/grad                                         |     0.00
v/grad                                              |  2504.80
q1/grad                                             | -1459.52
q2/grad                                             |  -693.13
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5645.73
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  4904.52
Env/Length/Episode (Last 50)                        |   887.70
policy/loss                                         | -8124.99
v/loss                                              |  7224.60
q1/loss                                             |  8328.24
q2/loss                                             | 11277.10
policy/grad                                         |     0.01
v/grad                                              | -1880.35
q1/grad                                             |   529.33
q2/grad                                             |   639.67
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  3932.51
Env/Length/Episode (New)                            |   714.40
Env/Reward/Episode (Last 50)                        |  4179.67
Env/Length/Episode (Last 50)                        |   750.36
policy/loss                                         | -8027.45
v/loss                                              | 15907.47
q1/loss                                             | 13577.07
q2/loss                                             | 10095.84
policy/grad                                         |    -0.02
v/grad                                              | -4078.35
q1/grad                                             |  1325.81
q2/grad                                             |   633.94
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5522.07
Env/Length/Episode (New)                            |   999.50
Env/Reward/Episode (Last 50)                        |  4705.91
Env/Length/Episode (Last 50)                        |   839.62
policy/loss                                         | -7845.53
v/loss                                              | 18681.93
q1/loss                                             | 22268.96
q2/loss                                             | 28031.53
policy/grad                                         |     0.01
v/grad                                              | -2465.75
q1/grad                                             |  1123.18
q2/grad                                             |  1223.67
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5567.09
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5080.39
Env/Length/Episode (Last 50)                        |   910.58
policy/loss                                         | -7901.28
v/loss                                              | 10608.94
q1/loss                                             | 14294.40
q2/loss                                             | 19419.90
policy/grad                                         |     0.01
v/grad                                              | -1570.82
q1/grad                                             |  2149.47
q2/grad                                             |  1904.88
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5530.63
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5131.17
Env/Length/Episode (Last 50)                        |   921.52
policy/loss                                         | -7985.71
v/loss                                              |  3494.01
q1/loss                                             | 12276.58
q2/loss                                             | 23646.40
policy/grad                                         |     0.03
v/grad                                              |  -428.11
q1/grad                                             |  -839.60
q2/grad                                             |  -504.91
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5179.74
Env/Length/Episode (New)                            |   931.75
Env/Reward/Episode (Last 50)                        |  4974.74
Env/Length/Episode (Last 50)                        |   896.82
policy/loss                                         | -7950.10
v/loss                                              |  5897.01
q1/loss                                             |  5470.27
q2/loss                                             |  7157.86
policy/grad                                         |    -0.05
v/grad                                              |  -803.09
q1/grad                                             |  -253.96
q2/grad                                             |   112.33
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                           |   4583.47
Env/Length/Episode (New)                           |    827.50
Env/Reward/Episode (Last 50)                       |   5123.64
Env/Length/Episode (Last 50)                       |    924.76
policy/loss                                        |  -8097.20
v/loss                                             |  13285.60
q1/loss                                            | 372652.25
q2/loss                                            | 401217.78
policy/grad                                        |     -0.03
v/grad                                             |   -272.66
q1/grad                                            |   2147.22
q2/grad                                            |   1793.36
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5621.85
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  4848.42
Env/Length/Episode (Last 50)                        |   866.94
policy/loss                                         | -7814.63
v/loss                                              |  9430.94
q1/loss                                             | 34433.97
q2/loss                                             | 30712.77
policy/grad                                         |     0.03
v/grad                                              |  1802.53
q1/grad                                             | -1389.16
q2/grad                                             | -1707.96
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5568.73
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5306.03
Env/Length/Episode (Last 50)                        |   950.54
policy/loss                                         | -7937.35
v/loss                                              |  7839.04
q1/loss                                             | 13252.40
q2/loss                                             | 11202.13
policy/grad                                         |    -0.00
v/grad                                              |   426.56
q1/grad                                             | -1044.95
q2/grad                                             | -1562.13
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4433.29
Env/Length/Episode (New)                            |   804.60
Env/Reward/Episode (Last 50)                        |  5342.06
Env/Length/Episode (Last 50)                        |   957.22
policy/loss                                         | -8018.47
v/loss                                              |  5537.83
q1/loss                                             | 12587.38
q2/loss                                             |  7533.87
policy/grad                                         |    -0.02
v/grad                                              |  -795.10
q1/grad                                             |  -320.68
q2/grad                                             |  -645.81
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4894.84
Env/Length/Episode (New)                            |   862.60
Env/Reward/Episode (Last 50)                        |  4986.06
Env/Length/Episode (Last 50)                        |   891.52
policy/loss                                         | -7968.25
v/loss                                              |  7143.19
q1/loss                                             | 10814.61
q2/loss                                             | 10172.52
policy/grad                                         |     0.02
v/grad                                              | -2074.24
q1/grad                                             |   136.19
q2/grad                                             |   -44.21
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5634.40
Env/Length/Episode (New)                            |   999.25
Env/Reward/Episode (Last 50)                        |  5099.63
Env/Length/Episode (Last 50)                        |   899.22
policy/loss                                         | -7882.94
v/loss                                              |  7493.60
q1/loss                                             |  9237.82
q2/loss                                             |  6645.61
policy/grad                                         |    -0.05
v/grad                                              | -1486.44
q1/grad                                             |  -375.36
q2/grad                                             |  -552.62
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4838.35
Env/Length/Episode (New)                            |   845.60
Env/Reward/Episode (Last 50)                        |  5500.54
Env/Length/Episode (Last 50)                        |   966.24
policy/loss                                         | -8159.27
v/loss                                              |  7179.66
q1/loss                                             |  7350.26
q2/loss                                             |  7018.14
policy/grad                                         |     0.00
v/grad                                              | -2669.54
q1/grad                                             |  1551.78
q2/grad                                             |   881.75
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5773.76
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5293.12
Env/Length/Episode (Last 50)                        |   922.34
policy/loss                                         | -7920.82
v/loss                                              | 18992.65
q1/loss                                             | 14547.81
q2/loss                                             | 17415.48
policy/grad                                         |     0.02
v/grad                                              |   689.88
q1/grad                                             |  -833.64
q2/grad                                             |  -650.04
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4687.62
Env/Length/Episode (New)                            |   830.00
Env/Reward/Episode (Last 50)                        |  5251.19
Env/Length/Episode (Last 50)                        |   918.50
policy/loss                                         | -8064.03
v/loss                                              |  6463.51
q1/loss                                             | 11430.35
q2/loss                                             |  9717.38
policy/grad                                         |     0.03
v/grad                                              |  -677.36
q1/grad                                             |  -112.96
q2/grad                                             |    -5.12
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4558.94
Env/Length/Episode (New)                            |   803.50
Env/Reward/Episode (Last 50)                        |  4661.12
Env/Length/Episode (Last 50)                        |   819.12
policy/loss                                         | -8065.23
v/loss                                              |  5779.26
q1/loss                                             |  9393.78
q2/loss                                             |  8097.22
policy/grad                                         |     0.06
v/grad                                              |   863.73
q1/grad                                             |  -382.92
q2/grad                                             |   147.73
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5749.05
Env/Length/Episode (New)                            |   990.00
Env/Reward/Episode (Last 50)                        |  5017.84
Env/Length/Episode (Last 50)                        |   874.88
policy/loss                                         | -8144.07
v/loss                                              |  2157.22
q1/loss                                             |  7748.20
q2/loss                                             |  6827.33
policy/grad                                         |    -0.01
v/grad                                              |  -119.72
q1/grad                                             |   980.50
q2/grad                                             |   704.26
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4309.47
Env/Length/Episode (New)                            |   758.60
Env/Reward/Episode (Last 50)                        |  5247.33
Env/Length/Episode (Last 50)                        |   912.58
policy/loss                                         | -8033.36
v/loss                                              |  4546.67
q1/loss                                             | 13868.13
q2/loss                                             | 12410.72
policy/grad                                         |     0.00
v/grad                                              |  -505.96
q1/grad                                             |   910.63
q2/grad                                             |   257.59
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4478.80
Env/Length/Episode (New)                            |   791.20
Env/Reward/Episode (Last 50)                        |  5033.99
Env/Length/Episode (Last 50)                        |   884.58
policy/loss                                         | -7962.19
v/loss                                              |  8657.40
q1/loss                                             | 12684.73
q2/loss                                             | 11372.69
policy/grad                                         |    -0.00
v/grad                                              |   128.48
q1/grad                                             | -1294.25
q2/grad                                             | -1031.14
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4263.29
Env/Length/Episode (New)                            |   754.00
Env/Reward/Episode (Last 50)                        |  5088.91
Env/Length/Episode (Last 50)                        |   884.50
policy/loss                                         | -8125.19
v/loss                                              |  5356.37
q1/loss                                             | 10382.73
q2/loss                                             | 11964.29
policy/grad                                         |     0.06
v/grad                                              |   319.23
q1/grad                                             | -1383.94
q2/grad                                             | -1222.77
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                           | 


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4345.62
Env/Length/Episode (New)                            |   753.80
Env/Reward/Episode (Last 50)                        |  4641.44
Env/Length/Episode (Last 50)                        |   812.12
policy/loss                                         | -8062.68
v/loss                                              |  5943.91
q1/loss                                             | 17105.22
q2/loss                                             | 12221.91
policy/grad                                         |    -0.03
v/grad                                              |   428.67
q1/grad                                             |  -376.78
q2/grad                                             |   -21.33
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5782.07
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5077.72
Env/Length/Episode (Last 50)                        |   874.30
policy/loss                                         | -7993.03
v/loss                                              |  8418.08
q1/loss                                             | 18553.38
q2/loss                                             | 16435.84
policy/grad                                         |    -0.03
v/grad                                              |  -154.04
q1/grad                                             |   122.35
q2/grad                                             |  -243.74
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4758.74
Env/Length/Episode (New)                            |   825.40
Env/Reward/Episode (Last 50)                        |  5114.03
Env/Length/Episode (Last 50)                        |   889.72
policy/loss                                         | -7998.02
v/loss                                              |  5189.73
q1/loss                                             | 20971.95
q2/loss                                             | 21851.54
policy/grad                                         |     0.04
v/grad                                              |  1596.01
q1/grad                                             | -1215.09
q2/grad                                             | -1129.13
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5818.78
Env/Length/Episode (New)                            |  1000.00
Env/Reward/Episode (Last 50)                        |  5302.41
Env/Length/Episode (Last 50)                        |   919.10
policy/loss                                         | -8116.57
v/loss                                              | 11948.73
q1/loss                                             |  8101.26
q2/loss                                             | 13993.79
policy/grad                                         |    -0.00
v/grad                                              | -1845.24
q1/grad                                             |   305.43
q2/grad                                             |   508.23
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                          |  


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  5206.27
Env/Length/Episode (New)                            |   898.80
Env/Reward/Episode (Last 50)                        |  5331.57
Env/Length/Episode (Last 50)                        |   919.04
policy/loss                                         | -8166.26
v/loss                                              |  8062.24
q1/loss                                             | 11660.27
q2/loss                                             |  6286.27
policy/grad                                         |     0.02
v/grad                                              | -1443.33
q1/grad                                             |   344.43
q2/grad                                             |   219.62
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                           | 


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4614.19
Env/Length/Episode (New)                            |   791.60
Env/Reward/Episode (Last 50)                        |  4916.89
Env/Length/Episode (Last 50)                        |   852.10
policy/loss                                         | -8197.66
v/loss                                              |  8437.56
q1/loss                                             | 14729.13
q2/loss                                             |  8293.15
policy/grad                                         |     0.03
v/grad                                              |  1262.92
q1/grad                                             |  -805.08
q2/grad                                             | -1196.58
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |


--------------------------------------------------------------
Env/Reward/Episode (New)                            |  4411.86
Env/Length/Episode (New)                            |   771.20
Env/Reward/Episode (Last 50)                        |  4885.32
Env/Length/Episode (Last 50)                        |   846.98
policy/loss                                         | -8003.99
v/loss                                              |  4958.33
q1/loss                                             | 17207.22
q2/loss                                             | 10595.80
policy/grad                                         |    -0.00
v/grad                                              |   216.82
q1/grad                                             |  -255.74
q2/grad                                             |  -279.51
--------------------------------------------------------------

--------------------------------------------------------------
Env/Reward/Episode (New)                            |