In [1]:
import argparse
import gym
import datetime
import os
import random
import tempfile
import numpy as np
import pickle

import ray
from ray import tune
from ray.tune.logger import Logger, UnifiedLogger, pretty_print
from ray.rllib.env.multi_agent_env import make_multi_agent
from ray.rllib.examples.models.shared_weights_model import TF2SharedWeightsModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib.agents.ppo import ppo
from ray.rllib.models import ModelCatalog
from environment_rllib import MyEnv
from settings.initial_settings import *
from settings.reset_conditions import reset_conditions
#from modules.models import MyConv2DModel_v0B_Small_CBAM_1DConv_Share
from modules.models import DenseNetModelLarge
from tensorflow.keras.utils import plot_model
from modules.savers import save_conditions

import matplotlib.pyplot as plt

import tensorflow as tf

# Experiment identity and run switches used throughout the notebook.
PROJECT = "UCAV"            # root folder name for conditions, logs, models, checkpoints and results
TRIAL_ID = 2
TRIAL = f'test_{TRIAL_ID}'  # run label embedded in log directory, checkpoint and output file names
EVAL_FREQ = 1               # an evaluation pass runs whenever the training iteration is a multiple of this
CONTINUAL = False           # when True, the trainer is restored from a saved checkpoint before training

def custom_log_creator(custom_path, custom_str):
    """Build a logger factory that logs into a fresh directory under ``custom_path``.

    Args:
        custom_path: Directory that will hold the per-run log directories.
            It is created on demand when the returned factory is called.
        custom_str: Label placed at the start of each log directory name.

    Returns:
        A callable ``logger_creator(config)`` that creates a new, uniquely
        named sub-directory of ``custom_path`` and returns a ``UnifiedLogger``
        writing into it.
    """
    # The timestamp is taken once, when the factory is built; every logger
    # produced by this factory shares the same "<label>_<timestamp>" prefix.
    timestr = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    logdir_prefix = "{}_{}".format(custom_str, timestr)

    def logger_creator(config):
        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test when several processes start at once.
        os.makedirs(custom_path, exist_ok=True)
        # mkdtemp appends a random suffix, so each call gets its own directory.
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=custom_path)
        return UnifiedLogger(config, logdir, loggers=None)

    return logger_creator

# Shut down any Ray runtime left over from a previous run of this cell, then
# start a new one.
ray.shutdown()
ray.init(ignore_reinit_error=True, log_to_driver=False)

ModelCatalog.register_custom_model('my_model', DenseNetModelLarge)

# Earlier configuration, kept for reference:
# config = {"env": MyEnv,
#           "num_workers": NUM_WORKERS,
#           "num_gpus": NUM_GPUS,
#           "num_cpus_per_worker": NUM_CPUS_PER_WORKER,
#           "num_sgd_iter": NUM_SGD_ITER,
#           "lr": LEARNING_RATE,
#           "gamma": GAMMA,  # default=0.99
#           "model": {"custom_model": "my_model"}
#           # "framework": framework
#           }  # use tensorflow 2
# NOTE(review): the active config below has no "model" entry, so the
# 'my_model' registered above is not referenced by it -- confirm this is intended.
# (The original literal listed "num_gpus" twice with the same value; the
# duplicate key has been dropped.)
config = {"env": MyEnv, "num_gpus": 0, "num_workers": 1, "num_cpus_per_worker": 0}

# The trailing slash is kept because the path is handed to save_conditions() as-is.
conditions_dir = './' + PROJECT + '/conditions/'
os.makedirs(conditions_dir, exist_ok=True)
save_conditions(conditions_dir)

# PPOTrainer() somehow fails with a TensorFlow eager-mode error when
# try_import_tf is used.

trainer = ppo.PPOTrainer(config=config,
                         logger_creator=custom_log_creator(
                             os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))

if CONTINUAL:
    # Continual learning: Need to specify the checkpoint
    model_path = PROJECT + '/checkpoints/' + 'fwd_1/checkpoint_000351/checkpoint-351'
    trainer.restore(checkpoint_path=model_path)

# Dump a text summary and a diagram of the policy network.
models_dir = './' + PROJECT + '/models/'
os.makedirs(models_dir, exist_ok=True)
text_name = models_dir + TRIAL + '.txt'
png_name = models_dir + TRIAL + '.png'
policy_model = trainer.get_policy().model.base_model
with open(text_name, "w") as fp:
    # The file is opened in text mode, which already maps "\n" to the
    # platform line ending; writing "\r\n" would yield "\r\r\n" on Windows.
    policy_model.summary(print_fn=lambda line: fp.write(line + "\n"))
plot_model(policy_model, to_file=png_name, show_shapes=True)

# Instantiate the evaluation env
eval_env = MyEnv({})

# Define checkpoint dir
check_point_dir = os.path.join('./' + PROJECT + '/checkpoints/', TRIAL)
os.makedirs(check_point_dir, exist_ok=True)

2021-10-06 16:56:51,311	INFO services.py:1265 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-10-06 16:57:00,549	INFO trainer.py:714 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2021-10-06 16:57:00,549	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2021-10-06 16:57:00,550	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')




In [None]:

# Training & evaluation
# record_mode selects what happens after every evaluation step:
#   0      -> fetch a frame from eval_env.render_movie() and append it to an .mp4
#   1 or 2 -> call eval_env.render()
record_mode = 1
results_dir = os.path.join('./' + PROJECT + '/results/')
os.makedirs(results_dir, exist_ok=True)
# Path reserved for pickled evaluation results; nothing in this cell writes to it.
results_file = results_dir + TRIAL + '.pkl'
for steps in range(10001):
    # Training
    results = trainer.train()

    # Evaluation
    if steps % EVAL_FREQ == 0:
        print(f'\n----------------- Evaluation at steps:{steps} starting ! -----------------')
        print(pretty_print(results))
        check_point = trainer.save(checkpoint_dir=check_point_dir)
        for i in range(NUM_EVAL):
            obs = eval_env.reset()
            done = False
            step_num = 0

            video = None
            if record_mode == 0:
                # NOTE(review): cv2 is not imported in the cells visible here;
                # this branch needs `import cv2` in the import cell to run.
                # NOTE(review): `WINNDOW_SIZE_lon` is spelled differently from
                # `WINDOW_SIZE_lat` -- verify the attribute name against MyEnv.
                file_name = "test_num" + str(steps) + str(i)
                video = cv2.VideoWriter(file_name + '.mp4', 0x00000020, 20.0,
                                        (eval_env.WINNDOW_SIZE_lon, eval_env.WINDOW_SIZE_lat))

            try:
                while not done:
                    # Only query the policy for blue agents that still have hitpoints.
                    action_dict = {}
                    for j in range(eval_env.blue_num):
                        if eval_env.blue[j].hitpoint != 0:
                            agent_id = 'blue_' + str(j)
                            action_dict[agent_id] = trainer.compute_action(obs[agent_id])

                    obs, rewards, dones, infos = eval_env.step(action_dict)
                    done = dones["__all__"]

                    if record_mode == 0:
                        img = eval_env.render_movie(file_name, step_num)
                        # 'uint8' is the valid dtype name; the previous
                        # 'unit8' made astype() raise.
                        video.write(img.astype('uint8'))
                    elif record_mode in (1, 2):
                        eval_env.render()

                    # Advance the step index handed to render_movie(); it was
                    # previously left at 0 for the whole episode.
                    step_num += 1
            finally:
                # Release the writer even if the episode aborts part-way.
                if video is not None:
                    video.release()

ray.shutdown()




----------------- Evaluation at steps:0 starting ! -----------------
agent_timesteps_total: 4000
custom_metrics: {}
date: 2021-10-06_16-57-12
done: false
episode_len_mean: 562.3333333333334
episode_media: {}
episode_reward_max: 3.8927000000000005
episode_reward_mean: 0.5288666666666701
episode_reward_min: -1.1783999999999923
episodes_this_iter: 3
episodes_total: 3
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 2.0723531246185303
        entropy_coeff: 0.0
        kl: 0.007047607097774744
        model: {}
        policy_loss: -0.011758591048419476
        total_loss: 0.06090806424617767
        vf_explained_var: -0.31641122698783875
        vf_loss: 0.07125714421272278
  num_agent_steps_sampled: 4000
  num_agent_steps_trained: 4000
  num_steps_sampled: 4000
  num_steps_trained: 4000
iterat


----------------- Evaluation at steps:5 starting ! -----------------
agent_timesteps_total: 24000
custom_metrics: {}
date: 2021-10-06_16-58-50
done: false
episode_len_mean: 652.0588235294117
episode_media: {}
episode_reward_max: 3.8941500000000002
episode_reward_mean: -1.028549999999998
episode_reward_min: -2.1996000000000007
episodes_this_iter: 3
episodes_total: 17
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 2.0125885009765625
        entropy_coeff: 0.0
        kl: 0.006026575341820717
        model: {}
        policy_loss: -0.011828646995127201
        total_loss: 0.04203527048230171
        vf_explained_var: -0.4901210069656372
        vf_loss: 0.052658602595329285
  num_agent_steps_sampled: 24000
  num_agent_steps_trained: 24000
  num_steps_sampled: 24000
  num_steps_trained: 24000



----------------- Evaluation at steps:10 starting ! -----------------
agent_timesteps_total: 44000
custom_metrics: {}
date: 2021-10-06_17-00-23
done: false
episode_len_mean: 696.8709677419355
episode_media: {}
episode_reward_max: 3.8941500000000002
episode_reward_mean: -1.1091354838709646
episode_reward_min: -2.1996000000000007
episodes_this_iter: 3
episodes_total: 31
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.9267640113830566
        entropy_coeff: 0.0
        kl: 0.008496327325701714
        model: {}
        policy_loss: -0.01509306114166975
        total_loss: -0.002140073338523507
        vf_explained_var: 0.13809950649738312
        vf_loss: 0.011253723874688148
  num_agent_steps_sampled: 44000
  num_agent_steps_trained: 44000
  num_steps_sampled: 44000
  num_steps_trained: 440


----------------- Evaluation at steps:15 starting ! -----------------
agent_timesteps_total: 64000
custom_metrics: {}
date: 2021-10-06_17-01-54
done: false
episode_len_mean: 706.8222222222222
episode_media: {}
episode_reward_max: 3.906
episode_reward_mean: -0.8681599999999973
episode_reward_min: -2.1996000000000007
episodes_this_iter: 3
episodes_total: 45
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.9124277830123901
        entropy_coeff: 0.0
        kl: 0.011132133193314075
        model: {}
        policy_loss: -0.023399489000439644
        total_loss: -0.01129877008497715
        vf_explained_var: -0.16688518226146698
        vf_loss: 0.00987429078668356
  num_agent_steps_sampled: 64000
  num_agent_steps_trained: 64000
  num_steps_sampled: 64000
  num_steps_trained: 64000
iterations


----------------- Evaluation at steps:20 starting ! -----------------
agent_timesteps_total: 84000
custom_metrics: {}
date: 2021-10-06_17-03-37
done: false
episode_len_mean: 724.3333333333334
episode_media: {}
episode_reward_max: 3.906
episode_reward_mean: -0.9605771929824541
episode_reward_min: -2.21
episodes_this_iter: 3
episodes_total: 57
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.7461572885513306
        entropy_coeff: 0.0
        kl: 0.009154144674539566
        model: {}
        policy_loss: -0.013402668759226799
        total_loss: 0.017993221059441566
        vf_explained_var: -0.13262398540973663
        vf_loss: 0.029565060511231422
  num_agent_steps_sampled: 84000
  num_agent_steps_trained: 84000
  num_steps_sampled: 84000
  num_steps_trained: 84000
iterations_since_restor


----------------- Evaluation at steps:25 starting ! -----------------
agent_timesteps_total: 104000
custom_metrics: {}
date: 2021-10-06_17-05-04
done: false
episode_len_mean: 720.6805555555555
episode_media: {}
episode_reward_max: 3.906
episode_reward_mean: -0.9403076388888868
episode_reward_min: -2.2139499999999996
episodes_this_iter: 3
episodes_total: 72
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.7040303945541382
        entropy_coeff: 0.0
        kl: 0.010849838145077229
        model: {}
        policy_loss: -0.01725655049085617
        total_loss: 0.009599575772881508
        vf_explained_var: -0.3707568049430847
        vf_loss: 0.024686163291335106
  num_agent_steps_sampled: 104000
  num_agent_steps_trained: 104000
  num_steps_sampled: 104000
  num_steps_trained: 104000
iterat


----------------- Evaluation at steps:30 starting ! -----------------
agent_timesteps_total: 124000
custom_metrics: {}
date: 2021-10-06_17-06-35
done: false
episode_len_mean: 710.9069767441861
episode_media: {}
episode_reward_max: 3.906
episode_reward_mean: -0.8889726744186025
episode_reward_min: -2.2139499999999996
episodes_this_iter: 3
episodes_total: 86
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6277014017105103
        entropy_coeff: 0.0
        kl: 0.00815170630812645
        model: {}
        policy_loss: -0.0059013632126152515
        total_loss: 0.020602524280548096
        vf_explained_var: -0.1844886988401413
        vf_loss: 0.02487354539334774
  num_agent_steps_sampled: 124000
  num_agent_steps_trained: 124000
  num_steps_sampled: 124000
  num_steps_trained: 124000
iterat


----------------- Evaluation at steps:35 starting ! -----------------
agent_timesteps_total: 144000
custom_metrics: {}
date: 2021-10-06_17-08-09
done: false
episode_len_mean: 705.37
episode_media: {}
episode_reward_max: 3.906
episode_reward_mean: -0.9229744999999978
episode_reward_min: -2.2139499999999996
episodes_this_iter: 4
episodes_total: 102
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.7100833654403687
        entropy_coeff: 0.0
        kl: 0.009020685218274593
        model: {}
        policy_loss: -0.019472721964120865
        total_loss: -0.0047296457923948765
        vf_explained_var: 0.0454041063785553
        vf_loss: 0.012938937172293663
  num_agent_steps_sampled: 144000
  num_agent_steps_trained: 144000
  num_steps_sampled: 144000
  num_steps_trained: 144000
iterations_sin


----------------- Evaluation at steps:40 starting ! -----------------
agent_timesteps_total: 164000
custom_metrics: {}
date: 2021-10-06_17-09-45
done: false
episode_len_mean: 707.05
episode_media: {}
episode_reward_max: 3.93265
episode_reward_mean: -0.9308334999999979
episode_reward_min: -2.2139499999999996
episodes_this_iter: 3
episodes_total: 117
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6437134742736816
        entropy_coeff: 0.0
        kl: 0.009958391077816486
        model: {}
        policy_loss: -0.014323724433779716
        total_loss: -0.0015422562137246132
        vf_explained_var: -0.07743627578020096
        vf_loss: 0.010789795778691769
  num_agent_steps_sampled: 164000
  num_agent_steps_trained: 164000
  num_steps_sampled: 164000
  num_steps_trained: 164000
iterations


----------------- Evaluation at steps:45 starting ! -----------------
agent_timesteps_total: 184000
custom_metrics: {}
date: 2021-10-06_17-11-27
done: false
episode_len_mean: 687.72
episode_media: {}
episode_reward_max: 3.93265
episode_reward_mean: -0.8679769999999981
episode_reward_min: -2.2139499999999996
episodes_this_iter: 3
episodes_total: 132
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6026180982589722
        entropy_coeff: 0.0
        kl: 0.009984048083424568
        model: {}
        policy_loss: -0.010036241263151169
        total_loss: 0.011390515603125095
        vf_explained_var: 0.04433160275220871
        vf_loss: 0.019429950043559074
  num_agent_steps_sampled: 184000
  num_agent_steps_trained: 184000
  num_steps_sampled: 184000
  num_steps_trained: 184000
iterations_si


----------------- Evaluation at steps:50 starting ! -----------------
agent_timesteps_total: 204000
custom_metrics: {}
date: 2021-10-06_17-12-45
done: false
episode_len_mean: 686.32
episode_media: {}
episode_reward_max: 3.93265
episode_reward_mean: -0.929471999999998
episode_reward_min: -2.2139499999999996
episodes_this_iter: 3
episodes_total: 146
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6221400499343872
        entropy_coeff: 0.0
        kl: 0.01062043011188507
        model: {}
        policy_loss: -0.019348403438925743
        total_loss: 0.004713508300483227
        vf_explained_var: -0.05166850611567497
        vf_loss: 0.02193782851099968
  num_agent_steps_sampled: 204000
  num_agent_steps_trained: 204000
  num_steps_sampled: 204000
  num_steps_trained: 204000
iterations_sinc


----------------- Evaluation at steps:55 starting ! -----------------
agent_timesteps_total: 224000
custom_metrics: {}
date: 2021-10-06_17-14-17
done: false
episode_len_mean: 663.88
episode_media: {}
episode_reward_max: 3.93265
episode_reward_mean: -0.8970114999999979
episode_reward_min: -2.2043000000000026
episodes_this_iter: 2
episodes_total: 162
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6834207773208618
        entropy_coeff: 0.0
        kl: 0.010563583113253117
        model: {}
        policy_loss: -0.01249787025153637
        total_loss: 0.003294644644483924
        vf_explained_var: -0.022247420623898506
        vf_loss: 0.013679804280400276
  num_agent_steps_sampled: 224000
  num_agent_steps_trained: 224000
  num_steps_sampled: 224000
  num_steps_trained: 224000
iterations_s


----------------- Evaluation at steps:60 starting ! -----------------
agent_timesteps_total: 244000
custom_metrics: {}
date: 2021-10-06_17-15-35
done: false
episode_len_mean: 664.67
episode_media: {}
episode_reward_max: 3.93265
episode_reward_mean: -0.8475469999999982
episode_reward_min: -2.2043000000000026
episodes_this_iter: 3
episodes_total: 176
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.545778512954712
        entropy_coeff: 0.0
        kl: 0.01106109656393528
        model: {}
        policy_loss: -0.019631650298833847
        total_loss: -0.004431749694049358
        vf_explained_var: 0.014531513676047325
        vf_loss: 0.012987674213945866
  num_agent_steps_sampled: 244000
  num_agent_steps_trained: 244000
  num_steps_sampled: 244000
  num_steps_trained: 244000
iterations_si


----------------- Evaluation at steps:65 starting ! -----------------
agent_timesteps_total: 264000
custom_metrics: {}
date: 2021-10-06_17-17-11
done: false
episode_len_mean: 672.38
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.7482119999999982
episode_reward_min: -2.2043000000000026
episodes_this_iter: 4
episodes_total: 191
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.539736032485962
        entropy_coeff: 0.0
        kl: 0.008046630769968033
        model: {}
        policy_loss: -0.01525516901165247
        total_loss: 0.028243787586688995
        vf_explained_var: -0.10787113755941391
        vf_loss: 0.04188963398337364
  num_agent_steps_sampled: 264000
  num_agent_steps_trained: 264000
  num_steps_sampled: 264000
  num_steps_trained: 264000
iter


----------------- Evaluation at steps:70 starting ! -----------------
agent_timesteps_total: 284000
custom_metrics: {}
date: 2021-10-06_17-18-44
done: false
episode_len_mean: 671.98
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.6167589999999984
episode_reward_min: -2.2043000000000026
episodes_this_iter: 3
episodes_total: 206
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.6089879274368286
        entropy_coeff: 0.0
        kl: 0.010088684968650341
        model: {}
        policy_loss: -0.02312445640563965
        total_loss: 0.0014533945359289646
        vf_explained_var: -0.31974464654922485
        vf_loss: 0.0225601177662611
  num_agent_steps_sampled: 284000
  num_agent_steps_trained: 284000
  num_steps_sampled: 284000
  num_steps_trained: 284000
ite


----------------- Evaluation at steps:75 starting ! -----------------
agent_timesteps_total: 304000
custom_metrics: {}
date: 2021-10-06_17-20-17
done: false
episode_len_mean: 684.78
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.4079724999999982
episode_reward_min: -2.204749999999999
episodes_this_iter: 3
episodes_total: 220
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.43300199508667
        entropy_coeff: 0.0
        kl: 0.012813219800591469
        model: {}
        policy_loss: -0.018693862482905388
        total_loss: 0.03512212261557579
        vf_explained_var: -0.2992680072784424
        vf_loss: 0.05125334486365318
  num_agent_steps_sampled: 304000
  num_agent_steps_trained: 304000
  num_steps_sampled: 304000
  num_steps_trained: 304000
iterati


----------------- Evaluation at steps:80 starting ! -----------------
agent_timesteps_total: 324000
custom_metrics: {}
date: 2021-10-06_17-21-48
done: false
episode_len_mean: 687.78
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.3588814999999984
episode_reward_min: -2.204749999999999
episodes_this_iter: 2
episodes_total: 233
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.3757001161575317
        entropy_coeff: 0.0
        kl: 0.01160306017845869
        model: {}
        policy_loss: -0.012109427712857723
        total_loss: 0.0047531211748719215
        vf_explained_var: -0.061927665024995804
        vf_loss: 0.014541932381689548
  num_agent_steps_sampled: 324000
  num_agent_steps_trained: 324000
  num_steps_sampled: 324000
  num_steps_trained: 324000
i


----------------- Evaluation at steps:85 starting ! -----------------
agent_timesteps_total: 344000
custom_metrics: {}
date: 2021-10-06_17-23-15
done: false
episode_len_mean: 680.98
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.2093534999999984
episode_reward_min: -2.204749999999999
episodes_this_iter: 2
episodes_total: 249
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.3509647846221924
        entropy_coeff: 0.0
        kl: 0.0150296064093709
        model: {}
        policy_loss: -0.022697949782013893
        total_loss: 0.0007293277885764837
        vf_explained_var: -0.056182581931352615
        vf_loss: 0.020421354100108147
  num_agent_steps_sampled: 344000
  num_agent_steps_trained: 344000
  num_steps_sampled: 344000
  num_steps_trained: 344000
it


----------------- Evaluation at steps:90 starting ! -----------------
agent_timesteps_total: 364000
custom_metrics: {}
date: 2021-10-06_17-24-59
done: false
episode_len_mean: 683.54
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.05929849999999824
episode_reward_min: -2.204749999999999
episodes_this_iter: 4
episodes_total: 264
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.3173338174819946
        entropy_coeff: 0.0
        kl: 0.01210454199463129
        model: {}
        policy_loss: -0.021061090752482414
        total_loss: 0.0016662618145346642
        vf_explained_var: 0.3183707594871521
        vf_loss: 0.020306440070271492
  num_agent_steps_sampled: 364000
  num_agent_steps_trained: 364000
  num_steps_sampled: 364000
  num_steps_trained: 364000
ite


----------------- Evaluation at steps:95 starting ! -----------------
agent_timesteps_total: 384000
custom_metrics: {}
date: 2021-10-06_17-26-31
done: false
episode_len_mean: 692.44
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.11157649999999801
episode_reward_min: -2.204749999999999
episodes_this_iter: 3
episodes_total: 278
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.3152180910110474
        entropy_coeff: 0.0
        kl: 0.00975228101015091
        model: {}
        policy_loss: -0.02058863453567028
        total_loss: -0.010787300765514374
        vf_explained_var: 0.4198153614997864
        vf_loss: 0.007850880734622478
  num_agent_steps_sampled: 384000
  num_agent_steps_trained: 384000
  num_steps_sampled: 384000
  num_steps_trained: 384000
iter


----------------- Evaluation at steps:100 starting ! -----------------
agent_timesteps_total: 404000
custom_metrics: {}
date: 2021-10-06_17-28-10
done: false
episode_len_mean: 699.14
episode_media: {}
episode_reward_max: 3.9354000000000005
episode_reward_mean: -0.054067499999997874
episode_reward_min: -2.2133500000000037
episodes_this_iter: 3
episodes_total: 290
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.2389700412750244
        entropy_coeff: 0.0
        kl: 0.010936051607131958
        model: {}
        policy_loss: -0.020082183182239532
        total_loss: 0.01356405671685934
        vf_explained_var: -0.027440158650279045
        vf_loss: 0.0314590260386467
  num_agent_steps_sampled: 404000
  num_agent_steps_trained: 404000
  num_steps_sampled: 404000
  num_steps_trained: 404000



----------------- Evaluation at steps:105 starting ! -----------------
agent_timesteps_total: 424000
custom_metrics: {}
date: 2021-10-06_17-29-40
done: false
episode_len_mean: 712.84
episode_media: {}
episode_reward_max: 3.9131500000000004
episode_reward_mean: -0.21408499999999794
episode_reward_min: -2.2133500000000037
episodes_this_iter: 2
episodes_total: 304
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.2207056283950806
        entropy_coeff: 0.0
        kl: 0.014793254435062408
        model: {}
        policy_loss: -0.012854558415710926
        total_loss: -0.0002892371267080307
        vf_explained_var: 0.2885720729827881
        vf_loss: 0.009606671519577503
  num_agent_steps_sampled: 424000
  num_agent_steps_trained: 424000
  num_steps_sampled: 424000
  num_steps_trained: 424000


----------------- Evaluation at steps:110 starting ! -----------------
agent_timesteps_total: 444000
custom_metrics: {}
date: 2021-10-06_17-30-50
done: false
episode_len_mean: 718.29
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: -0.3125909999999981
episode_reward_min: -2.2133500000000037
episodes_this_iter: 2
episodes_total: 317
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.2159957885742188
        entropy_coeff: 0.0
        kl: 0.011909636668860912
        model: {}
        policy_loss: -0.013837353326380253
        total_loss: 0.030100734904408455
        vf_explained_var: -0.02303868718445301
        vf_loss: 0.04155615344643593
  num_agent_steps_sampled: 444000
  num_agent_steps_trained: 444000
  num_steps_sampled: 444000
  num_steps_trained: 444000
it


----------------- Evaluation at steps:115 starting ! -----------------
agent_timesteps_total: 464000
custom_metrics: {}
date: 2021-10-06_17-32-21
done: false
episode_len_mean: 705.16
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: -0.2817309999999979
episode_reward_min: -2.2133500000000037
episodes_this_iter: 4
episodes_total: 333
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.214281678199768
        entropy_coeff: 0.0
        kl: 0.008397897705435753
        model: {}
        policy_loss: -0.013195278123021126
        total_loss: 0.019765274599194527
        vf_explained_var: 0.10381222516298294
        vf_loss: 0.03128097206354141
  num_agent_steps_sampled: 464000
  num_agent_steps_trained: 464000
  num_steps_sampled: 464000
  num_steps_trained: 464000
iter


----------------- Evaluation at steps:120 starting ! -----------------
agent_timesteps_total: 484000
custom_metrics: {}
date: 2021-10-06_17-33-35
done: false
episode_len_mean: 697.27
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: -0.298209499999998
episode_reward_min: -2.2133500000000037
episodes_this_iter: 3
episodes_total: 350
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0776877403259277
        entropy_coeff: 0.0
        kl: 0.010929360054433346
        model: {}
        policy_loss: -0.012577989138662815
        total_loss: 0.01069817878305912
        vf_explained_var: 0.17049826681613922
        vf_loss: 0.021090297028422356
  num_agent_steps_sampled: 484000
  num_agent_steps_trained: 484000
  num_steps_sampled: 484000
  num_steps_trained: 484000
iter


----------------- Evaluation at steps:125 starting ! -----------------
agent_timesteps_total: 504000
custom_metrics: {}
date: 2021-10-06_17-35-09
done: false
episode_len_mean: 702.06
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: -0.308025999999998
episode_reward_min: -2.2133500000000037
episodes_this_iter: 2
episodes_total: 364
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.333127737045288
        entropy_coeff: 0.0
        kl: 0.010485298000276089
        model: {}
        policy_loss: -0.01613648608326912
        total_loss: -0.0028954176232218742
        vf_explained_var: 0.14112251996994019
        vf_loss: 0.011144005693495274
  num_agent_steps_sampled: 504000
  num_agent_steps_trained: 504000
  num_steps_sampled: 504000
  num_steps_trained: 504000
ite


----------------- Evaluation at steps:130 starting ! -----------------
agent_timesteps_total: 524000
custom_metrics: {}
date: 2021-10-06_17-36-32
done: false
episode_len_mean: 685.81
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: -0.0648539999999983
episode_reward_min: -2.2133500000000037
episodes_this_iter: 3
episodes_total: 379
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.178169846534729
        entropy_coeff: 0.0
        kl: 0.00935938861221075
        model: {}
        policy_loss: 0.0027430083137005568
        total_loss: 0.018753038719296455
        vf_explained_var: 0.4334986209869385
        vf_loss: 0.01413815189152956
  num_agent_steps_sampled: 524000
  num_agent_steps_trained: 524000
  num_steps_sampled: 524000
  num_steps_trained: 524000
iterat


----------------- Evaluation at steps:135 starting ! -----------------
agent_timesteps_total: 544000
custom_metrics: {}
date: 2021-10-06_17-37-50
done: false
episode_len_mean: 663.3
episode_media: {}
episode_reward_max: 3.914900000000001
episode_reward_mean: 0.14065800000000153
episode_reward_min: -2.181649999999998
episodes_this_iter: 2
episodes_total: 395
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1522232294082642
        entropy_coeff: 0.0
        kl: 0.008523802272975445
        model: {}
        policy_loss: -0.009105348028242588
        total_loss: 0.011238187551498413
        vf_explained_var: 0.09545319527387619
        vf_loss: 0.018638774752616882
  num_agent_steps_sampled: 544000
  num_agent_steps_trained: 544000
  num_steps_sampled: 544000
  num_steps_trained: 544000
iter


----------------- Evaluation at steps:140 starting ! -----------------
agent_timesteps_total: 564000
custom_metrics: {}
date: 2021-10-06_17-39-18
done: false
episode_len_mean: 672.26
episode_media: {}
episode_reward_max: 3.916999999999999
episode_reward_mean: 0.2727455000000014
episode_reward_min: -2.169350000000001
episodes_this_iter: 3
episodes_total: 407
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.172990322113037
        entropy_coeff: 0.0
        kl: 0.010723710991442204
        model: {}
        policy_loss: -0.01831020414829254
        total_loss: 0.02276861108839512
        vf_explained_var: -0.09134241938591003
        vf_loss: 0.038934070616960526
  num_agent_steps_sampled: 564000
  num_agent_steps_trained: 564000
  num_steps_sampled: 564000
  num_steps_trained: 564000
iterat


----------------- Evaluation at steps:145 starting ! -----------------
agent_timesteps_total: 584000
custom_metrics: {}
date: 2021-10-06_17-40-41
done: false
episode_len_mean: 658.33
episode_media: {}
episode_reward_max: 3.916999999999999
episode_reward_mean: 0.3609315000000015
episode_reward_min: -2.171750000000002
episodes_this_iter: 4
episodes_total: 423
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.2108699083328247
        entropy_coeff: 0.0
        kl: 0.01069526094943285
        model: {}
        policy_loss: -0.012975340709090233
        total_loss: 0.003289744257926941
        vf_explained_var: 0.019642014056444168
        vf_loss: 0.01412603072822094
  num_agent_steps_sampled: 584000
  num_agent_steps_trained: 584000
  num_steps_sampled: 584000
  num_steps_trained: 584000
itera


----------------- Evaluation at steps:150 starting ! -----------------
agent_timesteps_total: 604000
custom_metrics: {}
date: 2021-10-06_17-42-17
done: false
episode_len_mean: 674.19
episode_media: {}
episode_reward_max: 3.916999999999999
episode_reward_mean: 0.25184000000000123
episode_reward_min: -2.171750000000002
episodes_this_iter: 2
episodes_total: 436
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.3311620950698853
        entropy_coeff: 0.0
        kl: 0.01185584720224142
        model: {}
        policy_loss: -0.026247741654515266
        total_loss: -0.02120809070765972
        vf_explained_var: 0.06402864307165146
        vf_loss: 0.0026684857439249754
  num_agent_steps_sampled: 604000
  num_agent_steps_trained: 604000
  num_steps_sampled: 604000
  num_steps_trained: 604000
ite


----------------- Evaluation at steps:155 starting ! -----------------
agent_timesteps_total: 624000
custom_metrics: {}
date: 2021-10-06_17-43-31
done: false
episode_len_mean: 696.47
episode_media: {}
episode_reward_max: 3.916999999999999
episode_reward_mean: 0.1009640000000011
episode_reward_min: -2.192050000000002
episodes_this_iter: 2
episodes_total: 449
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.200956106185913
        entropy_coeff: 0.0
        kl: 0.012156404554843903
        model: {}
        policy_loss: -0.017172647640109062
        total_loss: 0.0006960007594898343
        vf_explained_var: 0.17724327743053436
        vf_loss: 0.015437372028827667
  num_agent_steps_sampled: 624000
  num_agent_steps_trained: 624000
  num_steps_sampled: 624000
  num_steps_trained: 624000
iter


----------------- Evaluation at steps:160 starting ! -----------------
agent_timesteps_total: 644000
custom_metrics: {}
date: 2021-10-06_17-45-03
done: false
episode_len_mean: 717.92
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.13931599999999889
episode_reward_min: -2.192050000000002
episodes_this_iter: 3
episodes_total: 462
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1594083309173584
        entropy_coeff: 0.0
        kl: 0.009064274840056896
        model: {}
        policy_loss: -0.01568211242556572
        total_loss: -0.0068799639120697975
        vf_explained_var: 0.08113566786050797
        vf_loss: 0.006989290472120047
  num_agent_steps_sampled: 644000
  num_agent_steps_trained: 644000
  num_steps_sampled: 644000
  num_steps_trained: 644000



----------------- Evaluation at steps:165 starting ! -----------------
agent_timesteps_total: 664000
custom_metrics: {}
date: 2021-10-06_17-46-43
done: false
episode_len_mean: 729.49
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.27220299999999864
episode_reward_min: -2.192050000000002
episodes_this_iter: 3
episodes_total: 475
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1660557985305786
        entropy_coeff: 0.0
        kl: 0.009342367760837078
        model: {}
        policy_loss: -0.017580442130565643
        total_loss: -0.0041137635707855225
        vf_explained_var: 0.13675548136234283
        vf_loss: 0.011598209850490093
  num_agent_steps_sampled: 664000
  num_agent_steps_trained: 664000
  num_steps_sampled: 664000
  num_steps_trained: 664000


----------------- Evaluation at steps:170 starting ! -----------------
agent_timesteps_total: 684000
custom_metrics: {}
date: 2021-10-06_17-48-14
done: false
episode_len_mean: 745.48
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.4924014999999987
episode_reward_min: -2.192050000000002
episodes_this_iter: 3
episodes_total: 489
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.2016687393188477
        entropy_coeff: 0.0
        kl: 0.01121972780674696
        model: {}
        policy_loss: -0.018222512677311897
        total_loss: 0.004433728288859129
        vf_explained_var: -0.13105632364749908
        vf_loss: 0.020412301644682884
  num_agent_steps_sampled: 684000
  num_agent_steps_trained: 684000
  num_steps_sampled: 684000
  num_steps_trained: 684000
it


----------------- Evaluation at steps:175 starting ! -----------------
agent_timesteps_total: 704000
custom_metrics: {}
date: 2021-10-06_17-49-52
done: false
episode_len_mean: 727.61
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.5142979999999987
episode_reward_min: -2.192050000000002
episodes_this_iter: 3
episodes_total: 503
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0788655281066895
        entropy_coeff: 0.0
        kl: 0.008992154151201248
        model: {}
        policy_loss: -0.01655161939561367
        total_loss: 0.0030005655717104673
        vf_explained_var: -0.036535222083330154
        vf_loss: 0.017753759399056435
  num_agent_steps_sampled: 704000
  num_agent_steps_trained: 704000
  num_steps_sampled: 704000
  num_steps_trained: 704000



----------------- Evaluation at steps:180 starting ! -----------------
agent_timesteps_total: 724000
custom_metrics: {}
date: 2021-10-06_17-51-15
done: false
episode_len_mean: 726.37
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.5581964999999984
episode_reward_min: -2.192050000000002
episodes_this_iter: 3
episodes_total: 518
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1403952836990356
        entropy_coeff: 0.0
        kl: 0.010672002099454403
        model: {}
        policy_loss: -0.01568268984556198
        total_loss: 0.007261691149324179
        vf_explained_var: -0.03723028302192688
        vf_loss: 0.02080998197197914
  num_agent_steps_sampled: 724000
  num_agent_steps_trained: 724000
  num_steps_sampled: 724000
  num_steps_trained: 724000
ite


----------------- Evaluation at steps:185 starting ! -----------------
agent_timesteps_total: 744000
custom_metrics: {}
date: 2021-10-06_17-52-55
done: false
episode_len_mean: 737.59
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.6220629999999985
episode_reward_min: -2.201249999999999
episodes_this_iter: 2
episodes_total: 531
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1758981943130493
        entropy_coeff: 0.0
        kl: 0.01030999980866909
        model: {}
        policy_loss: -0.020259512588381767
        total_loss: -0.0036949911154806614
        vf_explained_var: 0.3525034785270691
        vf_loss: 0.014502523466944695
  num_agent_steps_sampled: 744000
  num_agent_steps_trained: 744000
  num_steps_sampled: 744000
  num_steps_trained: 744000
it


----------------- Evaluation at steps:190 starting ! -----------------
agent_timesteps_total: 764000
custom_metrics: {}
date: 2021-10-06_17-54-28
done: false
episode_len_mean: 730.16
episode_media: {}
episode_reward_max: 3.9209500000000004
episode_reward_mean: -0.5532764999999983
episode_reward_min: -2.201249999999999
episodes_this_iter: 3
episodes_total: 546
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0339723825454712
        entropy_coeff: 0.0
        kl: 0.009550503455102444
        model: {}
        policy_loss: -0.016282545402646065
        total_loss: 0.011672494001686573
        vf_explained_var: -0.018213551491498947
        vf_loss: 0.0260449405759573
  num_agent_steps_sampled: 764000
  num_agent_steps_trained: 764000
  num_steps_sampled: 764000
  num_steps_trained: 764000
it


----------------- Evaluation at steps:195 starting ! -----------------
agent_timesteps_total: 784000
custom_metrics: {}
date: 2021-10-06_17-55-43
done: false
episode_len_mean: 716.15
episode_media: {}
episode_reward_max: 3.914099999999999
episode_reward_mean: -0.5129969999999985
episode_reward_min: -2.2288999999999994
episodes_this_iter: 3
episodes_total: 560
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.080740213394165
        entropy_coeff: 0.0
        kl: 0.0098353810608387
        model: {}
        policy_loss: -0.015662208199501038
        total_loss: 0.0251690112054348
        vf_explained_var: -0.05113959312438965
        vf_loss: 0.0388641394674778
  num_agent_steps_sampled: 784000
  num_agent_steps_trained: 784000
  num_steps_sampled: 784000
  num_steps_trained: 784000
iteratio


----------------- Evaluation at steps:200 starting ! -----------------
agent_timesteps_total: 804000
custom_metrics: {}
date: 2021-10-06_17-57-04
done: false
episode_len_mean: 712.2
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: -0.5208964999999984
episode_reward_min: -2.2288999999999994
episodes_this_iter: 3
episodes_total: 574
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0617129802703857
        entropy_coeff: 0.0
        kl: 0.008842975832521915
        model: {}
        policy_loss: -0.016530966386198997
        total_loss: 0.006799977272748947
        vf_explained_var: -0.014589645899832249
        vf_loss: 0.021562345325946808
  num_agent_steps_sampled: 804000
  num_agent_steps_trained: 804000
  num_steps_sampled: 804000
  num_steps_trained: 804000



----------------- Evaluation at steps:205 starting ! -----------------
agent_timesteps_total: 824000
custom_metrics: {}
date: 2021-10-06_17-58-36
done: false
episode_len_mean: 683.05
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: -0.2600399999999984
episode_reward_min: -2.2288999999999994
episodes_this_iter: 2
episodes_total: 591
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0299382209777832
        entropy_coeff: 0.0
        kl: 0.011145583353936672
        model: {}
        policy_loss: -0.01953793689608574
        total_loss: -0.00014508821186609566
        vf_explained_var: 0.20568564534187317
        vf_loss: 0.017163727432489395
  num_agent_steps_sampled: 824000
  num_agent_steps_trained: 824000
  num_steps_sampled: 824000
  num_steps_trained: 824000


----------------- Evaluation at steps:210 starting ! -----------------
agent_timesteps_total: 844000
custom_metrics: {}
date: 2021-10-06_18-00-10
done: false
episode_len_mean: 679.07
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: -0.17944799999999844
episode_reward_min: -2.2288999999999994
episodes_this_iter: 2
episodes_total: 606
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0007094144821167
        entropy_coeff: 0.0
        kl: 0.011995342560112476
        model: {}
        policy_loss: -0.018643323332071304
        total_loss: 0.005611212458461523
        vf_explained_var: -0.06207117810845375
        vf_loss: 0.021855471655726433
  num_agent_steps_sampled: 844000
  num_agent_steps_trained: 844000
  num_steps_sampled: 844000
  num_steps_trained: 844000


----------------- Evaluation at steps:215 starting ! -----------------
agent_timesteps_total: 864000
custom_metrics: {}
date: 2021-10-06_18-01-50
done: false
episode_len_mean: 672.56
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: -0.11912649999999832
episode_reward_min: -2.2288999999999994
episodes_this_iter: 3
episodes_total: 622
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0335700511932373
        entropy_coeff: 0.0
        kl: 0.01099656242877245
        model: {}
        policy_loss: -0.01582174375653267
        total_loss: -0.0020618485286831856
        vf_explained_var: 0.327629953622818
        vf_loss: 0.011560583487153053
  num_agent_steps_sampled: 864000
  num_agent_steps_trained: 864000
  num_steps_sampled: 864000
  num_steps_trained: 864000
it


----------------- Evaluation at steps:220 starting ! -----------------
agent_timesteps_total: 884000
custom_metrics: {}
date: 2021-10-06_18-03-10
done: false
episode_len_mean: 658.3
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: -0.025963499999998266
episode_reward_min: -2.2288999999999994
episodes_this_iter: 4
episodes_total: 638
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0849944353103638
        entropy_coeff: 0.0
        kl: 0.009931027889251709
        model: {}
        policy_loss: -0.019631490111351013
        total_loss: 0.0187385231256485
        vf_explained_var: -0.13241516053676605
        vf_loss: 0.03638380765914917
  num_agent_steps_sampled: 884000
  num_agent_steps_trained: 884000
  num_steps_sampled: 884000
  num_steps_trained: 884000
it


----------------- Evaluation at steps:225 starting ! -----------------
agent_timesteps_total: 904000
custom_metrics: {}
date: 2021-10-06_18-04-23
done: false
episode_len_mean: 656.48
episode_media: {}
episode_reward_max: 3.9244000000000003
episode_reward_mean: 0.0828195000000019
episode_reward_min: -2.211050000000006
episodes_this_iter: 2
episodes_total: 651
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9941574335098267
        entropy_coeff: 0.0
        kl: 0.00912988930940628
        model: {}
        policy_loss: -0.019995423033833504
        total_loss: -0.002712732646614313
        vf_explained_var: -0.2395157665014267
        vf_loss: 0.01545671746134758
  num_agent_steps_sampled: 904000
  num_agent_steps_trained: 904000
  num_steps_sampled: 904000
  num_steps_trained: 904000
iter


----------------- Evaluation at steps:230 starting ! -----------------
agent_timesteps_total: 924000
custom_metrics: {}
date: 2021-10-06_18-05-35
done: false
episode_len_mean: 657.44
episode_media: {}
episode_reward_max: 3.9104499999999986
episode_reward_mean: 0.19033250000000212
episode_reward_min: -2.211050000000006
episodes_this_iter: 3
episodes_total: 666
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0867624282836914
        entropy_coeff: 0.0
        kl: 0.010557050816714764
        model: {}
        policy_loss: -0.021693699061870575
        total_loss: -0.00029861056827940047
        vf_explained_var: 0.12367865443229675
        vf_loss: 0.019283676519989967
  num_agent_steps_sampled: 924000
  num_agent_steps_trained: 924000
  num_steps_sampled: 924000
  num_steps_trained: 924000


----------------- Evaluation at steps:235 starting ! -----------------
agent_timesteps_total: 944000
custom_metrics: {}
date: 2021-10-06_18-07-08
done: false
episode_len_mean: 660.13
episode_media: {}
episode_reward_max: 3.9108500000000004
episode_reward_mean: 0.20920750000000218
episode_reward_min: -2.211050000000006
episodes_this_iter: 3
episodes_total: 681
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9518505930900574
        entropy_coeff: 0.0
        kl: 0.012807996943593025
        model: {}
        policy_loss: -0.019815417006611824
        total_loss: 4.8543304728809744e-05
        vf_explained_var: 0.1159471720457077
        vf_loss: 0.017302365973591805
  num_agent_steps_sampled: 944000
  num_agent_steps_trained: 944000
  num_steps_sampled: 944000
  num_steps_trained: 944000
i


----------------- Evaluation at steps:240 starting ! -----------------
agent_timesteps_total: 964000
custom_metrics: {}
date: 2021-10-06_18-08-45
done: false
episode_len_mean: 673.61
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.1984375000000024
episode_reward_min: -2.2022000000000004
episodes_this_iter: 3
episodes_total: 694
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0197861194610596
        entropy_coeff: 0.0
        kl: 0.010588503442704678
        model: {}
        policy_loss: -0.0162314772605896
        total_loss: 0.01353600062429905
        vf_explained_var: 0.15697748959064484
        vf_loss: 0.027649778872728348
  num_agent_steps_sampled: 964000
  num_agent_steps_trained: 964000
  num_steps_sampled: 964000
  num_steps_trained: 964000
itera


----------------- Evaluation at steps:245 starting ! -----------------
agent_timesteps_total: 984000
custom_metrics: {}
date: 2021-10-06_18-10-16
done: false
episode_len_mean: 688.72
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.10609200000000252
episode_reward_min: -2.216600000000002
episodes_this_iter: 3
episodes_total: 707
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9621076583862305
        entropy_coeff: 0.0
        kl: 0.010715186595916748
        model: {}
        policy_loss: -0.02236819453537464
        total_loss: 0.0010799664305523038
        vf_explained_var: 0.27420324087142944
        vf_loss: 0.02130512334406376
  num_agent_steps_sampled: 984000
  num_agent_steps_trained: 984000
  num_steps_sampled: 984000
  num_steps_trained: 984000
ite


----------------- Evaluation at steps:250 starting ! -----------------
agent_timesteps_total: 1004000
custom_metrics: {}
date: 2021-10-06_18-11-41
done: false
episode_len_mean: 691.83
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.14697450000000253
episode_reward_min: -2.216600000000002
episodes_this_iter: 2
episodes_total: 722
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9402800798416138
        entropy_coeff: 0.0
        kl: 0.008276713080704212
        model: {}
        policy_loss: -0.01592620462179184
        total_loss: 0.006953957490622997
        vf_explained_var: 0.10714472085237503
        vf_loss: 0.021224824711680412
  num_agent_steps_sampled: 1004000
  num_agent_steps_trained: 1004000
  num_steps_sampled: 1004000
  num_steps_trained: 100400


----------------- Evaluation at steps:255 starting ! -----------------
agent_timesteps_total: 1024000
custom_metrics: {}
date: 2021-10-06_18-13-01
done: false
episode_len_mean: 700.84
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.07629400000000258
episode_reward_min: -2.216600000000002
episodes_this_iter: 4
episodes_total: 738
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9449978470802307
        entropy_coeff: 0.0
        kl: 0.012308895587921143
        model: {}
        policy_loss: -0.02609892375767231
        total_loss: -0.005663982126861811
        vf_explained_var: 0.243376225233078
        vf_loss: 0.017973164096474648
  num_agent_steps_sampled: 1024000
  num_agent_steps_trained: 1024000
  num_steps_sampled: 1024000
  num_steps_trained: 1024000


----------------- Evaluation at steps:260 starting ! -----------------
agent_timesteps_total: 1044000
custom_metrics: {}
date: 2021-10-06_18-14-42
done: false
episode_len_mean: 686.73
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.13581350000000267
episode_reward_min: -2.216600000000002
episodes_this_iter: 3
episodes_total: 753
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1285151243209839
        entropy_coeff: 0.0
        kl: 0.010269063524901867
        model: {}
        policy_loss: -0.01994761824607849
        total_loss: 0.004084372892975807
        vf_explained_var: -0.2976583242416382
        vf_loss: 0.02197817713022232
  num_agent_steps_sampled: 1044000
  num_agent_steps_trained: 1044000
  num_steps_sampled: 1044000
  num_steps_trained: 1044000


----------------- Evaluation at steps:265 starting ! -----------------
agent_timesteps_total: 1064000
custom_metrics: {}
date: 2021-10-06_18-16-15
done: false
episode_len_mean: 692.19
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.21768450000000267
episode_reward_min: -2.216600000000002
episodes_this_iter: 4
episodes_total: 768
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.1070855855941772
        entropy_coeff: 0.0
        kl: 0.009815171360969543
        model: {}
        policy_loss: -0.02178492769598961
        total_loss: 0.023402687162160873
        vf_explained_var: -0.06732503324747086
        vf_loss: 0.043224580585956573
  num_agent_steps_sampled: 1064000
  num_agent_steps_trained: 1064000
  num_steps_sampled: 1064000
  num_steps_trained: 10640


----------------- Evaluation at steps:270 starting ! -----------------
agent_timesteps_total: 1084000
custom_metrics: {}
date: 2021-10-06_18-17-54
done: false
episode_len_mean: 686.9
episode_media: {}
episode_reward_max: 3.9280500000000007
episode_reward_mean: 0.2680255000000024
episode_reward_min: -2.216600000000002
episodes_this_iter: 4
episodes_total: 782
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0088489055633545
        entropy_coeff: 0.0
        kl: 0.010865654796361923
        model: {}
        policy_loss: -0.028887374326586723
        total_loss: 0.004031309392303228
        vf_explained_var: 0.10526605695486069
        vf_loss: 0.030745552852749825
  num_agent_steps_sampled: 1084000
  num_agent_steps_trained: 1084000
  num_steps_sampled: 1084000
  num_steps_trained: 1084000


----------------- Evaluation at steps:275 starting ! -----------------
agent_timesteps_total: 1104000
custom_metrics: {}
date: 2021-10-06_18-19-16
done: false
episode_len_mean: 667.04
episode_media: {}
episode_reward_max: 3.9238000000000004
episode_reward_mean: 0.31921300000000236
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 799
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.028112769126892
        entropy_coeff: 0.0
        kl: 0.009893066249787807
        model: {}
        policy_loss: -0.02106343023478985
        total_loss: -0.005570553243160248
        vf_explained_var: 0.307933509349823
        vf_loss: 0.0135142607614398
  num_agent_steps_sampled: 1104000
  num_agent_steps_trained: 1104000
  num_steps_sampled: 1104000
  num_steps_trained: 1104000
i


----------------- Evaluation at steps:280 starting ! -----------------
agent_timesteps_total: 1124000
custom_metrics: {}
date: 2021-10-06_18-20-42
done: false
episode_len_mean: 659.63
episode_media: {}
episode_reward_max: 3.9238000000000004
episode_reward_mean: 0.6001130000000025
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 814
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9819862246513367
        entropy_coeff: 0.0
        kl: 0.008438865654170513
        model: {}
        policy_loss: -0.016983307898044586
        total_loss: -0.00913187488913536
        vf_explained_var: 0.36805260181427
        vf_loss: 0.006163663696497679
  num_agent_steps_sampled: 1124000
  num_agent_steps_trained: 1124000
  num_steps_sampled: 1124000
  num_steps_trained: 1124000



----------------- Evaluation at steps:285 starting ! -----------------
agent_timesteps_total: 1144000
custom_metrics: {}
date: 2021-10-06_18-22-20
done: false
episode_len_mean: 665.29
episode_media: {}
episode_reward_max: 3.9222499999999996
episode_reward_mean: 0.5075990000000025
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 827
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9656625986099243
        entropy_coeff: 0.0
        kl: 0.01069041807204485
        model: {}
        policy_loss: -0.0257788747549057
        total_loss: -0.0025271119084209204
        vf_explained_var: 0.27613839507102966
        vf_loss: 0.0211136806756258
  num_agent_steps_sampled: 1144000
  num_agent_steps_trained: 1144000
  num_steps_sampled: 1144000
  num_steps_trained: 1144000



----------------- Evaluation at steps:290 starting ! -----------------
agent_timesteps_total: 1164000
custom_metrics: {}
date: 2021-10-06_18-23-36
done: false
episode_len_mean: 688.67
episode_media: {}
episode_reward_max: 3.9149499999999993
episode_reward_mean: 0.3543960000000027
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 840
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.8684203028678894
        entropy_coeff: 0.0
        kl: 0.010511703789234161
        model: {}
        policy_loss: -0.024898536503314972
        total_loss: -0.009349931962788105
        vf_explained_var: 0.06202877312898636
        vf_loss: 0.01344626396894455
  num_agent_steps_sampled: 1164000
  num_agent_steps_trained: 1164000
  num_steps_sampled: 1164000
  num_steps_trained: 11640


----------------- Evaluation at steps:295 starting ! -----------------
agent_timesteps_total: 1184000
custom_metrics: {}
date: 2021-10-06_18-24-54
done: false
episode_len_mean: 693.3
episode_media: {}
episode_reward_max: 3.9149499999999993
episode_reward_mean: 0.3341250000000025
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 854
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9410198330879211
        entropy_coeff: 0.0
        kl: 0.010151552967727184
        model: {}
        policy_loss: -0.024886924773454666
        total_loss: -0.013803244568407536
        vf_explained_var: 0.3151650130748749
        vf_loss: 0.009053368121385574
  num_agent_steps_sampled: 1184000
  num_agent_steps_trained: 1184000
  num_steps_sampled: 1184000
  num_steps_trained: 118400


----------------- Evaluation at steps:300 starting ! -----------------
agent_timesteps_total: 1204000
custom_metrics: {}
date: 2021-10-06_18-26-28
done: false
episode_len_mean: 696.4
episode_media: {}
episode_reward_max: 3.9149499999999993
episode_reward_mean: 0.36053800000000286
episode_reward_min: -2.2521999999999993
episodes_this_iter: 2
episodes_total: 867
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9060288071632385
        entropy_coeff: 0.0
        kl: 0.010377941653132439
        model: {}
        policy_loss: -0.016073590144515038
        total_loss: -0.004046086687594652
        vf_explained_var: 0.17883038520812988
        vf_loss: 0.009951913729310036
  num_agent_steps_sampled: 1204000
  num_agent_steps_trained: 1204000
  num_steps_sampled: 1204000
  num_steps_trained: 1204


----------------- Evaluation at steps:305 starting ! -----------------
agent_timesteps_total: 1224000
custom_metrics: {}
date: 2021-10-06_18-28-22
done: false
episode_len_mean: 709.37
episode_media: {}
episode_reward_max: 3.91
episode_reward_mean: 0.39922900000000333
episode_reward_min: -2.2521999999999993
episodes_this_iter: 3
episodes_total: 880
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9902704954147339
        entropy_coeff: 0.0
        kl: 0.009936320595443249
        model: {}
        policy_loss: -0.023835744708776474
        total_loss: 0.0030157878063619137
        vf_explained_var: 0.13891419768333435
        vf_loss: 0.02486426942050457
  num_agent_steps_sampled: 1224000
  num_agent_steps_trained: 1224000
  num_steps_sampled: 1224000
  num_steps_trained: 1224000
iterations


----------------- Evaluation at steps:310 starting ! -----------------
agent_timesteps_total: 1244000
custom_metrics: {}
date: 2021-10-06_18-29-40
done: false
episode_len_mean: 728.24
episode_media: {}
episode_reward_max: 3.9118
episode_reward_mean: 0.25843050000000334
episode_reward_min: -2.2193500000000004
episodes_this_iter: 4
episodes_total: 894
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.8922392725944519
        entropy_coeff: 0.0
        kl: 0.006075539160519838
        model: {}
        policy_loss: -0.015992075204849243
        total_loss: 0.006522668991237879
        vf_explained_var: 0.31123775243759155
        vf_loss: 0.021299632266163826
  num_agent_steps_sampled: 1244000
  num_agent_steps_trained: 1244000
  num_steps_sampled: 1244000
  num_steps_trained: 1244000
iteratio


----------------- Evaluation at steps:315 starting ! -----------------
agent_timesteps_total: 1264000
custom_metrics: {}
date: 2021-10-06_18-31-06
done: false
episode_len_mean: 753.29
episode_media: {}
episode_reward_max: 3.9118
episode_reward_mean: 0.15782900000000324
episode_reward_min: -2.2193500000000004
episodes_this_iter: 4
episodes_total: 907
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9946664571762085
        entropy_coeff: 0.0
        kl: 0.01144097838550806
        model: {}
        policy_loss: -0.026933657005429268
        total_loss: -0.00153116206638515
        vf_explained_var: -0.06427808851003647
        vf_loss: 0.023114297538995743
  num_agent_steps_sampled: 1264000
  num_agent_steps_trained: 1264000
  num_steps_sampled: 1264000
  num_steps_trained: 1264000
iteratio


----------------- Evaluation at steps:320 starting ! -----------------
agent_timesteps_total: 1284000
custom_metrics: {}
date: 2021-10-06_18-32-42
done: false
episode_len_mean: 746.97
episode_media: {}
episode_reward_max: 3.9118
episode_reward_mean: 0.09906500000000308
episode_reward_min: -2.2193500000000004
episodes_this_iter: 3
episodes_total: 921
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9226502776145935
        entropy_coeff: 0.0
        kl: 0.013201835565268993
        model: {}
        policy_loss: -0.026115747168660164
        total_loss: -0.009923763573169708
        vf_explained_var: 0.25564491748809814
        vf_loss: 0.013551619835197926
  num_agent_steps_sampled: 1284000
  num_agent_steps_trained: 1284000
  num_steps_sampled: 1284000
  num_steps_trained: 1284000
iterati


----------------- Evaluation at steps:325 starting ! -----------------
agent_timesteps_total: 1304000
custom_metrics: {}
date: 2021-10-06_18-34-25
done: false
episode_len_mean: 735.92
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.19275550000000283
episode_reward_min: -2.2116500000000014
episodes_this_iter: 3
episodes_total: 936
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9780256152153015
        entropy_coeff: 0.0
        kl: 0.013880361802875996
        model: {}
        policy_loss: -0.019379014149308205
        total_loss: 0.0030135244596749544
        vf_explained_var: 0.30734580755233765
        vf_loss: 0.019616467878222466
  num_agent_steps_sampled: 1304000
  num_agent_steps_trained: 1304000
  num_steps_sampled: 1304000
  num_steps_trained: 1304000
iterati


----------------- Evaluation at steps:330 starting ! -----------------
agent_timesteps_total: 1324000
custom_metrics: {}
date: 2021-10-06_18-36-13
done: false
episode_len_mean: 726.97
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.2733730000000029
episode_reward_min: -2.2116500000000014
episodes_this_iter: 2
episodes_total: 950
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0777276754379272
        entropy_coeff: 0.0
        kl: 0.012933915480971336
        model: {}
        policy_loss: -0.027204163372516632
        total_loss: -0.017102664336562157
        vf_explained_var: 0.10398470610380173
        vf_loss: 0.007514712400734425
  num_agent_steps_sampled: 1324000
  num_agent_steps_trained: 1324000
  num_steps_sampled: 1324000
  num_steps_trained: 1324000
iteratio


----------------- Evaluation at steps:335 starting ! -----------------
agent_timesteps_total: 1344000
custom_metrics: {}
date: 2021-10-06_18-37-40
done: false
episode_len_mean: 722.67
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.2644415000000027
episode_reward_min: -2.1951500000000026
episodes_this_iter: 3
episodes_total: 964
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9995517134666443
        entropy_coeff: 0.0
        kl: 0.01223970390856266
        model: {}
        policy_loss: -0.03365206718444824
        total_loss: -0.00784225482493639
        vf_explained_var: 0.0572686642408371
        vf_loss: 0.023361874744296074
  num_agent_steps_sampled: 1344000
  num_agent_steps_trained: 1344000
  num_steps_sampled: 1344000
  num_steps_trained: 1344000
iterations_s


----------------- Evaluation at steps:340 starting ! -----------------
agent_timesteps_total: 1364000
custom_metrics: {}
date: 2021-10-06_18-39-04
done: false
episode_len_mean: 722.63
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.13486100000000248
episode_reward_min: -2.1951500000000026
episodes_this_iter: 3
episodes_total: 977
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9721528887748718
        entropy_coeff: 0.0
        kl: 0.014047066681087017
        model: {}
        policy_loss: -0.03356276825070381
        total_loss: -0.01617853157222271
        vf_explained_var: 0.16919270157814026
        vf_loss: 0.014574822038412094
  num_agent_steps_sampled: 1364000
  num_agent_steps_trained: 1364000
  num_steps_sampled: 1364000
  num_steps_trained: 1364000
iteration


----------------- Evaluation at steps:345 starting ! -----------------
agent_timesteps_total: 1384000
custom_metrics: {}
date: 2021-10-06_18-40-40
done: false
episode_len_mean: 719.03
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.18618800000000232
episode_reward_min: -2.1951500000000026
episodes_this_iter: 3
episodes_total: 990
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9456949830055237
        entropy_coeff: 0.0
        kl: 0.013772363774478436
        model: {}
        policy_loss: -0.023626552894711494
        total_loss: 0.003119367640465498
        vf_explained_var: 0.5011782050132751
        vf_loss: 0.02399144321680069
  num_agent_steps_sampled: 1384000
  num_agent_steps_trained: 1384000
  num_steps_sampled: 1384000
  num_steps_trained: 1384000
iterations


----------------- Evaluation at steps:350 starting ! -----------------
agent_timesteps_total: 1404000
custom_metrics: {}
date: 2021-10-06_18-42-08
done: false
episode_len_mean: 711.99
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: -0.01689799999999772
episode_reward_min: -2.2079000000000004
episodes_this_iter: 3
episodes_total: 1004
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9795917272567749
        entropy_coeff: 0.0
        kl: 0.009158072993159294
        model: {}
        policy_loss: -0.021839383989572525
        total_loss: -0.007906002923846245
        vf_explained_var: 0.149654358625412
        vf_loss: 0.012101765722036362
  num_agent_steps_sampled: 1404000
  num_agent_steps_trained: 1404000
  num_steps_sampled: 1404000
  num_steps_trained: 1404000
iterati


----------------- Evaluation at steps:355 starting ! -----------------
agent_timesteps_total: 1424000
custom_metrics: {}
date: 2021-10-06_18-43-53
done: false
episode_len_mean: 700.57
episode_media: {}
episode_reward_max: 3.9193
episode_reward_mean: 0.054123000000002225
episode_reward_min: -2.2079000000000004
episodes_this_iter: 4
episodes_total: 1021
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.068932294845581
        entropy_coeff: 0.0
        kl: 0.014858436770737171
        model: {}
        policy_loss: -0.02398030087351799
        total_loss: -0.008202978409826756
        vf_explained_var: 0.4011589586734772
        vf_loss: 0.012805632315576077
  num_agent_steps_sampled: 1424000
  num_agent_steps_trained: 1424000
  num_steps_sampled: 1424000
  num_steps_trained: 1424000
iteratio


----------------- Evaluation at steps:360 starting ! -----------------
agent_timesteps_total: 1444000
custom_metrics: {}
date: 2021-10-06_18-45-26
done: false
episode_len_mean: 678.04
episode_media: {}
episode_reward_max: 3.9158
episode_reward_mean: 0.1646135000000022
episode_reward_min: -2.2079000000000004
episodes_this_iter: 5
episodes_total: 1039
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.106347918510437
        entropy_coeff: 0.0
        kl: 0.009590290486812592
        model: {}
        policy_loss: -0.025506481528282166
        total_loss: -0.0016668555326759815
        vf_explained_var: 0.40984493494033813
        vf_loss: 0.021921567618846893
  num_agent_steps_sampled: 1444000
  num_agent_steps_trained: 1444000
  num_steps_sampled: 1444000
  num_steps_trained: 1444000
iterati


----------------- Evaluation at steps:365 starting ! -----------------
agent_timesteps_total: 1464000
custom_metrics: {}
date: 2021-10-06_18-46-40
done: false
episode_len_mean: 648.45
episode_media: {}
episode_reward_max: 3.9186999999999985
episode_reward_mean: 0.5285935000000018
episode_reward_min: -2.2079000000000004
episodes_this_iter: 4
episodes_total: 1057
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0027074813842773
        entropy_coeff: 0.0
        kl: 0.011700115166604519
        model: {}
        policy_loss: -0.026403483003377914
        total_loss: -0.0007879795739427209
        vf_explained_var: 0.2121099978685379
        vf_loss: 0.023275481536984444
  num_agent_steps_sampled: 1464000
  num_agent_steps_trained: 1464000
  num_steps_sampled: 1464000
  num_steps_trained: 146


----------------- Evaluation at steps:370 starting ! -----------------
agent_timesteps_total: 1484000
custom_metrics: {}
date: 2021-10-06_18-48-04
done: false
episode_len_mean: 623.71
episode_media: {}
episode_reward_max: 3.9224500000000004
episode_reward_mean: 0.8638055000000017
episode_reward_min: -2.2079000000000004
episodes_this_iter: 4
episodes_total: 1074
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0349748134613037
        entropy_coeff: 0.0
        kl: 0.011393009684979916
        model: {}
        policy_loss: -0.022856030613183975
        total_loss: -0.0036086461041122675
        vf_explained_var: 0.3669031858444214
        vf_loss: 0.016968781128525734
  num_agent_steps_sampled: 1484000
  num_agent_steps_trained: 1484000
  num_steps_sampled: 1484000
  num_steps_trained: 148


----------------- Evaluation at steps:375 starting ! -----------------
agent_timesteps_total: 1504000
custom_metrics: {}
date: 2021-10-06_18-49-25
done: false
episode_len_mean: 611.46
episode_media: {}
episode_reward_max: 3.9224500000000004
episode_reward_mean: 0.9044000000000015
episode_reward_min: -2.2079000000000004
episodes_this_iter: 3
episodes_total: 1089
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.9684184193611145
        entropy_coeff: 0.0
        kl: 0.01474765408784151
        model: {}
        policy_loss: -0.03210223466157913
        total_loss: -0.018420018255710602
        vf_explained_var: 0.16606688499450684
        vf_loss: 0.010732680559158325
  num_agent_steps_sampled: 1504000
  num_agent_steps_trained: 1504000
  num_steps_sampled: 1504000
  num_steps_trained: 15040


----------------- Evaluation at steps:380 starting ! -----------------
agent_timesteps_total: 1524000
custom_metrics: {}
date: 2021-10-06_18-50-50
done: false
episode_len_mean: 588.57
episode_media: {}
episode_reward_max: 3.9224500000000004
episode_reward_mean: 1.2179120000000017
episode_reward_min: -2.2014000000000005
episodes_this_iter: 4
episodes_total: 1106
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.832496702671051
        entropy_coeff: 0.0
        kl: 0.007806989364326
        model: {}
        policy_loss: -0.020149407908320427
        total_loss: -0.0009435725514777005
        vf_explained_var: 0.25420957803726196
        vf_loss: 0.016863739117980003
  num_agent_steps_sampled: 1524000
  num_agent_steps_trained: 1524000
  num_steps_sampled: 1524000
  num_steps_trained: 152400


----------------- Evaluation at steps:385 starting ! -----------------
agent_timesteps_total: 1544000
custom_metrics: {}
date: 2021-10-06_18-52-11
done: false
episode_len_mean: 590.05
episode_media: {}
episode_reward_max: 3.9224500000000004
episode_reward_mean: 1.2280880000000014
episode_reward_min: -2.1920500000000045
episodes_this_iter: 4
episodes_total: 1123
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9710769653320312
        entropy_coeff: 0.0
        kl: 0.010801045224070549
        model: {}
        policy_loss: -0.024321451783180237
        total_loss: -0.0031003537587821484
        vf_explained_var: 0.527294397354126
        vf_loss: 0.017980780452489853
  num_agent_steps_sampled: 1544000
  num_agent_steps_trained: 1544000
  num_steps_sampled: 1544000
  num_steps_trained: 1544


----------------- Evaluation at steps:390 starting ! -----------------
agent_timesteps_total: 1564000
custom_metrics: {}
date: 2021-10-06_18-53-35
done: false
episode_len_mean: 605.37
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 1.2362100000000016
episode_reward_min: -2.203549999999998
episodes_this_iter: 2
episodes_total: 1137
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9840363264083862
        entropy_coeff: 0.0
        kl: 0.007637825794517994
        model: {}
        policy_loss: -0.013117647729814053
        total_loss: 0.0026864330284297466
        vf_explained_var: 0.06338957697153091
        vf_loss: 0.013512732461094856
  num_agent_steps_sampled: 1564000
  num_agent_steps_trained: 1564000
  num_steps_sampled: 1564000
  num_steps_trained: 1564


----------------- Evaluation at steps:395 starting ! -----------------
agent_timesteps_total: 1584000
custom_metrics: {}
date: 2021-10-06_18-55-05
done: false
episode_len_mean: 621.98
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.8135145000000017
episode_reward_min: -2.203549999999998
episodes_this_iter: 3
episodes_total: 1153
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9653265476226807
        entropy_coeff: 0.0
        kl: 0.009935555048286915
        model: {}
        policy_loss: -0.0242098867893219
        total_loss: 0.005471035838127136
        vf_explained_var: -0.03928935527801514
        vf_loss: 0.026700260117650032
  num_agent_steps_sampled: 1584000
  num_agent_steps_trained: 1584000
  num_steps_sampled: 1584000
  num_steps_trained: 158400


----------------- Evaluation at steps:400 starting ! -----------------
agent_timesteps_total: 1604000
custom_metrics: {}
date: 2021-10-06_18-56-41
done: false
episode_len_mean: 638.55
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.6221980000000022
episode_reward_min: -2.203549999999998
episodes_this_iter: 3
episodes_total: 1167
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9058849811553955
        entropy_coeff: 0.0
        kl: 0.010776178911328316
        model: {}
        policy_loss: -0.02442409284412861
        total_loss: -0.009676402434706688
        vf_explained_var: 0.381454199552536
        vf_loss: 0.011514837853610516
  num_agent_steps_sampled: 1604000
  num_agent_steps_trained: 1604000
  num_steps_sampled: 1604000
  num_steps_trained: 1604000


----------------- Evaluation at steps:405 starting ! -----------------
agent_timesteps_total: 1624000
custom_metrics: {}
date: 2021-10-06_18-58-07
done: false
episode_len_mean: 646.73
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.5195620000000022
episode_reward_min: -2.203549999999998
episodes_this_iter: 3
episodes_total: 1182
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.1304736137390137
        entropy_coeff: 0.0
        kl: 0.015001663938164711
        model: {}
        policy_loss: -0.029406515881419182
        total_loss: -0.010379455983638763
        vf_explained_var: -0.1264178454875946
        vf_loss: 0.014526561833918095
  num_agent_steps_sampled: 1624000
  num_agent_steps_trained: 1624000
  num_steps_sampled: 1624000
  num_steps_trained: 1624


----------------- Evaluation at steps:410 starting ! -----------------
agent_timesteps_total: 1644000
custom_metrics: {}
date: 2021-10-06_18-59-39
done: false
episode_len_mean: 648.07
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.33826350000000216
episode_reward_min: -2.207100000000003
episodes_this_iter: 2
episodes_total: 1197
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9855207800865173
        entropy_coeff: 0.0
        kl: 0.009100137278437614
        model: {}
        policy_loss: -0.024412840604782104
        total_loss: -0.005309777799993753
        vf_explained_var: 0.09420622885227203
        vf_loss: 0.01637302152812481
  num_agent_steps_sampled: 1644000
  num_agent_steps_trained: 1644000
  num_steps_sampled: 1644000
  num_steps_trained: 1644


----------------- Evaluation at steps:415 starting ! -----------------
agent_timesteps_total: 1664000
custom_metrics: {}
date: 2021-10-06_19-01-18
done: false
episode_len_mean: 672.39
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.1365905000000024
episode_reward_min: -2.207100000000003
episodes_this_iter: 3
episodes_total: 1211
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0238502025604248
        entropy_coeff: 0.0
        kl: 0.010953929275274277
        model: {}
        policy_loss: -0.03417437523603439
        total_loss: -0.01790071278810501
        vf_explained_var: 0.19095350801944733
        vf_loss: 0.01298748143017292
  num_agent_steps_sampled: 1664000
  num_agent_steps_trained: 1664000
  num_steps_sampled: 1664000
  num_steps_trained: 1664000


----------------- Evaluation at steps:420 starting ! -----------------
agent_timesteps_total: 1684000
custom_metrics: {}
date: 2021-10-06_19-02-52
done: false
episode_len_mean: 671.98
episode_media: {}
episode_reward_max: 3.9433000000000007
episode_reward_mean: 0.19726850000000237
episode_reward_min: -2.2245500000000025
episodes_this_iter: 4
episodes_total: 1227
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9399458765983582
        entropy_coeff: 0.0
        kl: 0.009832200594246387
        model: {}
        policy_loss: -0.022859634831547737
        total_loss: -0.001544756582006812
        vf_explained_var: 0.28958091139793396
        vf_loss: 0.01836521551012993
  num_agent_steps_sampled: 1684000
  num_agent_steps_trained: 1684000
  num_steps_sampled: 1684000
  num_steps_trained: 168


----------------- Evaluation at steps:425 starting ! -----------------
agent_timesteps_total: 1704000
custom_metrics: {}
date: 2021-10-06_19-04-32
done: false
episode_len_mean: 667.53
episode_media: {}
episode_reward_max: 3.93795
episode_reward_mean: 0.21691250000000217
episode_reward_min: -2.230000000000002
episodes_this_iter: 3
episodes_total: 1242
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.9291951656341553
        entropy_coeff: 0.0
        kl: 0.006946704350411892
        model: {}
        policy_loss: -0.02733270265161991
        total_loss: -0.017590949311852455
        vf_explained_var: 0.3825155794620514
        vf_loss: 0.007657742593437433
  num_agent_steps_sampled: 1704000
  num_agent_steps_trained: 1704000
  num_steps_sampled: 1704000
  num_steps_trained: 1704000
iteratio


----------------- Evaluation at steps:430 starting ! -----------------
agent_timesteps_total: 1724000
custom_metrics: {}
date: 2021-10-06_19-06-00
done: false
episode_len_mean: 673.2
episode_media: {}
episode_reward_max: 3.93795
episode_reward_mean: 0.19554250000000223
episode_reward_min: -2.230000000000002
episodes_this_iter: 3
episodes_total: 1257
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0157088041305542
        entropy_coeff: 0.0
        kl: 0.012608752585947514
        model: {}
        policy_loss: -0.039159681648015976
        total_loss: -0.022886211052536964
        vf_explained_var: 0.27243542671203613
        vf_loss: 0.012490840628743172
  num_agent_steps_sampled: 1724000
  num_agent_steps_trained: 1724000
  num_steps_sampled: 1724000
  num_steps_trained: 1724000
iterati


----------------- Evaluation at steps:435 starting ! -----------------
agent_timesteps_total: 1744000
custom_metrics: {}
date: 2021-10-06_19-07-17
done: false
episode_len_mean: 669.49
episode_media: {}
episode_reward_max: 3.9115
episode_reward_mean: 0.18538600000000216
episode_reward_min: -2.230000000000002
episodes_this_iter: 5
episodes_total: 1273
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 0.8777223825454712
        entropy_coeff: 0.0
        kl: 0.012036032043397427
        model: {}
        policy_loss: -0.02625945396721363
        total_loss: -0.014362511225044727
        vf_explained_var: 0.5634160041809082
        vf_loss: 0.008286130614578724
  num_agent_steps_sampled: 1744000
  num_agent_steps_trained: 1744000
  num_steps_sampled: 1744000
  num_steps_trained: 1744000
iteration


----------------- Evaluation at steps:440 starting ! -----------------
agent_timesteps_total: 1764000
custom_metrics: {}
date: 2021-10-06_19-08-36
done: false
episode_len_mean: 673.24
episode_media: {}
episode_reward_max: 3.9139500000000003
episode_reward_mean: 0.3372885000000022
episode_reward_min: -2.230000000000002
episodes_this_iter: 3
episodes_total: 1286
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0186048746109009
        entropy_coeff: 0.0
        kl: 0.011246994137763977
        model: {}
        policy_loss: -0.0248117595911026
        total_loss: -0.007170907687395811
        vf_explained_var: -0.2326037436723709
        vf_loss: 0.014266752637922764
  num_agent_steps_sampled: 1764000
  num_agent_steps_trained: 1764000
  num_steps_sampled: 1764000
  num_steps_trained: 176400


----------------- Evaluation at steps:445 starting ! -----------------
agent_timesteps_total: 1784000
custom_metrics: {}
date: 2021-10-06_19-10-22
done: false
episode_len_mean: 667.09
episode_media: {}
episode_reward_max: 3.9139500000000003
episode_reward_mean: 0.45752050000000233
episode_reward_min: -2.230000000000002
episodes_this_iter: 3
episodes_total: 1302
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.000773549079895
        entropy_coeff: 0.0
        kl: 0.00953199528157711
        model: {}
        policy_loss: -0.028478924185037613
        total_loss: -0.005739246495068073
        vf_explained_var: 0.019312385469675064
        vf_loss: 0.019880080595612526
  num_agent_steps_sampled: 1784000
  num_agent_steps_trained: 1784000
  num_steps_sampled: 1784000
  num_steps_trained: 1784


----------------- Evaluation at steps:450 starting ! -----------------
agent_timesteps_total: 1804000
custom_metrics: {}
date: 2021-10-06_19-12-10
done: false
episode_len_mean: 656.92
episode_media: {}
episode_reward_max: 3.9139500000000003
episode_reward_mean: 0.45860350000000233
episode_reward_min: -2.230000000000002
episodes_this_iter: 2
episodes_total: 1318
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0514553785324097
        entropy_coeff: 0.0
        kl: 0.00945884920656681
        model: {}
        policy_loss: -0.027965663000941277
        total_loss: -0.005753252189606428
        vf_explained_var: 0.017624225467443466
        vf_loss: 0.01937476173043251
  num_agent_steps_sampled: 1804000
  num_agent_steps_trained: 1804000
  num_steps_sampled: 1804000
  num_steps_trained: 1804


----------------- Evaluation at steps:455 starting ! -----------------
agent_timesteps_total: 1824000
custom_metrics: {}
date: 2021-10-06_19-13-32
done: false
episode_len_mean: 655.87
episode_media: {}
episode_reward_max: 3.9159000000000006
episode_reward_mean: 0.5319905000000024
episode_reward_min: -2.1819500000000005
episodes_this_iter: 3
episodes_total: 1333
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.03117036819458
        entropy_coeff: 0.0
        kl: 0.010514600202441216
        model: {}
        policy_loss: -0.028470611199736595
        total_loss: -0.015947911888360977
        vf_explained_var: 0.3836665749549866
        vf_loss: 0.009368319064378738
  num_agent_steps_sampled: 1824000
  num_agent_steps_trained: 1824000
  num_steps_sampled: 1824000
  num_steps_trained: 182400


----------------- Evaluation at steps:460 starting ! -----------------
agent_timesteps_total: 1844000
custom_metrics: {}
date: 2021-10-06_19-14-57
done: false
episode_len_mean: 652.67
episode_media: {}
episode_reward_max: 3.9159000000000006
episode_reward_mean: 0.6632550000000024
episode_reward_min: -2.2007499999999993
episodes_this_iter: 3
episodes_total: 1349
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0122690200805664
        entropy_coeff: 0.0
        kl: 0.014070413075387478
        model: {}
        policy_loss: -0.026031630113720894
        total_loss: -0.010506493970751762
        vf_explained_var: 0.289519339799881
        vf_loss: 0.01130401249974966
  num_agent_steps_sampled: 1844000
  num_agent_steps_trained: 1844000
  num_steps_sampled: 1844000
  num_steps_trained: 184400


----------------- Evaluation at steps:465 starting ! -----------------
agent_timesteps_total: 1864000
custom_metrics: {}
date: 2021-10-06_19-16-43
done: false
episode_len_mean: 663.38
episode_media: {}
episode_reward_max: 3.9159000000000006
episode_reward_mean: 0.7144060000000023
episode_reward_min: -2.2007499999999993
episodes_this_iter: 3
episodes_total: 1363
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0405025482177734
        entropy_coeff: 0.0
        kl: 0.01550863403826952
        model: {}
        policy_loss: -0.03338352218270302
        total_loss: -0.010600099340081215
        vf_explained_var: 0.2509498596191406
        vf_loss: 0.0181308314204216
  num_agent_steps_sampled: 1864000
  num_agent_steps_trained: 1864000
  num_steps_sampled: 1864000
  num_steps_trained: 1864000



----------------- Evaluation at steps:470 starting ! -----------------
agent_timesteps_total: 1884000
custom_metrics: {}
date: 2021-10-06_19-18-16
done: false
episode_len_mean: 671.39
episode_media: {}
episode_reward_max: 3.9159000000000006
episode_reward_mean: 0.47430550000000227
episode_reward_min: -2.2007499999999993
episodes_this_iter: 2
episodes_total: 1377
experiment_id: 0258895a28d64448a57f5261e765e07b
hostname: DESKTOP
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.30000001192092896
        cur_lr: 4.999999873689376e-05
        entropy: 1.0871529579162598
        entropy_coeff: 0.0
        kl: 0.01084133516997099
        model: {}
        policy_loss: -0.03509129211306572
        total_loss: -0.015090832486748695
        vf_explained_var: 0.06174226105213165
        vf_loss: 0.016748061403632164
  num_agent_steps_sampled: 1884000
  num_agent_steps_trained: 1884000
  num_steps_sampled: 1884000
  num_steps_trained: 1884