In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    !apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    %pip install -U tf-agents pyvirtualdisplay
    %pip install -U gym>=0.21.0
    %pip install -U gym[box2d,atari,accept-rom-license]

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
[33m0% [Waiting for headers] [Waiting for headers] [1 InRelease 3,626 B/3,626 B 100[0m[33m0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpad.net[0m[33m0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Conn[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
[33m0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
[33m0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [3 InRelease 14.2 kB/88.7 k[0m                                                                               Ign:4 https://developer.download.nvidia.com/compute/cuda/r

This was inspired by using a combination of labs, ray website, https://colab.research.google.com/github/ageron/handson-ml2/blob/master/18_reinforcement_learning.ipynb and https://www.anyscale.com/blog/an-introduction-to-reinforcement-learning-with-openai-gym-rllib-and-google

In [4]:
pip install ray==1.12.0


Collecting ray==1.12.0
  Downloading ray-1.12.0-cp37-cp37m-manylinux2014_x86_64.whl (53.2 MB)
[K     |████████████████████████████████| 53.2 MB 1.4 MB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 34.8 MB/s 
[?25hCollecting frozenlist
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 144 kB 49.0 MB/s 
[?25hCollecting virtualenv
  Downloading virtualenv-20.14.1-py2.py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 36.7 MB/s 
Collecting aiosignal
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting distlib<1,>=0.3.1
  Downloading distlib-0.3.4-py2.py3-none-any.whl (461 kB)
[K     |████████████████████████████████| 461 kB 47.8 MB/s 
[?25hCollecting platformdirs<3,>=2
  Downloading p

In [5]:
import psutil
import ray
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total

In [6]:
pip install gym==0.22

Collecting gym==0.22
  Downloading gym-0.22.0.tar.gz (631 kB)
[K     |████████████████████████████████| 631 kB 5.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.22.0-py3-none-any.whl size=708393 sha256=61e904c912c8ed730b61a47f6e775c7259094ab5a57cf6ae3e7a1dba1ea71331
  Stored in directory: /root/.cache/pip/wheels/7d/5e/87/7d50e0179edda70feff5bba05c381041e1c1fd80c6b06a4cc3
Successfully built gym
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.23.1
    Uninstalling gym-0.23.1:
      Successfully uninstalled gym-0.23.1
Successfully installed gym-0.22.0


In [7]:
pip install -U tensorboardx

Collecting tensorboardx
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 25.2 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 8.8 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 7.6 MB/s eta 0:00:01[K     |██████████▌                     | 40 kB 7.1 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 4.5 MB/s eta 0:00:01[K     |███████████████▊                | 61 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▎             | 71 kB 5.6 MB/s eta 0:00:01[K     |█████████████████████           | 81 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████▌        | 92 kB 4.8 MB/s eta 0:00:01[K     |██████████████████████████▏     | 102 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████▊   | 112 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████████████▍| 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 5.3 MB/s 
Inst

In [8]:
pip install lz4

Collecting lz4
  Downloading lz4-4.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 17.2 MB/s eta 0:00:01[K     |▌                               | 20 kB 18.7 MB/s eta 0:00:01[K     |▉                               | 30 kB 16.3 MB/s eta 0:00:01[K     |█                               | 40 kB 5.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.4 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.1 MB/s eta 0:00:01[K     |█▉                              | 71 kB 5.9 MB/s eta 0:00:01[K     |██▏                             | 81 kB 6.1 MB/s eta 0:00:01[K     |██▍                             | 92 kB 6.7 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.6 MB/s eta 0:00:01[K     |███                             | 112 kB 5.6 MB/s eta 0:00:01[K     |███▏                            | 122 kB 5.6 MB/s eta 0:00:01[K     |███▌                            

In [9]:
import gym
env = gym.make("CartPole-v0")


  f"The environment {path} is out of date. You should consider "


In [10]:
env.reset()

for i in range(20):

  # env.action_space.sample() produces either 0 (left) or 1 (right).
  observation, reward, done, info = env.step(env.action_space.sample())

  print("step", i, observation, reward, done, info)

env.close()

step 0 [ 0.04664953 -0.15030913 -0.0198714   0.3197851 ] 1.0 False {}
step 1 [ 0.04364334 -0.3451425  -0.0134757   0.60613567] 1.0 False {}
step 2 [ 0.03674049 -0.14983477 -0.00135299  0.30923888] 1.0 False {}
step 3 [ 0.0337438  -0.3449374   0.00483179  0.6014948 ] 1.0 False {}
step 4 [ 0.02684505 -0.14988337  0.01686169  0.31033772] 1.0 False {}
step 5 [ 0.02384738 -0.34524146  0.02306844  0.60829026] 1.0 False {}
step 6 [ 0.01694255 -0.5406782   0.03523425  0.9081489 ] 1.0 False {}
step 7 [ 0.00612899 -0.7362589   0.05339722  1.2116946 ] 1.0 False {}
step 8 [-0.00859619 -0.5418654   0.07763112  0.9362113 ] 1.0 False {}
step 9 [-0.0194335  -0.7379436   0.09635534  1.2522434 ] 1.0 False {}
step 10 [-0.03419237 -0.544179    0.12140021  0.9912294 ] 1.0 False {}
step 11 [-0.04507595 -0.74069804  0.1412248   1.3194424 ] 1.0 False {}
step 12 [-0.05988991 -0.54761547  0.16761364  1.0740842 ] 1.0 False {}
step 13 [-0.07084222 -0.74450773  0.18909533  1.4143316 ] 1.0 False {}
step 14 [-0.0857

  "You are calling 'step()' even though this "


In [11]:
# install dependencies needed for recording videos
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.*

Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.10).
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  x11-utils
0 upgraded, 1 newly installed, 0 to remove and 56 not upgraded.
Need to get 196 kB of archives.
After this operation, 650 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Fetched 196 kB in 1s (276 kB/s)
Selecting previously unselected package x11-utils.
(Reading database ... 157592 files and directories currently installed.)
Preparing to unpack .../x11-utils_7.7+3build1_amd64.deb ...
Unpacking x11-utils (7.7+3build1) ...
Setting up x11-utils (7.7+3build1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Collecting pyvirtualdisplay==0.2.*
  Downloading PyVirtualDisplay-0.2.5-py2.py3-none-any.whl (13 kB)
Collecting EasyProcess
  Downloading EasyProcess

In [12]:
from pyvirtualdisplay import Display
display = Display(visible=False, size=(1400, 900))
_ = display.start()

In [13]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder
before_training = "before_training.mp4"

video = VideoRecorder(env, before_training)
# returns an initial observation
env.reset()
for i in range(200):
  video.capture_frame()
  # env.action_space.sample() produces either 0 (left) or 1 (right).
  observation, reward, done, info = env.step(env.action_space.sample())
  # Not printing this time
  #print("step", i, observation, reward, done, info)

video.close()
env.close()

  "You are calling 'step()' even though this "


In [14]:
from base64 import b64encode
def render_mp4(videopath: str) -> str:
  """
  Gets a string containing a b4-encoded version of the MP4 video
  at the specified path.
  """
  mp4 = open(videopath, 'rb').read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  return f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'


In [15]:

from IPython.display import HTML
html = render_mp4(before_training)
HTML(html)

In [16]:
import ray
from ray.rllib.agents.ppo import PPOTrainer
config = {
    "env": "CartPole-v0",
    # Change the following line to `“framework”: “tf”` to use tensorflow
    "framework": "torch",
    "model": {
      "fcnet_hiddens": [32],
      "fcnet_activation": "linear",
    },
}
stop = {"episode_reward_mean": 195}
ray.shutdown()
ray.init(
  num_cpus=3,
  include_dashboard=False,
  ignore_reinit_error=True,
  log_to_driver=False,
)
# execute training 
analysis = ray.tune.run(
  "PPO",
  config=config,
  stop=stop,
  checkpoint_at_end=True,
)

2022-04-24 09:11:19,748	INFO trial_runner.py:803 -- starting PPO_CartPole-v0_80d83_00000


Trial name,status,loc
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156


Trial name,status,loc
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2022-04-24_09-11-44
  done: false
  episode_len_mean: 23.156976744186046
  episode_media: {}
  episode_reward_max: 69.0
  episode_reward_mean: 23.156976744186046
  episode_reward_min: 9.0
  episodes_this_iter: 172
  episodes_total: 172
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6839196100029894
          entropy_coeff: 0.0
          kl: 0.009778247770419302
          policy_loss: -0.020816732550500542
          total_loss: 9.190495002910655
          vf_explained_var: -0.0009381812105896652
          vf_loss: 9.209356138783116
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 4000
    num_agent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,1,8.63111,4000,23.157,69,9,23.157


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-04-24_09-11-53
  done: false
  episode_len_mean: 28.46808510638298
  episode_media: {}
  episode_reward_max: 113.0
  episode_reward_mean: 28.46808510638298
  episode_reward_min: 9.0
  episodes_this_iter: 141
  episodes_total: 313
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6581101596355439
          entropy_coeff: 0.0
          kl: 0.00839860154657178
          policy_loss: -0.020951973316409897
          total_loss: 9.28779222426876
          vf_explained_var: 0.00039148574234336935
          vf_loss: 9.30706446555353
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 8000
    num_agent_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,2,17.1627,8000,28.4681,113,9,28.4681


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2022-04-24_09-12-01
  done: false
  episode_len_mean: 36.716981132075475
  episode_media: {}
  episode_reward_max: 105.0
  episode_reward_mean: 36.716981132075475
  episode_reward_min: 9.0
  episodes_this_iter: 106
  episodes_total: 419
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6318089772296208
          entropy_coeff: 0.0
          kl: 0.0062570579861347
          policy_loss: -0.01412542988696406
          total_loss: 9.374547854802941
          vf_explained_var: -0.0026112207802393103
          vf_loss: 9.387421836647937
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 12000
    num_agent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,3,25.6591,12000,36.717,105,9,36.717


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-04-24_09-12-10
  done: false
  episode_len_mean: 52.82
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 52.82
  episode_reward_min: 12.0
  episodes_this_iter: 66
  episodes_total: 485
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6108681246798525
          entropy_coeff: 0.0
          kl: 0.004364847846616408
          policy_loss: -0.0177167674246174
          total_loss: 9.511695763885333
          vf_explained_var: 0.0001603471335544381
          vf_loss: 9.528539556072603
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,4,34.132,16000,52.82,200,12,52.82


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2022-04-24_09-12-19
  done: false
  episode_len_mean: 78.52
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 78.52
  episode_reward_min: 12.0
  episodes_this_iter: 34
  episodes_total: 519
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.10000000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6066805481269796
          entropy_coeff: 0.0
          kl: 0.0020329963465903697
          policy_loss: -0.010770571940848904
          total_loss: 9.668381600738854
          vf_explained_var: -0.011401888055186118
          vf_loss: 9.678948874114662
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,5,43.1047,20000,78.52,200,12,78.52


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-04-24_09-12-27
  done: false
  episode_len_mean: 100.59
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 100.59
  episode_reward_min: 14.0
  episodes_this_iter: 29
  episodes_total: 548
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.05000000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5995812813440958
          entropy_coeff: 0.0
          kl: 0.0020656794005375653
          policy_loss: -0.014189997303389735
          total_loss: 9.720596438582225
          vf_explained_var: 0.003428938183733212
          vf_loss: 9.73468310756068
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,6,51.5312,24000,100.59,200,14,100.59


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2022-04-24_09-12-36
  done: false
  episode_len_mean: 125.56
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 125.56
  episode_reward_min: 19.0
  episodes_this_iter: 24
  episodes_total: 572
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.025000000000000005
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5941716926713143
          entropy_coeff: 0.0
          kl: 0.0024069850937424096
          policy_loss: -0.013166029072336612
          total_loss: 9.724966759835521
          vf_explained_var: -0.0068615899291089785
          vf_loss: 9.73807262707782
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 2800

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,7,60.0251,28000,125.56,200,19,125.56


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-04-24_09-12-44
  done: false
  episode_len_mean: 143.11
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 143.11
  episode_reward_min: 23.0
  episodes_this_iter: 26
  episodes_total: 598
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.012500000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.586487994142758
          entropy_coeff: 0.0
          kl: 0.0007287884865551821
          policy_loss: -0.011276548745371
          total_loss: 9.690420924976308
          vf_explained_var: 0.002576983167279151
          vf_loss: 9.701688363475185
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,8,68.4377,32000,143.11,200,23,143.11


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2022-04-24_09-12-52
  done: false
  episode_len_mean: 158.46
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 158.46
  episode_reward_min: 23.0
  episodes_this_iter: 22
  episodes_total: 620
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.006250000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5851640323797862
          entropy_coeff: 0.0
          kl: 0.002799852460836454
          policy_loss: -0.011259971660191333
          total_loss: 9.697878653003324
          vf_explained_var: 0.005126863269395726
          vf_loss: 9.709121132922428
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,9,76.8015,36000,158.46,200,23,158.46


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-04-24_09-13-01
  done: false
  episode_len_mean: 168.14
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 168.14
  episode_reward_min: 31.0
  episodes_this_iter: 22
  episodes_total: 642
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0031250000000000006
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5794236979176921
          entropy_coeff: 0.0
          kl: 0.0016919468924543849
          policy_loss: -0.011348220141183946
          total_loss: 9.67035953767838
          vf_explained_var: -2.1851383229737642e-05
          vf_loss: 9.681702500004922
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,10,85.3933,40000,168.14,200,31,168.14


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2022-04-24_09-13-10
  done: false
  episode_len_mean: 173.72
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 173.72
  episode_reward_min: 31.0
  episodes_this_iter: 22
  episodes_total: 664
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0015625000000000003
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5790254623659196
          entropy_coeff: 0.0
          kl: 0.00040699922062195057
          policy_loss: -0.009263232823020669
          total_loss: 9.656951559743574
          vf_explained_var: 0.0009052336856883059
          vf_loss: 9.666214212807276
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,11,93.8813,44000,173.72,200,31,173.72


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-04-24_09-13-18
  done: false
  episode_len_mean: 178.77
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 178.77
  episode_reward_min: 31.0
  episodes_this_iter: 21
  episodes_total: 685
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0007812500000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.56957894441902
          entropy_coeff: 0.0
          kl: 0.0015169642715566487
          policy_loss: -0.01115473327777719
          total_loss: 9.671778877832557
          vf_explained_var: 0.004163157747637841
          vf_loss: 9.682932447618054
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 48000
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,12,102.424,48000,178.77,200,31,178.77


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 52000
  custom_metrics: {}
  date: 2022-04-24_09-13-27
  done: false
  episode_len_mean: 186.14
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.14
  episode_reward_min: 36.0
  episodes_this_iter: 21
  episodes_total: 706
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0003906250000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5753160252366015
          entropy_coeff: 0.0
          kl: 0.003192505415115012
          policy_loss: -0.011150707923857275
          total_loss: 9.659635301815566
          vf_explained_var: -0.02457510002197758
          vf_loss: 9.67078477695424
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 52000
    num_agent_steps_trained: 52000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,13,110.827,52000,186.14,200,36,186.14


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-04-24_09-13-35
  done: false
  episode_len_mean: 189.26
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.26
  episode_reward_min: 36.0
  episodes_this_iter: 20
  episodes_total: 726
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.00019531250000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5821328469502028
          entropy_coeff: 0.0
          kl: 0.002013003285206207
          policy_loss: -0.011152190756132847
          total_loss: 9.659624739616149
          vf_explained_var: -0.02332429321863318
          vf_loss: 9.670776602016982
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 56000
    num_agent_steps_trained: 5600

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,14,119.297,56000,189.26,200,36,189.26


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 60000
  custom_metrics: {}
  date: 2022-04-24_09-13-44
  done: false
  episode_len_mean: 186.4
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.4
  episode_reward_min: 26.0
  episodes_this_iter: 23
  episodes_total: 749
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.765625000000002e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5622739823274715
          entropy_coeff: 0.0
          kl: 0.002663161546694063
          policy_loss: -0.012205908185131448
          total_loss: 9.638259566727506
          vf_explained_var: 0.00523521868131494
          vf_loss: 9.65046529872443
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 60000
    num_agent_steps_trained: 60000
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,15,127.888,60000,186.4,200,26,186.4


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-04-24_09-13-52
  done: false
  episode_len_mean: 190.27
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 190.27
  episode_reward_min: 26.0
  episodes_this_iter: 20
  episodes_total: 769
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.882812500000001e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5574787519952302
          entropy_coeff: 0.0
          kl: 0.001753975090084643
          policy_loss: -0.009764933493989771
          total_loss: 9.65657841261997
          vf_explained_var: 0.005827916758034819
          vf_loss: 9.666343340309718
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 64000
    num_agent_steps_trained: 64000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,16,136.482,64000,190.27,200,26,190.27


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 68000
  custom_metrics: {}
  date: 2022-04-24_09-14-01
  done: false
  episode_len_mean: 191.24
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.24
  episode_reward_min: 26.0
  episodes_this_iter: 21
  episodes_total: 790
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.4414062500000005e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5503572053165846
          entropy_coeff: 0.0
          kl: 0.003918760887190871
          policy_loss: -0.011244714290143983
          total_loss: 9.632140828204411
          vf_explained_var: 0.001134013552819529
          vf_loss: 9.643385536439958
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 68000
    num_agent_steps_trained: 6800

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,17,144.952,68000,191.24,200,26,191.24


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-04-24_09-14-09
  done: false
  episode_len_mean: 190.12
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 190.12
  episode_reward_min: 26.0
  episodes_this_iter: 21
  episodes_total: 811
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.2207031250000002e-05
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5547392613144331
          entropy_coeff: 0.0
          kl: 0.002478076267506706
          policy_loss: -0.010638751840639499
          total_loss: 9.615228980074647
          vf_explained_var: -0.006971975359865414
          vf_loss: 9.625867731853198
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 72000
    num_agent_steps_trained: 720

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,18,153.358,72000,190.12,200,26,190.12


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 76000
  custom_metrics: {}
  date: 2022-04-24_09-14-18
  done: false
  episode_len_mean: 191.89
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.89
  episode_reward_min: 26.0
  episodes_this_iter: 21
  episodes_total: 832
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 6.103515625000001e-06
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5613882101992125
          entropy_coeff: 0.0
          kl: 0.002601680132097555
          policy_loss: -0.010177657599010135
          total_loss: 9.609634516316076
          vf_explained_var: 0.01579469653867906
          vf_loss: 9.619812157333538
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 76000
    num_agent_steps_trained: 76000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,19,161.88,76000,191.89,200,26,191.89


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-04-24_09-14-26
  done: false
  episode_len_mean: 193.46
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 193.46
  episode_reward_min: 126.0
  episodes_this_iter: 20
  episodes_total: 852
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.0517578125000006e-06
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5624343554178873
          entropy_coeff: 0.0
          kl: 5.71605358428249e-05
          policy_loss: -0.007414397085586223
          total_loss: 9.640259366394371
          vf_explained_var: -0.015875320537115937
          vf_loss: 9.647673763767365
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 80000
    num_agent_steps_trained: 80

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,20,170.302,80000,193.46,200,126,193.46


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 84000
  custom_metrics: {}
  date: 2022-04-24_09-14-35
  done: false
  episode_len_mean: 192.24
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.24
  episode_reward_min: 122.0
  episodes_this_iter: 21
  episodes_total: 873
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.5258789062500003e-06
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5690332411437906
          entropy_coeff: 0.0
          kl: 0.00145728788929186
          policy_loss: -0.008561740640891335
          total_loss: 9.591369492520569
          vf_explained_var: 0.008418364037749588
          vf_loss: 9.599931235979962
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 84000
    num_agent_steps_trained: 8400

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,21,178.687,84000,192.24,200,122,192.24


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-04-24_09-14-43
  done: false
  episode_len_mean: 191.59
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.59
  episode_reward_min: 122.0
  episodes_this_iter: 21
  episodes_total: 894
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 7.629394531250001e-07
          cur_lr: 5.0000000000000016e-05
          entropy: 0.567722192566882
          entropy_coeff: 0.0
          kl: 0.00289199062975686
          policy_loss: -0.009148495732956837
          total_loss: 9.580978888850058
          vf_explained_var: 0.0009784463913209977
          vf_loss: 9.590127391199912
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 88000
    num_agent_steps_trained: 88000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,22,187.236,88000,191.59,200,122,191.59


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 92000
  custom_metrics: {}
  date: 2022-04-24_09-14-52
  done: false
  episode_len_mean: 189.81
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.81
  episode_reward_min: 121.0
  episodes_this_iter: 22
  episodes_total: 916
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.814697265625001e-07
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5668606261412302
          entropy_coeff: 0.0
          kl: 0.0020208417508371895
          policy_loss: -0.007697106066650601
          total_loss: 9.552147306421752
          vf_explained_var: -0.015425552027199857
          vf_loss: 9.559844442593153
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 92000
    num_agent_steps_trained: 92

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,23,195.587,92000,189.81,200,121,189.81


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-04-24_09-15-00
  done: false
  episode_len_mean: 181.95
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 181.95
  episode_reward_min: 73.0
  episodes_this_iter: 25
  episodes_total: 941
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.9073486328125004e-07
          cur_lr: 5.0000000000000016e-05
          entropy: 0.558614698533089
          entropy_coeff: 0.0
          kl: 0.005347022793602956
          policy_loss: -0.01871847453457053
          total_loss: 9.596814995427286
          vf_explained_var: -0.1288243080339124
          vf_loss: 9.615533489309332
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 96000
    num_agent_steps_trained: 96000
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,24,204.023,96000,181.95,200,73,181.95


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 100000
  custom_metrics: {}
  date: 2022-04-24_09-15-09
  done: false
  episode_len_mean: 179.42
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 179.42
  episode_reward_min: 73.0
  episodes_this_iter: 22
  episodes_total: 963
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.9073486328125004e-07
          cur_lr: 5.0000000000000016e-05
          entropy: 0.572360090350592
          entropy_coeff: 0.0
          kl: 0.00316656577069303
          policy_loss: -0.009217159061502386
          total_loss: 9.586631885651618
          vf_explained_var: -0.02613584373586921
          vf_loss: 9.595849066908642
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 100000
    num_agent_steps_trained: 1000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,25,212.357,100000,179.42,200,73,179.42


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-04-24_09-15-17
  done: false
  episode_len_mean: 178.66
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 178.66
  episode_reward_min: 73.0
  episodes_this_iter: 20
  episodes_total: 983
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.536743164062502e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5680396663886245
          entropy_coeff: 0.0
          kl: 0.0012103082405986456
          policy_loss: -0.008785517565825934
          total_loss: 9.674835549631426
          vf_explained_var: -0.013925858466855942
          vf_loss: 9.683621109172861
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 104000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,26,220.801,104000,178.66,200,73,178.66


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 108000
  custom_metrics: {}
  date: 2022-04-24_09-15-26
  done: false
  episode_len_mean: 179.49
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 179.49
  episode_reward_min: 73.0
  episodes_this_iter: 22
  episodes_total: 1005
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.768371582031251e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5691961933207769
          entropy_coeff: 0.0
          kl: 0.00025335783376250436
          policy_loss: -0.006658322848780181
          total_loss: 9.583373374323692
          vf_explained_var: 0.07805346808125896
          vf_loss: 9.590031698698638
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 108000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,27,229.23,108000,179.49,200,73,179.49


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-04-24_09-15-34
  done: false
  episode_len_mean: 182.64
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 182.64
  episode_reward_min: 73.0
  episodes_this_iter: 22
  episodes_total: 1027
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.3841857910156255e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5647007225021239
          entropy_coeff: 0.0
          kl: 0.0017369103776414922
          policy_loss: -0.007693417276686398
          total_loss: 9.558698248094128
          vf_explained_var: 0.05542719453893682
          vf_loss: 9.566391650579309
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 112000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,28,237.752,112000,182.64,200,73,182.64


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 116000
  custom_metrics: {}
  date: 2022-04-24_09-15-43
  done: false
  episode_len_mean: 183.56
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 183.56
  episode_reward_min: 25.0
  episodes_this_iter: 23
  episodes_total: 1050
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.1920928955078127e-08
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5602951606755615
          entropy_coeff: 0.0
          kl: 0.0033025067302891106
          policy_loss: -0.007693423643227546
          total_loss: 9.506006798692928
          vf_explained_var: 0.07515301781315957
          vf_loss: 9.513700204254478
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 116000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,29,246.219,116000,183.56,200,25,183.56


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-04-24_09-15-51
  done: false
  episode_len_mean: 185.45
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 185.45
  episode_reward_min: 25.0
  episodes_this_iter: 20
  episodes_total: 1070
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.960464477539064e-09
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5662627904004948
          entropy_coeff: 0.0
          kl: 0.0002891961488881297
          policy_loss: -0.005806333413447744
          total_loss: 9.585081822385071
          vf_explained_var: 0.010686926623826386
          vf_loss: 9.590888161813059
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 120000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,30,254.695,120000,185.45,200,25,185.45


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 124000
  custom_metrics: {}
  date: 2022-04-24_09-16-00
  done: false
  episode_len_mean: 186.27
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.27
  episode_reward_min: 22.0
  episodes_this_iter: 21
  episodes_total: 1091
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.980232238769532e-09
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5684262174431995
          entropy_coeff: 0.0
          kl: 0.0010512359856061939
          policy_loss: -0.007796977707735633
          total_loss: 9.567833448225452
          vf_explained_var: 0.007319008663136472
          vf_loss: 9.575630431021413
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 124000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,31,263.255,124000,186.27,200,22,186.27


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-04-24_09-16-08
  done: false
  episode_len_mean: 188.02
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.02
  episode_reward_min: 22.0
  episodes_this_iter: 21
  episodes_total: 1112
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.490116119384766e-09
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5838359405276596
          entropy_coeff: 0.0
          kl: 0.001517433209312169
          policy_loss: -0.010925018904550422
          total_loss: 9.58956216996716
          vf_explained_var: 0.014379340538414576
          vf_loss: 9.600487201444563
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 128000
    num_agent_steps_trained: 128

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,32,271.775,128000,188.02,200,22,188.02


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 132000
  custom_metrics: {}
  date: 2022-04-24_09-16-17
  done: false
  episode_len_mean: 186.95
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.95
  episode_reward_min: 22.0
  episodes_this_iter: 22
  episodes_total: 1134
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 7.45058059692383e-10
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5792416502711594
          entropy_coeff: 0.0
          kl: 0.0016636862316673489
          policy_loss: -0.005920381160072421
          total_loss: 9.555989306460145
          vf_explained_var: 0.03983911968046619
          vf_loss: 9.561909707387288
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 132000
    num_agent_steps_trained: 132

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,33,280.33,132000,186.95,200,22,186.95


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-04-24_09-16-25
  done: false
  episode_len_mean: 192.39
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.39
  episode_reward_min: 22.0
  episodes_this_iter: 20
  episodes_total: 1154
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.725290298461915e-10
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5781540152206216
          entropy_coeff: 0.0
          kl: 0.0019049488690346837
          policy_loss: -0.00526002020705291
          total_loss: 9.481499882154568
          vf_explained_var: 0.04099181678987319
          vf_loss: 9.486759924119518
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 136000
    num_agent_steps_trained: 136

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,34,288.882,136000,192.39,200,22,192.39


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 140000
  custom_metrics: {}
  date: 2022-04-24_09-16-34
  done: false
  episode_len_mean: 188.83
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.83
  episode_reward_min: 22.0
  episodes_this_iter: 22
  episodes_total: 1176
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.8626451492309574e-10
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5609233637650808
          entropy_coeff: 0.0
          kl: 0.001462210041786328
          policy_loss: -0.0052856561337267195
          total_loss: 9.491655298458632
          vf_explained_var: 0.054971131970805505
          vf_loss: 9.496940951193533
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 140000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,35,297.428,140000,188.83,200,22,188.83


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-04-24_09-16-43
  done: false
  episode_len_mean: 187.54
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 187.54
  episode_reward_min: 42.0
  episodes_this_iter: 22
  episodes_total: 1198
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.313225746154787e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5658022277457739
          entropy_coeff: 0.0
          kl: 0.0006242887538240258
          policy_loss: -0.003431901664182704
          total_loss: 9.522793139180829
          vf_explained_var: 0.056476282240242086
          vf_loss: 9.526225042855868
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 144000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,36,305.941,144000,187.54,200,42,187.54


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 148000
  custom_metrics: {}
  date: 2022-04-24_09-16-51
  done: false
  episode_len_mean: 185.16
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 185.16
  episode_reward_min: 42.0
  episodes_this_iter: 22
  episodes_total: 1220
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.6566128730773935e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5555374029823529
          entropy_coeff: 0.0
          kl: 0.00035504934320858066
          policy_loss: -0.009292597603553566
          total_loss: 9.483041994033321
          vf_explained_var: -0.005578195215553366
          vf_loss: 9.492334602725121
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 148000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,37,314.407,148000,185.16,200,42,185.16


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-04-24_09-17-00
  done: false
  episode_len_mean: 186.46
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.46
  episode_reward_min: 42.0
  episodes_this_iter: 20
  episodes_total: 1240
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.3283064365386967e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.570781216057398
          entropy_coeff: 0.0
          kl: 0.001072348294300299
          policy_loss: -0.0034508399024445525
          total_loss: 9.488938189065584
          vf_explained_var: 0.0240933935488424
          vf_loss: 9.492389025739444
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 152000
    num_agent_steps_trained: 152

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,38,322.868,152000,186.46,200,42,186.46


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 156000
  custom_metrics: {}
  date: 2022-04-24_09-17-08
  done: false
  episode_len_mean: 185.72
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 185.72
  episode_reward_min: 27.0
  episodes_this_iter: 23
  episodes_total: 1263
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.1641532182693484e-11
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5677070617675781
          entropy_coeff: 0.0
          kl: 0.0013173030773421174
          policy_loss: -0.002490641094583978
          total_loss: 9.477876507338657
          vf_explained_var: -0.12742194898666875
          vf_loss: 9.480367120107015
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 156000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,39,331.396,156000,185.72,200,27,185.72


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-04-24_09-17-17
  done: false
  episode_len_mean: 188.01
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.01
  episode_reward_min: 27.0
  episodes_this_iter: 20
  episodes_total: 1283
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.820766091346742e-12
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5654508854753227
          entropy_coeff: 0.0
          kl: 3.149049012599877e-05
          policy_loss: -0.0014007880602793026
          total_loss: 9.428747105342085
          vf_explained_var: -0.1048680753477158
          vf_loss: 9.430147892941712
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 160000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,40,339.929,160000,188.01,200,27,188.01


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 164000
  custom_metrics: {}
  date: 2022-04-24_09-17-25
  done: false
  episode_len_mean: 189.25
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.25
  episode_reward_min: 27.0
  episodes_this_iter: 21
  episodes_total: 1304
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.910383045673371e-12
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5689993535959592
          entropy_coeff: 0.0
          kl: 0.002951876988499978
          policy_loss: -0.0031461782212699615
          total_loss: 9.393226293338243
          vf_explained_var: -0.05793058397949383
          vf_loss: 9.396372469009892
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 164000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,41,348.392,164000,189.25,200,27,189.25


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 168000
  custom_metrics: {}
  date: 2022-04-24_09-17-34
  done: false
  episode_len_mean: 188.64
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.64
  episode_reward_min: 27.0
  episodes_this_iter: 21
  episodes_total: 1325
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.4551915228366855e-12
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5558628489894252
          entropy_coeff: 0.0
          kl: 0.00044432568362390897
          policy_loss: 0.0001707740438481172
          total_loss: 9.478968351630755
          vf_explained_var: -0.21728787928499202
          vf_loss: 9.478797568044355
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 168000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,42,356.772,168000,188.64,200,27,188.64


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 172000
  custom_metrics: {}
  date: 2022-04-24_09-17-42
  done: false
  episode_len_mean: 191.67
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.67
  episode_reward_min: 131.0
  episodes_this_iter: 21
  episodes_total: 1346
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 7.275957614183427e-13
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5646834116469147
          entropy_coeff: 0.0
          kl: 0.001484836862967142
          policy_loss: -0.0017234864174037851
          total_loss: 9.429816459327617
          vf_explained_var: -0.14129817287127178
          vf_loss: 9.4315399354504
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 172000
    num_agent_steps_trained: 17

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,43,365.272,172000,191.67,200,131,191.67


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 176000
  custom_metrics: {}
  date: 2022-04-24_09-17-51
  done: false
  episode_len_mean: 191.31
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.31
  episode_reward_min: 127.0
  episodes_this_iter: 22
  episodes_total: 1368
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.6379788070917137e-13
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5524164936875784
          entropy_coeff: 0.0
          kl: 0.0004347559752113507
          policy_loss: -0.006042644545756361
          total_loss: 9.481778462727865
          vf_explained_var: -0.16947289295093987
          vf_loss: 9.487821132906022
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 176000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,44,373.743,176000,191.31,200,127,191.31


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 180000
  custom_metrics: {}
  date: 2022-04-24_09-17-59
  done: false
  episode_len_mean: 192.16
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.16
  episode_reward_min: 127.0
  episodes_this_iter: 20
  episodes_total: 1388
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.8189894035458568e-13
          cur_lr: 5.0000000000000016e-05
          entropy: 0.571206086489462
          entropy_coeff: 0.0
          kl: 0.00047820868277194964
          policy_loss: -0.004297135241570011
          total_loss: 9.350988490607149
          vf_explained_var: -0.12084831569784431
          vf_loss: 9.355285653760356
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 180000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,45,382.237,180000,192.16,200,127,192.16


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 184000
  custom_metrics: {}
  date: 2022-04-24_09-18-08
  done: false
  episode_len_mean: 191.85
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 191.85
  episode_reward_min: 127.0
  episodes_this_iter: 20
  episodes_total: 1408
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.094947017729284e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5719224590127187
          entropy_coeff: 0.0
          kl: 0.00035613183837561327
          policy_loss: 0.0016725645470683293
          total_loss: 9.250369630834108
          vf_explained_var: -0.13478652155527504
          vf_loss: 9.248697051181589
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 184000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,46,390.767,184000,191.85,200,127,191.85


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 188000
  custom_metrics: {}
  date: 2022-04-24_09-18-16
  done: false
  episode_len_mean: 194.45
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 194.45
  episode_reward_min: 127.0
  episodes_this_iter: 20
  episodes_total: 1428
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.547473508864642e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5722068511670636
          entropy_coeff: 0.0
          kl: 0.00045525014773053677
          policy_loss: 0.0010879357655843099
          total_loss: 9.425004536618468
          vf_explained_var: -0.17635956374547815
          vf_loss: 9.423916596238332
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 188000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,47,399.305,188000,194.45,200,127,194.45


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 192000
  custom_metrics: {}
  date: 2022-04-24_09-18-25
  done: false
  episode_len_mean: 193.73
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 193.73
  episode_reward_min: 127.0
  episodes_this_iter: 21
  episodes_total: 1449
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.273736754432321e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.578575655465485
          entropy_coeff: 0.0
          kl: 0.0025631012530251502
          policy_loss: -0.005469098724725265
          total_loss: 9.585345767646707
          vf_explained_var: -0.2471023113496842
          vf_loss: 9.590814879632765
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 192000
    num_agent_steps_trained: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,48,407.858,192000,193.73,200,127,193.73


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 196000
  custom_metrics: {}
  date: 2022-04-24_09-18-33
  done: false
  episode_len_mean: 192.67
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.67
  episode_reward_min: 139.0
  episodes_this_iter: 22
  episodes_total: 1471
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.1368683772161605e-14
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5793785535520123
          entropy_coeff: 0.0
          kl: 0.0006892615349706058
          policy_loss: 0.0012484980296463735
          total_loss: 9.354372935654014
          vf_explained_var: -0.28513176146373953
          vf_loss: 9.353124482144592
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 196000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,49,416.384,196000,192.67,200,139,192.67


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 200000
  custom_metrics: {}
  date: 2022-04-24_09-18-42
  done: false
  episode_len_mean: 189.01
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.01
  episode_reward_min: 98.0
  episodes_this_iter: 22
  episodes_total: 1493
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.6843418860808026e-15
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5751144515570774
          entropy_coeff: 0.0
          kl: 0.0005385970526518931
          policy_loss: -0.004146970231686869
          total_loss: 9.379499357490129
          vf_explained_var: -0.2730767379524887
          vf_loss: 9.383646313862133
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 200000
    num_agent_steps_trained: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,50,424.888,200000,189.01,200,98,189.01


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 204000
  custom_metrics: {}
  date: 2022-04-24_09-18-51
  done: false
  episode_len_mean: 186.6
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 186.6
  episode_reward_min: 98.0
  episodes_this_iter: 22
  episodes_total: 1515
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.8421709430404013e-15
          cur_lr: 5.0000000000000016e-05
          entropy: 0.577980641780361
          entropy_coeff: 0.0
          kl: 0.0030553529398679134
          policy_loss: -0.007904138968836877
          total_loss: 9.452188515406784
          vf_explained_var: -0.2579213676914092
          vf_loss: 9.460092637872183
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 204000
    num_agent_steps_trained: 2040

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,51,433.408,204000,186.6,200,98,186.6


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 208000
  custom_metrics: {}
  date: 2022-04-24_09-18-59
  done: false
  episode_len_mean: 179.35
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 179.35
  episode_reward_min: 98.0
  episodes_this_iter: 24
  episodes_total: 1539
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.4210854715202006e-15
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5762204343272793
          entropy_coeff: 0.0
          kl: 0.0028698095941400406
          policy_loss: 0.0017025529487078549
          total_loss: 9.281833611765215
          vf_explained_var: -0.31574045676057055
          vf_loss: 9.280131033415435
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 208000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,52,441.952,208000,179.35,200,98,179.35


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 212000
  custom_metrics: {}
  date: 2022-04-24_09-19-08
  done: false
  episode_len_mean: 177.97
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 177.97
  episode_reward_min: 98.0
  episodes_this_iter: 22
  episodes_total: 1561
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 7.105427357601003e-16
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5734012847305626
          entropy_coeff: 0.0
          kl: 0.00320931537244541
          policy_loss: -0.0011741512825572363
          total_loss: 9.21807260667124
          vf_explained_var: -0.2954997319047169
          vf_loss: 9.219246733573176
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 212000
    num_agent_steps_trained: 2120

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,53,450.353,212000,177.97,200,98,177.97


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 216000
  custom_metrics: {}
  date: 2022-04-24_09-19-16
  done: false
  episode_len_mean: 180.08
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 180.08
  episode_reward_min: 121.0
  episodes_this_iter: 21
  episodes_total: 1582
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.5527136788005016e-16
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5761338897289768
          entropy_coeff: 0.0
          kl: 0.0031979973077282415
          policy_loss: -0.0020300497255858873
          total_loss: 9.423355493237896
          vf_explained_var: -0.17377544199266742
          vf_loss: 9.425385564373386
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 216000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,54,458.92,216000,180.08,200,121,180.08


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 220000
  custom_metrics: {}
  date: 2022-04-24_09-19-25
  done: false
  episode_len_mean: 180.3
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 180.3
  episode_reward_min: 73.0
  episodes_this_iter: 21
  episodes_total: 1603
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.7763568394002508e-16
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5779854436715444
          entropy_coeff: 0.0
          kl: 0.0005758779365542789
          policy_loss: 0.0024359838135780826
          total_loss: 9.316075578299902
          vf_explained_var: -0.38024312027039064
          vf_loss: 9.313639599277128
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 220000
    num_agent_steps_trained: 22

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,55,467.397,220000,180.3,200,73,180.3


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 224000
  custom_metrics: {}
  date: 2022-04-24_09-19-33
  done: false
  episode_len_mean: 182.4
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 182.4
  episode_reward_min: 23.0
  episodes_this_iter: 21
  episodes_total: 1624
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 8.881784197001254e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5783438412092066
          entropy_coeff: 0.0
          kl: 0.00023320565779565643
          policy_loss: 0.003085373680517879
          total_loss: 9.38633152490021
          vf_explained_var: -0.4707480283834601
          vf_loss: 9.383246146991688
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 224000
    num_agent_steps_trained: 22400

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,56,475.956,224000,182.4,200,23,182.4


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 228000
  custom_metrics: {}
  date: 2022-04-24_09-19-42
  done: false
  episode_len_mean: 187.24
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 187.24
  episode_reward_min: 23.0
  episodes_this_iter: 22
  episodes_total: 1646
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.440892098500627e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5751623835614933
          entropy_coeff: 0.0
          kl: 0.0028390353274866604
          policy_loss: 8.045964303516572e-05
          total_loss: 9.168579373821135
          vf_explained_var: -0.5618781067991769
          vf_loss: 9.168498912421606
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 228000
    num_agent_steps_trained: 22

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,57,484.424,228000,187.24,200,23,187.24


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 232000
  custom_metrics: {}
  date: 2022-04-24_09-19-50
  done: false
  episode_len_mean: 188.65
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.65
  episode_reward_min: 23.0
  episodes_this_iter: 21
  episodes_total: 1667
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.2204460492503135e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5861041665077209
          entropy_coeff: 0.0
          kl: 0.0030544035560643233
          policy_loss: 0.002638512377136497
          total_loss: 9.180256869203301
          vf_explained_var: -0.4779194654316031
          vf_loss: 9.177618332319362
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 232000
    num_agent_steps_trained: 23

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,58,492.906,232000,188.65,200,23,188.65


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 236000
  custom_metrics: {}
  date: 2022-04-24_09-19-59
  done: false
  episode_len_mean: 185.99
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 185.99
  episode_reward_min: 23.0
  episodes_this_iter: 23
  episodes_total: 1690
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.1102230246251568e-17
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5863507937359553
          entropy_coeff: 0.0
          kl: 0.003426102894368321
          policy_loss: 0.005568124538147322
          total_loss: 9.021968883596442
          vf_explained_var: -0.558405287099141
          vf_loss: 9.016400734583536
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 236000
    num_agent_steps_trained: 2360

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,59,501.422,236000,185.99,200,23,185.99


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 240000
  custom_metrics: {}
  date: 2022-04-24_09-20-07
  done: false
  episode_len_mean: 181.07
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 181.07
  episode_reward_min: 23.0
  episodes_this_iter: 23
  episodes_total: 1713
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.551115123125784e-18
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5768563134054984
          entropy_coeff: 0.0
          kl: 0.001212835624017105
          policy_loss: -0.008208702401488379
          total_loss: 9.215693051840669
          vf_explained_var: -0.5698948378844928
          vf_loss: 9.223901762500885
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 240000
    num_agent_steps_trained: 240

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,60,510.005,240000,181.07,200,23,181.07


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 244000
  custom_metrics: {}
  date: 2022-04-24_09-20-16
  done: false
  episode_len_mean: 173.14
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 173.14
  episode_reward_min: 40.0
  episodes_this_iter: 26
  episodes_total: 1739
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.775557561562892e-18
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5765879242010014
          entropy_coeff: 0.0
          kl: 0.002554927044133164
          policy_loss: 0.001292708299813732
          total_loss: 9.10390821528691
          vf_explained_var: -0.6374127469396078
          vf_loss: 9.102615517954673
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 244000
    num_agent_steps_trained: 24400

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,61,518.486,244000,173.14,200,40,173.14


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 248000
  custom_metrics: {}
  date: 2022-04-24_09-20-24
  done: false
  episode_len_mean: 170.07
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 170.07
  episode_reward_min: 40.0
  episodes_this_iter: 22
  episodes_total: 1761
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.387778780781446e-18
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5841460552907759
          entropy_coeff: 0.0
          kl: 0.0018236408033486865
          policy_loss: 0.003939533714325197
          total_loss: 8.984915590798982
          vf_explained_var: -0.5282676070608119
          vf_loss: 8.980976045772593
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 248000
    num_agent_steps_trained: 248

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,62,526.819,248000,170.07,200,40,170.07


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 252000
  custom_metrics: {}
  date: 2022-04-24_09-20-33
  done: false
  episode_len_mean: 162.98
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 162.98
  episode_reward_min: 100.0
  episodes_this_iter: 26
  episodes_total: 1787
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 6.93889390390723e-19
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5833410634148506
          entropy_coeff: 0.0
          kl: 0.002125195291586068
          policy_loss: 0.005782305345099459
          total_loss: 9.170904133909492
          vf_explained_var: -0.6178305671420149
          vf_loss: 9.165121810666976
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 252000
    num_agent_steps_trained: 2520

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,63,535.355,252000,162.98,200,100,162.98


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 256000
  custom_metrics: {}
  date: 2022-04-24_09-20-41
  done: false
  episode_len_mean: 161.44
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 161.44
  episode_reward_min: 100.0
  episodes_this_iter: 25
  episodes_total: 1812
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.469446951953615e-19
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5795802759867843
          entropy_coeff: 0.0
          kl: 0.0008766694211577043
          policy_loss: 0.007054938381958392
          total_loss: 9.1859718215081
          vf_explained_var: -0.6726752779817069
          vf_loss: 9.178916850654028
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 256000
    num_agent_steps_trained: 2560

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,64,543.821,256000,161.44,200,100,161.44


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 260000
  custom_metrics: {}
  date: 2022-04-24_09-20-50
  done: false
  episode_len_mean: 166.7
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 166.7
  episode_reward_min: 85.0
  episodes_this_iter: 22
  episodes_total: 1834
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.7347234759768074e-19
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5896087288215596
          entropy_coeff: 0.0
          kl: 0.0002552454950774454
          policy_loss: 0.00563842246949833
          total_loss: 8.556963749854795
          vf_explained_var: -0.423495287728566
          vf_loss: 8.55132532376115
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 260000
    num_agent_steps_trained: 260000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,65,552.328,260000,166.7,200,85,166.7


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 264000
  custom_metrics: {}
  date: 2022-04-24_09-20-59
  done: false
  episode_len_mean: 164.52
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 164.52
  episode_reward_min: 64.0
  episodes_this_iter: 25
  episodes_total: 1859
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 8.673617379884037e-20
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5832664574987145
          entropy_coeff: 0.0
          kl: 0.0003662909070381057
          policy_loss: 0.004230213741100924
          total_loss: 8.551849018630161
          vf_explained_var: -0.4954159342473553
          vf_loss: 8.547618835203108
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 264000
    num_agent_steps_trained: 264

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,66,560.842,264000,164.52,200,64,164.52


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 268000
  custom_metrics: {}
  date: 2022-04-24_09-21-07
  done: false
  episode_len_mean: 170.07
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 170.07
  episode_reward_min: 64.0
  episodes_this_iter: 23
  episodes_total: 1882
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.3368086899420186e-20
          cur_lr: 5.0000000000000016e-05
          entropy: 0.582492329741037
          entropy_coeff: 0.0
          kl: 0.00035804752781384917
          policy_loss: 0.007500285398896023
          total_loss: 8.74727355023866
          vf_explained_var: -0.5866491427985571
          vf_loss: 8.739773249882523
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 268000
    num_agent_steps_trained: 268

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,67,569.297,268000,170.07,200,64,170.07


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 272000
  custom_metrics: {}
  date: 2022-04-24_09-21-16
  done: false
  episode_len_mean: 170.42
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 170.42
  episode_reward_min: 64.0
  episodes_this_iter: 24
  episodes_total: 1906
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.1684043449710093e-20
          cur_lr: 5.0000000000000016e-05
          entropy: 0.589167522102274
          entropy_coeff: 0.0
          kl: 0.004339234827223342
          policy_loss: 0.002880404486511183
          total_loss: 8.768358557711366
          vf_explained_var: -0.42745801332176375
          vf_loss: 8.765478142871652
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 272000
    num_agent_steps_trained: 272

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,68,577.822,272000,170.42,200,64,170.42


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 276000
  custom_metrics: {}
  date: 2022-04-24_09-21-25
  done: false
  episode_len_mean: 168.18
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 168.18
  episode_reward_min: 64.0
  episodes_this_iter: 24
  episodes_total: 1930
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0842021724855046e-20
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5872558982141556
          entropy_coeff: 0.0
          kl: 0.00026275159880303144
          policy_loss: 0.008031877092215964
          total_loss: 8.505886462426954
          vf_explained_var: -0.6362809618955018
          vf_loss: 8.497854576572296
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 276000
    num_agent_steps_trained: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,69,586.778,276000,168.18,200,64,168.18


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 280000
  custom_metrics: {}
  date: 2022-04-24_09-21-33
  done: false
  episode_len_mean: 165.94
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 165.94
  episode_reward_min: 64.0
  episodes_this_iter: 26
  episodes_total: 1956
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.421010862427523e-21
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5920108074142086
          entropy_coeff: 0.0
          kl: 0.0033549051364427426
          policy_loss: 0.005642980521404615
          total_loss: 8.481618097777007
          vf_explained_var: -0.5640326612739153
          vf_loss: 8.475975097635741
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 280000
    num_agent_steps_trained: 280

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,70,595.23,280000,165.94,200,64,165.94


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 284000
  custom_metrics: {}
  date: 2022-04-24_09-21-42
  done: false
  episode_len_mean: 160.55
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 160.55
  episode_reward_min: 79.0
  episodes_this_iter: 25
  episodes_total: 1981
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.7105054312137616e-21
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5925544429850834
          entropy_coeff: 0.0
          kl: 0.0005795000784447138
          policy_loss: 0.007499216374270218
          total_loss: 8.453160793294188
          vf_explained_var: -0.5819881795555033
          vf_loss: 8.445661570692575
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 284000
    num_agent_steps_trained: 28

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,71,603.717,284000,160.55,200,79,160.55


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 288000
  custom_metrics: {}
  date: 2022-04-24_09-21-50
  done: false
  episode_len_mean: 157.4
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 157.4
  episode_reward_min: 23.0
  episodes_this_iter: 27
  episodes_total: 2008
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.3552527156068808e-21
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5855716326544361
          entropy_coeff: 0.0
          kl: 0.004586380095688108
          policy_loss: 0.0021965504283466006
          total_loss: 8.631554273123383
          vf_explained_var: -0.4968906270560398
          vf_loss: 8.629357720959572
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 288000
    num_agent_steps_trained: 2880

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,72,612.08,288000,157.4,200,23,157.4


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 292000
  custom_metrics: {}
  date: 2022-04-24_09-21-58
  done: false
  episode_len_mean: 158.92
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 158.92
  episode_reward_min: 23.0
  episodes_this_iter: 22
  episodes_total: 2030
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 6.776263578034404e-22
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5983270578486946
          entropy_coeff: 0.0
          kl: 0.0034699327505637186
          policy_loss: 0.005292080905568856
          total_loss: 7.655625860152706
          vf_explained_var: -0.5539955777506674
          vf_loss: 7.650333742172488
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 292000
    num_agent_steps_trained: 292

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,73,620.446,292000,158.92,200,23,158.92


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 296000
  custom_metrics: {}
  date: 2022-04-24_09-22-07
  done: false
  episode_len_mean: 164.09
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 164.09
  episode_reward_min: 23.0
  episodes_this_iter: 23
  episodes_total: 2053
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.388131789017202e-22
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5853875093562628
          entropy_coeff: 0.0
          kl: 0.0005868259853793125
          policy_loss: 0.00737135277595371
          total_loss: 8.16406074672617
          vf_explained_var: -0.4261636035416716
          vf_loss: 8.156689415952211
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 296000
    num_agent_steps_trained: 29600

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,74,628.862,296000,164.09,200,23,164.09


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 300000
  custom_metrics: {}
  date: 2022-04-24_09-22-15
  done: false
  episode_len_mean: 171.42
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 171.42
  episode_reward_min: 23.0
  episodes_this_iter: 21
  episodes_total: 2074
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.694065894508601e-22
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5943174884524397
          entropy_coeff: 0.0
          kl: 0.0031722666141177002
          policy_loss: 0.002502162105614139
          total_loss: 7.138015138205661
          vf_explained_var: -0.3596859049412512
          vf_loss: 7.135512985208983
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 300000
    num_agent_steps_trained: 300

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,75,637.248,300000,171.42,200,23,171.42


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 304000
  custom_metrics: {}
  date: 2022-04-24_09-22-24
  done: false
  episode_len_mean: 179.05
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 179.05
  episode_reward_min: 61.0
  episodes_this_iter: 21
  episodes_total: 2095
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 8.470329472543005e-23
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5751462172436458
          entropy_coeff: 0.0
          kl: 0.0019131164407978837
          policy_loss: 0.004337801996578452
          total_loss: 7.125698311354524
          vf_explained_var: -0.3222582479317983
          vf_loss: 7.12136052500817
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 304000
    num_agent_steps_trained: 3040

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,76,645.784,304000,179.05,200,61,179.05


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 308000
  custom_metrics: {}
  date: 2022-04-24_09-22-32
  done: false
  episode_len_mean: 185.99
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 185.99
  episode_reward_min: 61.0
  episodes_this_iter: 20
  episodes_total: 2115
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.2351647362715025e-23
          cur_lr: 5.0000000000000016e-05
          entropy: 0.590102929581878
          entropy_coeff: 0.0
          kl: 0.004596610224980103
          policy_loss: 0.001985705181235267
          total_loss: 6.6664554380601455
          vf_explained_var: -0.3216712075535969
          vf_loss: 6.66446976712955
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 308000
    num_agent_steps_trained: 3080

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,77,654.319,308000,185.99,200,61,185.99


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 312000
  custom_metrics: {}
  date: 2022-04-24_09-22-41
  done: false
  episode_len_mean: 188.69
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 188.69
  episode_reward_min: 61.0
  episodes_this_iter: 22
  episodes_total: 2137
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.1175823681357513e-23
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5972803544613623
          entropy_coeff: 0.0
          kl: 0.0034130513060374574
          policy_loss: 0.0040544798858063195
          total_loss: 7.425169577649845
          vf_explained_var: -0.33098021598272426
          vf_loss: 7.421115094102839
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 312000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,78,662.769,312000,188.69,200,61,188.69


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 316000
  custom_metrics: {}
  date: 2022-04-24_09-22-49
  done: false
  episode_len_mean: 192.09
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.09
  episode_reward_min: 86.0
  episodes_this_iter: 20
  episodes_total: 2157
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.0587911840678756e-23
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6037280794112913
          entropy_coeff: 0.0
          kl: 0.0009081130114167203
          policy_loss: 0.007654537885419784
          total_loss: 7.532882459958395
          vf_explained_var: -0.4872387154127962
          vf_loss: 7.525227913036141
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 316000
    num_agent_steps_trained: 31

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,79,671.31,316000,192.09,200,86,192.09


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 320000
  custom_metrics: {}
  date: 2022-04-24_09-22-58
  done: false
  episode_len_mean: 192.51
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 192.51
  episode_reward_min: 68.0
  episodes_this_iter: 22
  episodes_total: 2179
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 5.293955920339378e-24
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6002520016444627
          entropy_coeff: 0.0
          kl: 0.0007947204126714046
          policy_loss: 0.00944104211465005
          total_loss: 8.616502703389814
          vf_explained_var: -0.4549045234598139
          vf_loss: 8.607061653239752
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 320000
    num_agent_steps_trained: 3200

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,80,679.831,320000,192.51,200,68,192.51


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 324000
  custom_metrics: {}
  date: 2022-04-24_09-23-06
  done: false
  episode_len_mean: 189.53
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.53
  episode_reward_min: 68.0
  episodes_this_iter: 22
  episodes_total: 2201
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.646977960169689e-24
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5796742738575064
          entropy_coeff: 0.0
          kl: 0.0015702854452084216
          policy_loss: -0.014219289917939453
          total_loss: 8.203294894515826
          vf_explained_var: -0.48173794778444434
          vf_loss: 8.217514198569841
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 324000
    num_agent_steps_trained: 3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,81,688.201,324000,189.53,200,68,189.53


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 328000
  custom_metrics: {}
  date: 2022-04-24_09-23-15
  done: false
  episode_len_mean: 189.4
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 189.4
  episode_reward_min: 68.0
  episodes_this_iter: 20
  episodes_total: 2221
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.3234889800848445e-24
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5904724501794384
          entropy_coeff: 0.0
          kl: 0.004407113492059118
          policy_loss: 0.004476337483333003
          total_loss: 8.212241318661679
          vf_explained_var: -0.5430393975909038
          vf_loss: 8.207764988048103
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 328000
    num_agent_steps_trained: 32800

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,82,696.7,328000,189.4,200,68,189.4


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 332000
  custom_metrics: {}
  date: 2022-04-24_09-23-24
  done: false
  episode_len_mean: 190.01
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 190.01
  episode_reward_min: 68.0
  episodes_this_iter: 20
  episodes_total: 2241
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 6.617444900424223e-25
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5839983639537647
          entropy_coeff: 0.0
          kl: 0.0013566747391771659
          policy_loss: 0.003682768196668676
          total_loss: 5.873287812356026
          vf_explained_var: -0.17510581183177168
          vf_loss: 5.869605040293868
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 332000
    num_agent_steps_trained: 33

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,83,705.23,332000,190.01,200,68,190.01


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 336000
  custom_metrics: {}
  date: 2022-04-24_09-23-32
  done: false
  episode_len_mean: 190.73
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 190.73
  episode_reward_min: 68.0
  episodes_this_iter: 20
  episodes_total: 2261
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.3087224502121113e-25
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5749405957037402
          entropy_coeff: 0.0
          kl: 0.0006941214280213033
          policy_loss: 0.0014989095429579417
          total_loss: 5.837700118813464
          vf_explained_var: -0.05022854471719393
          vf_loss: 5.836201187872118
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 336000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,84,713.602,336000,190.73,200,68,190.73


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 340000
  custom_metrics: {}
  date: 2022-04-24_09-23-40
  done: false
  episode_len_mean: 193.55
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 193.55
  episode_reward_min: 111.0
  episodes_this_iter: 21
  episodes_total: 2282
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.6543612251060557e-25
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5836896336847736
          entropy_coeff: 0.0
          kl: 0.00019154569277586418
          policy_loss: -0.005443457808465727
          total_loss: 5.535590509958165
          vf_explained_var: -0.11281915326272288
          vf_loss: 5.541033950800537
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 340000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,RUNNING,172.28.0.2:3156,85,721.924,340000,193.55,200,111,193.55


Result for PPO_CartPole-v0_80d83_00000:
  agent_timesteps_total: 344000
  custom_metrics: {}
  date: 2022-04-24_09-23-49
  done: true
  episode_len_mean: 196.11
  episode_media: {}
  episode_reward_max: 200.0
  episode_reward_mean: 196.11
  episode_reward_min: 111.0
  episodes_this_iter: 20
  episodes_total: 2302
  experiment_id: 495e8b46f96c44a5bba45679f3ae141f
  hostname: 4671fbd62325
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 8.271806125530278e-26
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5750028940939134
          entropy_coeff: 0.0
          kl: 0.0005383986887342405
          policy_loss: 0.004769039234166504
          total_loss: 6.518303403803097
          vf_explained_var: -0.1406329942005937
          vf_loss: 6.513534375929063
        model: {}
        num_agent_steps_trained: 128.0
    num_agent_steps_sampled: 344000
    num_agent_steps_trained: 344

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_80d83_00000,TERMINATED,172.28.0.2:3156,86,730.413,344000,196.11,200,111,196.11


2022-04-24 09:23:49,677	INFO tune.py:702 -- Total run time: 750.20 seconds (749.72 seconds for the tuning loop).


In [None]:

parameter_search_config = {
    "env": "CartPole-v0",
    "framework": "torch",

    # Hyperparameter tuning
    "model": {
      "fcnet_hiddens": ray.tune.grid_search([[32], [64]]),
      "fcnet_activation": ray.tune.grid_search(["linear", "relu"]),
      "dueling": ray.tune.grid_search([True, False]),
      "double_q": ray.tune.grid_search([True, False])
    },
    "lr": ray.tune.uniform(1e-7, 1e-2)
}

# To explicitly stop or restart Ray, use the shutdown API.
ray.shutdown()

ray.init(
  num_cpus=12,
  include_dashboard=False,
  ignore_reinit_error=True,
  log_to_driver=False,
)

parameter_search_analysis = ray.tune.run(
  "PPO",
  config=parameter_search_config,
  stop=stop,
  num_samples=5,
  metric="timesteps_total",
  mode="min",
)

print(
  "Best hyperparameters found:",
  parameter_search_analysis.best_config,
)

2022-04-24 09:23:56,238	INFO trial_runner.py:803 -- starting PPO_CartPole-v0_43c3f_00000


# Model building
 

In [None]:
try:
    import pyvirtualdisplay
    display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
except ImportError:
    pass

In [None]:
env.render()

In [None]:
def plot_environment(env, figsize=(5,4)):
    plt.figure(figsize=figsize)
    img = env.render(mode="rgb_array")
    plt.imshow(img)
    plt.axis("off")
    return img

In [None]:
plot_environment(env)
plt.show()

In [None]:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, info = env.step(action)
    replay_memory.append((state, action, reward, next_state, done))
    return next_state, reward, done, info

In [None]:
def render_policy_net(model, n_max_steps=200, seed=42):
    frames = []
    env = gym.make("CartPole-v1")
    env.seed(seed)
    np.random.seed(seed)
    obs = env.reset()
    for step in range(n_max_steps):
        frames.append(env.render(mode="rgb_array"))
        left_proba = model.predict(obs.reshape(1, -1))
        action = int(np.random.rand() > left_proba)
        obs, reward, done, info = env.step(action)
        if done:
            break
    env.close()
    return frames

In [None]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        Q_values = model.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

In [None]:
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

K = keras.backend
input_states = keras.layers.Input(shape=[4])
hidden1 = keras.layers.Dense(32, activation="elu")(input_states)
hidden2 = keras.layers.Dense(32, activation="elu")(hidden1)
state_values = keras.layers.Dense(1)(hidden2)
raw_advantages = keras.layers.Dense(n_outputs)(hidden2)
advantages = raw_advantages - K.max(raw_advantages, axis=1, keepdims=True)
Q_values = state_values + advantages
model = keras.models.Model(inputs=[input_states], outputs=[Q_values])

target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [None]:
batch_size = 32
discount_rate = 0.95
optimizer = keras.optimizers.Adam(learning_rate=7.5e-3)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    next_Q_values = model.predict(next_states)
    best_next_actions = np.argmax(next_Q_values, axis=1)
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)
    target_Q_values = (rewards + 
                       (1 - dones) * discount_rate * next_best_Q_values)
    target_Q_values = target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
replay_memory = deque(maxlen=2000)

In [None]:
env.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

rewards = []
best_score = 0

for episode in range(600):
    obs = env.reset()    
    for step in range(200):
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    rewards.append(step)
    if step >= best_score:
        best_weights = model.get_weights()
        best_score = step
    print("\rEpisode: {}, Steps: {}, eps: {:.3f}".format(episode, step + 1, epsilon), end="")
    if episode >= 50:
        training_step(batch_size)
        if episode % 50 == 0:
            target.set_weights(model.get_weights())

model.set_weights(best_weights)

In [None]:
plt.plot(rewards)
plt.xlabel("Episode")
plt.ylabel("Sum of rewards")
plt.show()